Merge remote-tracking branch 'mothur/master'

author Pat Schloss <pschloss@umich.edu>

Tue, 1 May 2012 15:08:53 +0000 (11:08 -0400)

committer Pat Schloss <pschloss@umich.edu>

Tue, 1 May 2012 15:08:53 +0000 (11:08 -0400)
author Pat Schloss <pschloss@umich.edu>
Tue, 1 May 2012 15:08:53 +0000 (11:08 -0400)
committer Pat Schloss <pschloss@umich.edu>
Tue, 1 May 2012 15:08:53 +0000 (11:08 -0400)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index 0e170bca666d877d35499134d28bc26e595f350c..ff18d5890cef43d65aabd4f444cef07b6bf2b0fe 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -15,6 +15,7 @@
                 A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; };
                 A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; };
                 A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; };
+               A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; };
                 A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; };
                 A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; };
                 A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; };
@@ -63,6 +64,7 @@
                 A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; };
                 A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; };
                 A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; };
+               A7D755DA1535F679009BF21A /* treereader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D755D91535F679009BF21A /* treereader.cpp */; };
                 A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; };
                 A7E9B88212D37EC400DA6239 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65112D37EC300DA6239 /* aligncommand.cpp */; };
                 A7E9B88312D37EC400DA6239 /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65312D37EC300DA6239 /* alignment.cpp */; };
@@ -207,7 +209,6 @@
                 A7E9B91312D37EC400DA6239 /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78312D37EC400DA6239 /* parsimony.cpp */; };
                 A7E9B91412D37EC400DA6239 /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78512D37EC400DA6239 /* parsimonycommand.cpp */; };
                 A7E9B91512D37EC400DA6239 /* pcoacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78712D37EC400DA6239 /* pcoacommand.cpp */; };
-               A7E9B91612D37EC400DA6239 /* phylodiversity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78912D37EC400DA6239 /* phylodiversity.cpp */; };
                 A7E9B91712D37EC400DA6239 /* phylodiversitycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78B12D37EC400DA6239 /* phylodiversitycommand.cpp */; };
                 A7E9B91812D37EC400DA6239 /* phylosummary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */; };
                 A7E9B91912D37EC400DA6239 /* phylotree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78F12D37EC400DA6239 /* phylotree.cpp */; };
@@ -394,6 +395,8 @@
                 A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = "<group>"; };
                 A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = "<group>"; };
                 A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = "<group>"; };
+               A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = "<group>"; };
+               A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = "<group>"; };
                 A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = "<group>"; };
                 A727864312E9E28C00F86ABA /* removerarecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removerarecommand.cpp; sourceTree = "<group>"; };
                 A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clearmemorycommand.h; sourceTree = "<group>"; };
@@ -498,6 +501,8 @@
                 A7C3DC0A14FE457500FE1924 /* cooccurrencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cooccurrencecommand.h; sourceTree = "<group>"; };
                 A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trialSwap2.cpp; sourceTree = "<group>"; };
                 A7C3DC0E14FE469500FE1924 /* trialswap2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trialswap2.h; sourceTree = "<group>"; };
+               A7D755D71535F665009BF21A /* treereader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treereader.h; sourceTree = "<group>"; };
+               A7D755D91535F679009BF21A /* treereader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treereader.cpp; sourceTree = "<group>"; };
                 A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = "<group>"; };
                 A7E9B64F12D37EC300DA6239 /* ace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ace.cpp; sourceTree = "<group>"; };
                 A7E9B65012D37EC300DA6239 /* ace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ace.h; sourceTree = "<group>"; };
@@ -803,8 +808,6 @@
                 A7E9B78612D37EC400DA6239 /* parsimonycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimonycommand.h; sourceTree = "<group>"; };
                 A7E9B78712D37EC400DA6239 /* pcoacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pcoacommand.cpp; sourceTree = "<group>"; };
                 A7E9B78812D37EC400DA6239 /* pcoacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcoacommand.h; sourceTree = "<group>"; };
-               A7E9B78912D37EC400DA6239 /* phylodiversity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylodiversity.cpp; sourceTree = "<group>"; };
-               A7E9B78A12D37EC400DA6239 /* phylodiversity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversity.h; sourceTree = "<group>"; };
                 A7E9B78B12D37EC400DA6239 /* phylodiversitycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylodiversitycommand.cpp; sourceTree = "<group>"; };
                 A7E9B78C12D37EC400DA6239 /* phylodiversitycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversitycommand.h; sourceTree = "<group>"; };
                 A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylosummary.cpp; sourceTree = "<group>"; };
@@ -1417,6 +1420,8 @@
                                 A7E9B73D12D37EC400DA6239 /* listseqscommand.cpp */,
                                 A7FA10001302E096003860FE /* mantelcommand.h */,
                                 A7FA10011302E096003860FE /* mantelcommand.cpp */,
+                               A724D2B4153C8600000A826F /* makebiomcommand.h */,
+                               A724D2B6153C8628000A826F /* makebiomcommand.cpp */,
                                 A799F5B71309A3E000AEEFA0 /* makefastqcommand.h */,
                                 A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */,
                                 A7E9B74412D37EC400DA6239 /* makegroupcommand.h */,
@@ -1826,8 +1831,6 @@
                                 A7E9B68F12D37EC400DA6239 /* classify.h */,
                                 A7E9B73812D37EC400DA6239 /* knn.h */,
                                 A7E9B73712D37EC400DA6239 /* knn.cpp */,
-                               A7E9B78912D37EC400DA6239 /* phylodiversity.cpp */,
-                               A7E9B78A12D37EC400DA6239 /* phylodiversity.h */,
                                 A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */,
                                 A7E9B78E12D37EC400DA6239 /* phylosummary.h */,
                                 A7E9B78F12D37EC400DA6239 /* phylotree.cpp */,
@@ -1879,6 +1882,8 @@
                                 A713EBAB12DC7613000092AC /* readphylipvector.cpp */,
                                 A7E9B84312D37EC400DA6239 /* splitmatrix.cpp */,
                                 A7E9B84412D37EC400DA6239 /* splitmatrix.h */,
+                               A7D755D71535F665009BF21A /* treereader.h */,
+                               A7D755D91535F679009BF21A /* treereader.cpp */,
                         );
                         name = read;
                         sourceTree = "<group>";
@@ -2104,7 +2109,6 @@
                                 A7E9B91312D37EC400DA6239 /* parsimony.cpp in Sources */,
                                 A7E9B91412D37EC400DA6239 /* parsimonycommand.cpp in Sources */,
                                 A7E9B91512D37EC400DA6239 /* pcoacommand.cpp in Sources */,
-                               A7E9B91612D37EC400DA6239 /* phylodiversity.cpp in Sources */,
                                 A7E9B91712D37EC400DA6239 /* phylodiversitycommand.cpp in Sources */,
                                 A7E9B91812D37EC400DA6239 /* phylosummary.cpp in Sources */,
                                 A7E9B91912D37EC400DA6239 /* phylotree.cpp in Sources */,
@@ -2302,6 +2306,8 @@
                                 A76CDD821510F143004C8458 /* prcseqscommand.cpp in Sources */,
                                 A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */,
                                 A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */,
+                               A7D755DA1535F679009BF21A /* treereader.cpp in Sources */,
+                               A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
                 };
@@ -2382,8 +2388,8 @@
                                 GCC_MODEL_TUNING = "";
                                 GCC_OPTIMIZATION_LEVEL = 0;
                                 GCC_PREPROCESSOR_DEFINITIONS = (
-                                       "VERSION=\"\\\"1.24.0\\\"\"",
-                                       "RELEASE_DATE=\"\\\"3/12/2012\\\"\"",
+                                       "VERSION=\"\\\"1.25.0\\\"\"",
+                                       "RELEASE_DATE=\"\\\"4/30/2012\\\"\"",
                                 );
                                 GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
                                 GCC_WARN_ABOUT_RETURN_TYPE = YES;
diff --git a/bayesian.cpp b/bayesian.cpp

index 54a123c5a7835d7c1a2c48d36fe3ec7070462027..1dc38337aef1bcc3b695ff56e86061cdab58c13d 100644 (file)
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -27,7 +27,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                 if (baseTName == "saved") { baseTName = rdb->getSavedTaxonomy(); }
                 
                 /************calculate the probablity that each word will be in a specific taxonomy*************/
-               string tfileroot = baseTName.substr(0,baseTName.find_last_of(".")+1);
+               string tfileroot = m->getFullPathName(baseTName.substr(0,baseTName.find_last_of(".")+1));
                 string tempfileroot = m->getRootName(m->getSimpleName(baseName));
                 string phyloTreeName = tfileroot + "tree.train";
                 string phyloTreeSumName = tfileroot + "tree.sum";
@@ -230,7 +230,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                 delete phyloTree;
                                 
                                 phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
-                               
+                
                                 //save probabilities
                                 if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
                         }
diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp

index f238094c9563587be29064a8f3fdbce54314c0f9..026e91b01f188ce180c3b645de8451138efe0e02 100644 (file)
--- a/chimerauchimecommand.cpp
+++ b/chimerauchimecommand.cpp
@@ -544,7 +544,7 @@ int ChimeraUchimeCommand::execute(){
                                 int totalSeqs = 0;
                                 
                                 if(processors == 1)     {       totalSeqs = driverGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups);     }
-                               else                            {       totalSeqs = createProcessesGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, groups);                      }
+                               else                            {       totalSeqs = createProcessesGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, groups, nameFile, groupFile, fastaFileNames[s]);                      }
  
                                 if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
  
@@ -1473,7 +1473,7 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename
  }
  /**************************************************************************************************/
  
-int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, vector<string> groups) {
+int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, vector<string> groups, string nameFile, string groupFile, string fastaFile) {
         try {
                 
                 processIDS.clear();
@@ -1552,7 +1552,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o
                         // Allocate memory for thread data.
                         string extension = toString(i) + ".temp";
                         
-                       uchimeData* tempUchime = new uchimeData(outputFName+extension, templatefile, filename+extension, fastafile, namefile, groupfile, accnos+extension, alns+extension, groups, m, lines[i].start, lines[i].end,  i);
+                       uchimeData* tempUchime = new uchimeData(outputFName+extension, templatefile, filename+extension, fastaFile, nameFile, groupFile, accnos+extension, alns+extension, groups, m, lines[i].start, lines[i].end,  i);
                         tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract);
                         tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
                         
diff --git a/chimerauchimecommand.h b/chimerauchimecommand.h

index 499b18298dfffc66657f62fd00d48d7c3846bb15..b7da889701f3ac2bc7883cf5c39356550eb2e25d 100644 (file)
--- a/chimerauchimecommand.h
+++ b/chimerauchimecommand.h
@@ -59,7 +59,7 @@ private:
         int printFile(vector<seqPriorityNode>&, string);
         int deconvoluteResults(SequenceParser&, string, string, string);
         int driverGroups(SequenceParser&, string, string, string, string, int, int, vector<string>);
-       int createProcessesGroups(SequenceParser&, string, string, string, string, vector<string>);
+       int createProcessesGroups(SequenceParser&, string, string, string, string, vector<string>, string, string, string);
  
  
  };
diff --git a/classifytreecommand.cpp b/classifytreecommand.cpp

index 9ec4e6f89a40d12a1661144f8835cd6e1a4ccc84..bcf27698ce2bf68bf3dc6a3909228a9d4ccd5cb2 100644 (file)
--- a/classifytreecommand.cpp
+++ b/classifytreecommand.cpp
@@ -8,6 +8,7 @@
  
  #include "classifytreecommand.h"
  #include "phylotree.h"
+#include "treereader.h"
  
  //**********************************************************************************************************************
  vector<string> ClassifyTreeCommand::setParameters(){   
@@ -86,12 +87,6 @@ ClassifyTreeCommand::ClassifyTreeCommand(string option)  {
                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
                         }
                         
-                       m->runParse = true;
-                       m->clearGroups();
-                       m->clearAllGroups();
-                       m->Treenames.clear();
-                       m->names.clear();
-                       
                         vector<string> tempOutNames;
                         outputTypes["tree"] = tempOutNames;
                         outputTypes["summary"] = tempOutNames;
@@ -195,74 +190,19 @@ int ClassifyTreeCommand::execute(){
                 //    reading tree info                                                    //
                 /***************************************************/
          m->setTreeFile(treefile);
-        if (groupfile != "") {
-                       //read in group map info.
-                       tmap = new TreeMap(groupfile);
-                       tmap->readMap();
-               }else{ //fake out by putting everyone in one group
-                       Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
-                       tmap = new TreeMap();
-                       
-                       for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
-               }
-               
-               if (namefile != "") { readNamesFile(); }
-               
-               read = new ReadNewickTree(treefile);
-               int readOk = read->read(tmap); 
-               
-               if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-               
-               read->AssembleTrees();
-               vector<Tree*> T = read->getTrees();
-        Tree* outputTree = T[0]; 
-               delete read;
-               
-               //make sure all files match
-               //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
-               int numNamesInTree;
-               if (namefile != "")  {  
-                       if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
-                       else {   numNamesInTree = m->Treenames.size();  }
-               }else {  numNamesInTree = m->Treenames.size();  }
-               
-               
-               //output any names that are in group file but not in tree
-               if (numNamesInTree < tmap->getNumSeqs()) {
-                       for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
-                               //is that name in the tree?
-                               int count = 0;
-                               for (int j = 0; j < m->Treenames.size(); j++) {
-                                       if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
-                                       count++;
-                               }
-                               
-                               if (m->control_pressed) { 
-                                       delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
-                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                       m->clearGroups();
-                                       return 0;
-                               }
-                               
-                               //then you did not find it so report it 
-                               if (count == m->Treenames.size()) { 
-                                       //if it is in your namefile then don't remove
-                                       map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-                                       
-                                       if (it == nameMap.end()) {
-                                               m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
-                                               tmap->removeSeq(tmap->namesOfSeqs[i]);
-                                               i--; //need this because removeSeq removes name from namesOfSeqs
-                                       }
-                               }
-                       }
-               }
+        
+        TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+        vector<Tree*> T = reader->getTrees();
+        TreeMap* tmap = T[0]->getTreeMap();
+        Tree* outputTree = T[0];
+        delete reader;
+
+        if (namefile != "") { readNamesFile(); }
                          
-        if (m->control_pressed) { delete outputTree; delete tmap;  return 0; }
+        if (m->control_pressed) { delete tmap;  delete outputTree;  return 0; }
                 
          readTaxonomyFile();
          
-        
          /***************************************************/
          //             get concensus taxonomies                    //
          /***************************************************/
@@ -484,6 +424,7 @@ map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int i
                 
                 int lc = T->tree[i].getLChild();
                 int rc = T->tree[i].getRChild();
+        TreeMap* tmap = T->getTreeMap();
                 
                 if (lc == -1) { //you are a leaf your only descendant is yourself
              string group = tmap->getGroup(T->tree[i].getName());
diff --git a/classifytreecommand.h b/classifytreecommand.h

index 026e4bae414d227dab7750d623999b86d4aa4a73..30957af065686f1f8958f7ea84d4be49c187debe 100644 (file)
--- a/classifytreecommand.h
+++ b/classifytreecommand.h
@@ -30,8 +30,6 @@ public:
         void help() { m->mothurOut(getHelpString()); }  
         
  private:
-       ReadTree* read;
-    TreeMap* tmap;
         string treefile, taxonomyfile, groupfile, namefile, outputDir;
         bool abort;
         vector<string> outputNames;
diff --git a/commandfactory.cpp b/commandfactory.cpp

index 0c9504d09a109a39168af9b9d582ef20732e6647..527f7bfe4fddf6ecf1ddce45ab7c93a161f52ad1 100644 (file)
--- a/commandfactory.cpp
+++ b/commandfactory.cpp
@@ -130,6 +130,7 @@
  #include "cooccurrencecommand.h"
  #include "pcrseqscommand.h"
  #include "createdatabasecommand.h"
+#include "makebiomcommand.h"
  
  /*******************************************************/
  
@@ -281,6 +282,7 @@ CommandFactory::CommandFactory(){
      commands["cooccurrence"]        = "cooccurrence";
      commands["pcr.seqs"]            = "pcr.seqs";
      commands["create.database"]     = "create.database";
+    commands["make.biom"]           = "make.biom";
         commands["quit"]                                = "MPIEnabled"; 
  
  }
@@ -304,7 +306,49 @@ CommandFactory::~CommandFactory(){
         delete shellcommand;
         delete pipecommand;
  }
+/***********************************************************/
  
+/***********************************************************/
+int CommandFactory::checkForRedirects(string optionString) {
+    // Applies any outputdir= / inputdir= setting found in the raw option string. A value runs
+    // from just after the first '=' that follows the key up to the next ',' (or end of string).
+    // Empty values are ignored. Returns 0; on an exception it reports the error and exits.
+    try {
+        
+        string::size_type pos = optionString.find("outputdir");
+        if (pos != string::npos) { //user has set outputdir in command option string
+            string outputOption = "";
+            string::size_type equals = optionString.find('=', pos);
+            string::size_type comma = optionString.find(',', pos);
+            //keep only the text after '='; the '=' itself is not part of the directory name
+            //comma may be npos: it then compares greater than equals and substr runs to the end
+            if ((equals != string::npos) && (equals < comma)) { outputOption = optionString.substr(equals+1, comma-equals-1); }
+            if ((outputOption != "") && m->dirCheck(outputOption)) { 
+                setOutputDirectory(outputOption); 
+                m->mothurOut("Setting output directory to: " + outputOption); m->mothurOutEndLine();
+            }
+        }
+        
+        pos = optionString.find("inputdir");
+        if (pos != string::npos) { //user has set inputdir in command option string
+            string inputOption = "";
+            string::size_type equals = optionString.find('=', pos);
+            string::size_type comma = optionString.find(',', pos);
+            //same extraction as for outputdir: value is the text between '=' and the next ','
+            if ((equals != string::npos) && (equals < comma)) { inputOption = optionString.substr(equals+1, comma-equals-1); }
+            if ((inputOption != "") && m->dirCheck(inputOption)) { 
+                setInputDirectory(inputOption); 
+                m->mothurOut("Setting input directory to: " + inputOption); m->mothurOutEndLine();
+            }
+        }
+        
+        return 0;
+    }
+    catch(exception& e) {
+        //report under this function's own name so the error log points at the right place
+        m->errorOut(e, "CommandFactory", "checkForRedirects");
+        exit(1);
+    }
+}
  /***********************************************************/
  
  /***********************************************************/
@@ -313,7 +357,9 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
         try {
          
                 delete command;   //delete the old command
-               
+        
+        checkForRedirects(optionString);
+                       
                 //user has opted to redirect output from dir where input files are located to some other place
                 if (outputDir != "") { 
                         if (optionString != "") { optionString += ", outputdir=" + outputDir; }
@@ -446,6 +492,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
          else if(commandName == "cooccurrence")          {      command = new CooccurrenceCommand(optionString);            }
          else if(commandName == "pcr.seqs")              {      command = new PcrSeqsCommand(optionString);                 }
          else if(commandName == "create.database")       {      command = new CreateDatabaseCommand(optionString);          }
+        else if(commandName == "make.biom")             {      command = new MakeBiomCommand(optionString);                }
                 else                                                                                    {       command = new NoCommand(optionString);                                          }
  
                 return command;
@@ -463,6 +510,8 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str
         try {
                 delete pipecommand;   //delete the old command
                 
+        checkForRedirects(optionString);
+        
                 //user has opted to redirect output from dir where input files are located to some other place
                 if (outputDir != "") { 
                         if (optionString != "") { optionString += ", outputdir=" + outputDir; }
@@ -594,6 +643,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str
          else if(commandName == "cooccurrence")          {      pipecommand = new CooccurrenceCommand(optionString);            }
          else if(commandName == "pcr.seqs")              {      pipecommand = new PcrSeqsCommand(optionString);                 }
          else if(commandName == "create.database")       {      pipecommand = new CreateDatabaseCommand(optionString);          }
+        else if(commandName == "make.biom")             {      pipecommand = new MakeBiomCommand(optionString);                }
                 else                                                                                    {       pipecommand = new NoCommand(optionString);                                              }
  
                 return pipecommand;
@@ -730,6 +780,7 @@ Command* CommandFactory::getCommand(string commandName){
          else if(commandName == "cooccurrence")          {      shellcommand = new CooccurrenceCommand();           }
          else if(commandName == "pcr.seqs")              {      shellcommand = new PcrSeqsCommand();                }
          else if(commandName == "create.database")       {      shellcommand = new CreateDatabaseCommand();         }
+        else if(commandName == "make.biom")             {      shellcommand = new MakeBiomCommand();               }
                 else                                                                                    {       shellcommand = new NoCommand();                                         }
  
                 return shellcommand;
diff --git a/commandfactory.hpp b/commandfactory.hpp

index 3c19e87cdde276b42aeb7488243de568812ffbca..e95db8016a4149c9c2defdbb9e7933ff6ee3905a 100644 (file)
--- a/commandfactory.hpp
+++ b/commandfactory.hpp
@@ -48,6 +48,8 @@ private:
         string outputDir, inputDir, logFileName;\r
         bool append;\r
         \r
+    int checkForRedirects(string);\r
+    \r
         static CommandFactory* _uniqueInstance;\r
         CommandFactory( const CommandFactory& ); // Disable copy constructor\r
         void operator=( const CommandFactory& ); // Disable assignment operator\r
diff --git a/commandoptionparser.cpp b/commandoptionparser.cpp

index e356afe5c74972e5298ddfa87a0672c97c4facfe..dfad533c8898ff3d79067d5f16c6869803e21e12 100644 (file)
--- a/commandoptionparser.cpp
+++ b/commandoptionparser.cpp
@@ -22,16 +22,17 @@ CommandOptionParser::CommandOptionParser(string input){
                 optionString = "";
                 commandString = "";
  
-               if(openParen != -1 && closeParen != -1){                        
-                       commandString = input.substr(0, openParen);   //commandString contains everything before "("
+               if(openParen != -1 && closeParen != -1){        
+            //gobble extra spaces; cast to unsigned char because isspace() is undefined for negative char values
+            int spot = 0;
+            for (int i = 0; i < (int)input.length(); i++) {  if (!(isspace(static_cast<unsigned char>(input[i])))) { spot = i; break; } }
+            if (spot > openParen) { spot = 0; }
+                       commandString = input.substr(spot, openParen-spot);   //commandString contains everything before "("
                         optionString = input.substr((openParen+1), (closeParen-openParen-1)); //optionString contains everything between "(" and ")".
                 }
                 else if (openParen == -1) { m->mothurOut("[ERROR]: You are missing ("); m->mothurOutEndLine(); }
                 else if (closeParen == -1) { m->mothurOut("[ERROR]:You are missing )"); m->mothurOutEndLine(); }
-                                       
-               //GlobalData* globaldata = GlobalData::getInstance();
-               //globaldata->parseGlobalData(commandString, optionString);                     //parser to separate and check options
-       }
+    }
         catch(exception& e) {
                 m->errorOut(e, "CommandOptionParser", "CommandOptionParser");
                 exit(1);
diff --git a/consensus.cpp b/consensus.cpp

index 04671f87913b1fad40f060654942cdaae8d5ff89..1be052f3aee81f6f0aded053de8249dd958118dd 100644 (file)
--- a/consensus.cpp
+++ b/consensus.cpp
@@ -10,7 +10,7 @@
  #include "consensus.h"
  
  //**********************************************************************************************************************
-Tree* Consensus::getTree(vector<Tree*>& t, TreeMap* tmap){
+Tree* Consensus::getTree(vector<Tree*>& t){
         try {
                 numNodes = t[0]->getNumNodes();
                 numLeaves = t[0]->getNumLeaves();
@@ -21,7 +21,7 @@ Tree* Consensus::getTree(vector<Tree*>& t, TreeMap* tmap){
                 
                 if (m->control_pressed) { return 0; }
                 
-               consensusTree = new Tree(tmap);
+               consensusTree = new Tree(t[0]->getTreeMap());
                 
                 it2 = nodePairs.find(treeSet);
                 
@@ -35,11 +35,12 @@ Tree* Consensus::getTree(vector<Tree*>& t, TreeMap* tmap){
                 
                 buildConsensusTree(treeSet);
                 
-               if (m->control_pressed) { delete consensusTree; return 0; }
+               if (m->control_pressed) {  delete consensusTree; return 0; }
                 
-               consensusTree->assembleTree();
+        map<string, string> empty;
+               consensusTree->assembleTree(empty);
                 
-               if (m->control_pressed) { delete consensusTree; return 0; }
+               if (m->control_pressed) {  delete consensusTree; return 0; }
                                 
                 return consensusTree; 
                 
diff --git a/consensus.h b/consensus.h

index 630ee8d527e0abe2e9456f4f663ea041d25aa728..faa4e42990316c6f9a441335c531279a8c031ec9 100644 (file)
--- a/consensus.h
+++ b/consensus.h
@@ -25,7 +25,7 @@ public:
         Consensus() { m = MothurOut::getInstance(); }   
         ~Consensus() {}
         
-    Tree* getTree(vector<Tree*>&, TreeMap*);
+    Tree* getTree(vector<Tree*>&);
                 
  private:
      MothurOut* m;
diff --git a/cooccurrencecommand.cpp b/cooccurrencecommand.cpp

index 6864f799f99a1533766f7350f0a6d8a409fe2876..8c763e275e5d9cebf885794fafdc3c9f44f0cb6a 100644 (file)
--- a/cooccurrencecommand.cpp
+++ b/cooccurrencecommand.cpp
@@ -136,7 +136,7 @@ CooccurrenceCommand::CooccurrenceCommand(string option) {
                                 m->mothurOut("[ERROR]: " + metric + " is not a valid metric option for the cooccurrence command. Choices are cscore, checker, combo, vratio."); m->mothurOutEndLine(); abort = true; 
                         }
                         
-                       matrix = validParameter.validFile(parameters, "matrix", false);                         if (matrix == "not found") { matrix = "sim2"; }
+                       matrix = validParameter.validFile(parameters, "matrixmodel", false);                            if (matrix == "not found") { matrix = "sim2"; }
                         
                         if ((matrix != "sim1") && (matrix != "sim2") && (matrix != "sim3") && (matrix != "sim4") && (matrix != "sim5" ) && (matrix != "sim6" ) && (matrix != "sim7" ) && (matrix != "sim8" ) && (matrix != "sim9" )) {
                                 m->mothurOut("[ERROR]: " + matrix + " is not a valid matrix option for the cooccurrence command. Choices are sim1, sim2, sim3, sim4, sim5, sim6, sim7, sim8, sim9."); m->mothurOutEndLine(); abort = true; 
@@ -270,162 +270,278 @@ int CooccurrenceCommand::execute(){
  //**********************************************************************************************************************
  
  int CooccurrenceCommand::getCooccurrence(vector<SharedRAbundVector*>& thisLookUp, ofstream& out){
-       try {
+    try {
          int numOTUS = thisLookUp[0]->getNumBins();
-        vector< vector<int> > initmatrix; initmatrix.resize(thisLookUp.size());
          vector< vector<int> > co_matrix; co_matrix.resize(thisLookUp[0]->getNumBins());
          for (int i = 0; i < thisLookUp[0]->getNumBins(); i++) { co_matrix[i].resize((thisLookUp.size()), 0); }
-        for (int i = 0; i < thisLookUp.size(); i++) { initmatrix[i].resize((thisLookUp[i]->getNumBins()), 0); }
          vector<int> columntotal; columntotal.resize(thisLookUp.size(), 0);
          vector<int> rowtotal; rowtotal.resize(numOTUS, 0);
          
-        int rowcount = 0;
-        for (int i = 0; i < thisLookUp.size(); i++) {
-                       for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) {
-                               if (m->control_pressed) { return 0; }                   
-                               int abund = thisLookUp[i]->getAbundance(j);
-                               
-                               if(abund > 0) {
-                                   initmatrix[i][j] = 1;
+        for (int i = 0; i < thisLookUp.size(); i++) { //nrows in the shared file
+            for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) { //cols of original shared file
+                if (m->control_pressed) { return 0; }
+                int abund = thisLookUp[i]->getAbundance(j);
+                
+                if(abund > 0) {
                      co_matrix[j][i] = 1;
-                    rowcount++;
-                    columntotal[j]++;
-                               }
-                       }
-            rowtotal[i] = rowcount;
-            rowcount = 0;
+                    rowtotal[j]++;
+                    columntotal[i]++;
+                }
+            }
          }
          
          //nrows is ncols of inital matrix. All the functions need this value. They assume the transposition has already taken place and nrows and ncols refer to that matrix.
          //comatrix and initmatrix are still vectors of vectors of ints as in the original script. The abundancevector is only what was read in ie not a co-occurrence matrix!
-        int ncols = numOTUS;//rows of inital matrix
-        int nrows = thisLookUp.size();//groups
+        int nrows = numOTUS;//rows of inital matrix
+        int ncols = thisLookUp.size();//groups
          double initscore = 0.0;
-        //transpose matrix
-        int newmatrows = ncols;
-        int newmatcols = nrows;
-      
-        //swap for transposed matrix
-        nrows = newmatrows;//ncols;
-        ncols = newmatcols;//nrows;
          
-        vector<int> initcolumntotal; initcolumntotal.resize(ncols, 0);
-        vector<int> initrowtotal; initrowtotal.resize(nrows, 0);
          vector<double> stats;
-               
+        vector<double> probabilityMatrix(ncols * nrows, 0.0); //heap-backed; a variable-length stack array is non-standard C++ and can overflow the stack for large tables
+        vector<vector<int> > nullmatrix(nrows, vector<int>(ncols, 0));
+        
          TrialSwap2 trial;
          
-        initcolumntotal = rowtotal;
-        initrowtotal = columntotal;
-        trial.update_row_col_totals(co_matrix, rowtotal, columntotal);
+        int n = accumulate( columntotal.begin(), columntotal.end(), 0 );
          
-        if (metric == "cscore")         { initscore = trial.calc_c_score(co_matrix, rowtotal);    }
-        else if (metric == "checker")   { initscore = trial.calc_checker(co_matrix, rowtotal);    }
-        else if (metric == "vratio")    { initscore = trial.calc_vratio(rowtotal, columntotal);   }
-        else if (metric == "combo")     { initscore = trial.calc_combo(co_matrix);                }
-        else                            {  m->mothurOut("[ERROR]: No metric selected!\n");  m->control_pressed = true; return 1;            }
+        //============================================================
          
-        m->mothurOut("Initial c score: " + toString(initscore)); m->mothurOutEndLine();
+        //generate a probability matrix. Only do this once.
+        double start = 0.0; //running cumulative probability; double so it matches probabilityMatrix and small increments are not rounded away
          
-        //nullmatrix burn in
-        for(int i=0;i<10000;i++) {
-            if (m->control_pressed) { return 0; }
-            if (matrix == "sim1") {
-                trial.sim1(co_matrix);
-            }else if (matrix == "sim2") {
-                trial.sim2(co_matrix);
-            }else if (matrix == "sim3") {
-                trial.sim3(initmatrix);
-                co_matrix = initmatrix;
-            }else if (matrix == "sim4") {
-                trial.sim4(columntotal, rowtotal, co_matrix);
-            }else if (matrix == "sim5") {
-                trial.sim5(initcolumntotal, initrowtotal, initmatrix);
-                trial.transpose_matrix(initmatrix,co_matrix);
-            }else if (matrix == "sim6") {
-                trial.sim6(columntotal, co_matrix);
-            }else if (matrix == "sim7") {
-                trial.sim7(initcolumntotal, initmatrix);          
-                co_matrix = initmatrix;
-            }else if (matrix == "sim8") {
-                trial.sim8(columntotal, rowtotal, co_matrix);
-            }else if (matrix == "sim9") {
-                trial.swap_checkerboards (co_matrix);
-            }else{
-                m->mothurOut("[ERROR]: No model selected! \n");
-                m->control_pressed = true;
+        if (matrix == "sim1") {
+            for(int i=0;i<nrows;i++) {
+                for(int j=0;j<ncols;j++) {
+                    probabilityMatrix[ncols * i + j] = start + 1/double(nrows*ncols);
+                    start = start + 1/double(nrows*ncols);
+                }
              }
          }
-                
-        //run
-        for(int i=0;i<runs;i++) {
-            if (m->control_pressed) { return 0; }
-            //calc metric of nullmatrix
-            if (matrix == "sim1") {
-                trial.sim1(co_matrix);
-            }else if (matrix == "sim2") {
-                trial.sim2(co_matrix);
-            }else if (matrix == "sim3") {
-                trial.sim3(initmatrix);
-                co_matrix = initmatrix;
-            }else if (matrix == "sim4") {
-                trial.sim4(columntotal, rowtotal, co_matrix);
-            }else if (matrix == "sim5") {
-                trial.sim5(initcolumntotal, initrowtotal, initmatrix);
-                trial.transpose_matrix(initmatrix,co_matrix);
-            }else if (matrix == "sim6") {
-                trial.sim6(columntotal, co_matrix);
-            }else if (matrix == "sim7") {
-                trial.sim7(initcolumntotal, initmatrix);          
-                co_matrix = initmatrix;
-            }else if (matrix == "sim8") {
-                trial.sim8(columntotal, rowtotal, co_matrix);
-            }else if (matrix == "sim9") {
-                trial.swap_checkerboards (co_matrix);
-            }else{
-                 m->mothurOut("[ERROR]: No model selected! \n");
-                 m->control_pressed = true;
+        //don't need a prob matrix because we just shuffle the rows, may use this in the future
+        else if (matrix == "sim2") { }
+//            for(int i=0;i<nrows;i++) {
+//                start = 0.0;
+//                for(int j=0;j<ncols;j++) {
+//                    probabilityMatrix[ncols * i + j] = start + 1/double(ncols);
+//                    start = start + 1/double(ncols);
+//                }
+//            }
+//        }
+        
+        else if (matrix == "sim3") {
+            for(int j=0;j<ncols;j++) {
+                start = 0.0;
+                for(int i=0;i<nrows;i++) {
+                    probabilityMatrix[ncols * i + j] = start + 1/double(nrows);
+                    start = start + 1/double(nrows);
+                }
+            }
+        }
+        
+        else if (matrix == "sim4") {
+            for(int i=0;i<nrows;i++) {
+                start = 0.0;
+                for(int j=0;j<ncols;j++) {
+                    probabilityMatrix[ncols * i + j] = start + columntotal[j]/double(n);
+                    start = start + columntotal[j]/double(n);
+                }
+            }
+        }
+        
+        else if (matrix == "sim5") {
+            for(int j=0;j<ncols;j++) {
+                start = 0.0;
+                for(int i=0;i<nrows;i++) {
+                    probabilityMatrix[ncols * i + j] = start + rowtotal[i]/double(n);
+                    start = start + rowtotal[i]/double(n);
+                }
+            }
+        }
+        
+        else if (matrix == "sim6") {
+            for(int i=0;i<nrows;i++) {
+                for(int j=0;j<ncols;j++) {
+                    probabilityMatrix[ncols * i + j] = start + columntotal[j]/double(n*nrows);
+                    start = start + columntotal[j]/double(n*nrows);
+                }
+            }
+        }
+        
+        
+        else if (matrix == "sim7") {
+            for(int i=0;i<nrows;i++) {
+                for(int j=0;j<ncols;j++) {
+                    probabilityMatrix[ncols * i + j] = start + rowtotal[i]/double(n*ncols);
+                    start = start + rowtotal[i]/double(n*ncols);
+                }
+            }
+        }
+        
+        else if (matrix == "sim8") {
+            for(int i=0;i<nrows;i++) {
+                for(int j=0;j<ncols;j++) {
+                    probabilityMatrix[ncols * i + j] = start + (rowtotal[i]*columntotal[j])/double(n*n);
+                    start = start + (rowtotal[i]*columntotal[j])/double(n*n);
+                }
+            }
+        }
+        else if (matrix == "sim9" || matrix == "sim2") { }
+        else {
+            m->mothurOut("[ERROR]: No model selected! \n");
+            m->control_pressed = true;
+        }
+        
+        
+        //co_matrix is the transposed shared file: one row per OTU, one column per group
+        if (metric == "cscore") { initscore = trial.calc_c_score(co_matrix, rowtotal, ncols, nrows); }
+        else if (metric == "checker") { initscore = trial.calc_checker(co_matrix, rowtotal, ncols, nrows); }
+        else if (metric == "vratio") { initscore = trial.calc_vratio(nrows, ncols, rowtotal, columntotal); }
+        else if (metric == "combo") { initscore = trial.calc_combo(nrows, ncols, co_matrix); }
+        else { m->mothurOut("[ERROR]: No metric selected!\n"); m->control_pressed = true; return 1; }
+        
+        m->mothurOut("Initial c score: " + toString(initscore)); m->mothurOutEndLine();
+        
+        double previous;
+        double current;
+        double randnum;
+        int count;
+
+        //burn-in for sim9    
+        if(matrix == "sim9") {
+            for(int i=0;i<10000;i++) trial.swap_checkerboards (co_matrix, ncols, nrows);
+        }
+
+        //populate null matrix from probability matrix, do this a lot.
+        for(int k=0;k<runs;k++){
+            nullmatrix.clear();
+            //zero-fill the null matrix
+            nullmatrix.assign(nrows, vector<int>(ncols, 0));
+            
+            if(matrix == "sim1" || matrix == "sim6" || matrix == "sim8" || matrix == "sim7") {
+                //one draw per pass of the while loop; scanning stops at the first hit so count never overshoots n
+                count = 0;
+                while(count < n) {
+                    if (m->control_pressed) { return 0; }
+                    previous = 0.0;
+                    randnum = rand() / double(RAND_MAX);
+                    bool placed = false;
+                    for(int i=0;i<nrows && !placed;i++) {
+                        for(int j=0;j<ncols;j++) {
+                            current = probabilityMatrix[ncols * i + j];
+                            if(randnum <= current && randnum > previous) {
+                                nullmatrix[i][j] = 1;
+                                count++;
+                                placed = true;
+                                break;
+                            }
+                            previous = current;
+                        }
+                    }
+                }
+            }
+            
+            else if (matrix == "sim2") {
+                for(int i=0;i<nrows;i++) {
+                    random_shuffle( co_matrix[i].begin(), co_matrix[i].end() ); 
+                }
+                //do this for the scoring since those all have nullmatrix as a parameter
+                //nullmatrix gets cleared at the beginning of each run
+                nullmatrix = co_matrix;
+            }
+            
+            else if(matrix == "sim4") {
+                for(int i=0;i<nrows;i++) {
+                    count = 0;
+                    while(count < rowtotal[i]) {
+                        previous = 0.0;
+                        if (m->control_pressed) { return 0; }
+                        randnum = rand() / double(RAND_MAX);
+                        for(int j=0;j<ncols;j++) {
+                            current = probabilityMatrix[ncols * i + j];
+                            if(randnum <= current && randnum > previous && nullmatrix[i][j] != 1) {
+                                nullmatrix[i][j] = 1;
+                                count++;
+                                previous = 0.0;
+                                break;
+                            }
+                            previous = current;
+                        }
+                    }
+                }
+            }
+            
+            else if(matrix == "sim3" || matrix == "sim5") {
+                //fill column by column; previous is reset before every draw so the cumulative scan always starts at 0
+                for(int j=0;j<ncols;j++) {
+                    count = 0;
+                    while(count < columntotal[j]) {
+                        if (m->control_pressed) { return 0; }
+                        previous = 0.0;
+                        randnum = rand() / double(RAND_MAX);
+                        for(int i=0;i<nrows;i++) {
+                            current = probabilityMatrix[ncols * i + j];
+                            if(randnum <= current && randnum > previous && nullmatrix[i][j] != 1) {
+                                nullmatrix[i][j] = 1;
+                                count++;
+                                break;
+                            }
+                            previous = current;
+                        }
+                    }
+                }
+            }
+            
+            //swap_checkerboards takes the original matrix and swaps checkerboards
+            else if(matrix == "sim9") {
+                trial.swap_checkerboards (co_matrix, ncols, nrows);
+            }
+            else {
+                m->mothurOut("[ERROR]: No null model selected!\n\n"); m->control_pressed = true;
+                return 1;
              }
-            //
-            //            
-            trial.update_row_col_totals(co_matrix, rowtotal, columntotal); 
              
-            if (metric == "cscore") { 
-                stats.push_back(trial.calc_c_score(co_matrix, rowtotal));
-            }else if (metric == "checker") { 
-                stats.push_back(trial.calc_checker(co_matrix, rowtotal));
-            }else if (metric == "vratio") { 
-                stats.push_back(trial.calc_vratio(rowtotal, columntotal));
-            }else if (metric == "combo") { 
-                stats.push_back(trial.calc_combo(co_matrix));
-            }else {
-                m->mothurOut("[ERROR]: No metric selected!\n");
-                m->control_pressed = true;
+            //run metric on null matrix and add score to the stats vector
+            if (metric == "cscore"){
+                stats.push_back(trial.calc_c_score(nullmatrix, rowtotal, ncols, nrows));
+            }
+            else if (metric == "checker") {
+                stats.push_back(trial.calc_checker(nullmatrix, rowtotal, ncols, nrows));
+            }
+            else if (metric == "vratio") {
+                stats.push_back(trial.calc_vratio(nrows, ncols, rowtotal, columntotal));
+            }
+            else if (metric == "combo") {
+                stats.push_back(trial.calc_combo(nrows, ncols, nullmatrix));
+            }
+            else {
+                m->mothurOut("[ERROR]: No metric selected!\n\n"); m->control_pressed = true;
                  return 1;
              }
              
          }
-
+        
+        
+        
          double total = 0.0;
-        for (int i=0; i<stats.size();i++)   {   total+=stats[i];   }
+        for (int i=0; i<stats.size();i++) { total+=stats[i]; }
          
-        double nullMean = double (total/(double)stats.size()); 
+        double nullMean = double (total/(double)stats.size());
          
          m->mothurOutEndLine(); m->mothurOut("average metric score: " + toString(nullMean)); m->mothurOutEndLine();
          
          double pvalue = 0.0;
-        if (metric == "cscore" || metric == "checker") {    pvalue = trial.calc_pvalue_greaterthan (stats, initscore);   }
-        else{   pvalue = trial.calc_pvalue_lessthan (stats, initscore); }
+        if (metric == "cscore" || metric == "checker") { pvalue = trial.calc_pvalue_greaterthan (stats, initscore); }
+        else{ pvalue = trial.calc_pvalue_lessthan (stats, initscore); }
          
          m->mothurOut("pvalue: " + toString(pvalue)); m->mothurOutEndLine();
          out << metric << '\t' << thisLookUp[0]->getLabel() << '\t' << nullMean << '\t' << pvalue << endl;
          
          return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "CooccurrenceCommand", "Cooccurrence");
-               exit(1);
-       }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CooccurrenceCommand", "Cooccurrence");
+        exit(1);
+    }
  }
  //**********************************************************************************************************************
  
diff --git a/deuniquetreecommand.cpp b/deuniquetreecommand.cpp

index 64ea9b700d3e7193fb3502ae2bd5dd160cb26297..c33c8e4ce19014c0059a89469217993a3f2fe59d 100644 (file)
--- a/deuniquetreecommand.cpp
+++ b/deuniquetreecommand.cpp
@@ -8,6 +8,7 @@
   */
  
  #include "deuniquetreecommand.h"
+#include "treereader.h"
  
  //**********************************************************************************************************************
  vector<string> DeuniqueTreeCommand::setParameters(){   
@@ -103,13 +104,7 @@ DeuniqueTreeCommand::DeuniqueTreeCommand(string option)  {
                                 }
                         }
                         
-                       m->runParse = true;
-                       m->clearGroups();
-                       m->clearAllGroups();
-                       m->Treenames.clear();
-                       m->names.clear();
-                       
-                       //check for required parameters
+            //check for required parameters
                         treefile = validParameter.validFile(parameters, "tree", true);
                         if (treefile == "not open") { abort = true; }
                         else if (treefile == "not found") {                             //if there is a current design file, use it
@@ -144,72 +139,21 @@ int DeuniqueTreeCommand::execute() {
                 
                 m->setTreeFile(treefile);
                 
-               //extracts names from tree to make faked out groupmap
-               Tree* tree = new Tree(treefile); delete tree;  
-               tmap = new TreeMap();
-               for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
-               
-               if (m->control_pressed) {  delete tmap;  return 0; }
-               
-               readNamesFile(); 
-               
-               if (m->control_pressed) {  delete tmap;  return 0; }
-               
-               ReadTree* read = new ReadNewickTree(treefile);
-               int readOk = read->read(tmap); 
-               if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-               
-               read->AssembleTrees();
-               vector<Tree*> T = read->getTrees();
-               delete read;
-               
-               //make sure all files match
-               //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
-               int numNamesInTree;
-               if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
-               else {   numNamesInTree = m->Treenames.size();  }
-               
-               //output any names that are in group file but not in tree
-               if (numNamesInTree < tmap->getNumSeqs()) {
-                       for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
-                               //is that name in the tree?
-                               int count = 0;
-                               for (int j = 0; j < m->Treenames.size(); j++) {
-                                       if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
-                                       count++;
-                               }
-                               
-                               if (m->control_pressed) { 
-                                       delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
-                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                       m->clearGroups();
-                                       return 0;
-                               }
-                               
-                               //then you did not find it so report it 
-                               if (count == m->Treenames.size()) { 
-                                       //if it is in your namefile then don't remove
-                                       map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-                                       
-                                       if (it == nameMap.end()) {
-                                               m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
-                                               tmap->removeSeq(tmap->namesOfSeqs[i]);
-                                               i--; //need this because removeSeq removes name from namesOfSeqs
-                                       }
-                               }
-                       }
-               }
-               
+               TreeReader* reader = new TreeReader(treefile, "", namefile);
+        vector<Tree*> T = reader->getTrees();
+        map<string, string> nameMap = reader->getNameMap();
+        delete reader;         
                 
                 //print new Tree
                 string outputFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + "deunique.tre";
                 outputNames.push_back(outputFile); outputTypes["tree"].push_back(outputFile);
                 ofstream out;
                 m->openOutputFile(outputFile, out);
-               T[0]->print(out, "deunique");
+               T[0]->print(out, nameMap);
                 out.close();
                 
-               delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+        delete (T[0]->getTreeMap());
+               for (int i = 0; i < T.size(); i++) { delete T[i]; }
                                 
                 //set phylip file as new current phylipfile
                 string current = "";
@@ -231,46 +175,6 @@ int DeuniqueTreeCommand::execute() {
                 exit(1);
         }
  }
-/*****************************************************************/
-int DeuniqueTreeCommand::readNamesFile() {
-       try {
-               m->names.clear();
-               numUniquesInName = 0;
-               
-               ifstream in;
-               m->openInputFile(namefile, in);
-               
-               string first, second;
-               map<string, string>::iterator itNames;
-               
-               while(!in.eof()) {
-                       in >> first >> second; m->gobble(in);
-                       
-                       numUniquesInName++;
-                       
-                       itNames = m->names.find(first);
-                       if (itNames == m->names.end()) {  
-                               m->names[first] = second; 
-                               
-                               //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
-                               vector<string> dupNames;
-                               m->splitAtComma(second, dupNames);
-                               
-                               for (int i = 0; i < dupNames.size(); i++) {     
-                                       nameMap[dupNames[i]] = dupNames[i]; 
-                                       if (i != 0) { tmap->addSeq(dupNames[i], "Group1"); } 
-                               }
-                       }else {  m->mothurOut(first + " has already been seen in namefile, aborting."); m->mothurOutEndLine(); in.close(); m->names.clear(); m->control_pressed = true; return 1; }                     
-               }
-               in.close();
-               
-               return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "DeuniqueTreeCommand", "readNamesFile");
-               exit(1);
-       }
-}
  /***********************************************************/
  
  
diff --git a/deuniquetreecommand.h b/deuniquetreecommand.h

index 6d72253bad9b1556aed865cae0a1e8b52c91cdea..18a6e5e95491603f051b847467db80512d3ad517 100644 (file)
--- a/deuniquetreecommand.h
+++ b/deuniquetreecommand.h
@@ -12,7 +12,6 @@
  
  
  #include "command.hpp"
-#include "treemap.h"
  #include "sharedutilities.h"
  #include "readtree.h"
  
@@ -35,7 +34,6 @@ public:
         
         
  private:
-       TreeMap* tmap;
         int numUniquesInName;
         
         bool abort;
diff --git a/engine.cpp b/engine.cpp

index ffbe324acd490ec6eb621ce0fe8d26b3b38d3ac0..24adcb53b59f39f049269d74b4da953a5f387262 100644 (file)
--- a/engine.cpp
+++ b/engine.cpp
@@ -183,7 +183,6 @@ bool InteractEngine::getInput(){
                                         mout->clearGroups();
                                         mout->clearAllGroups();
                                         mout->Treenames.clear();
-                                       mout->names.clear();
                                         mout->saveNextLabel = "";
                                         mout->printedHeaders = false;
                                         mout->commandInputsConvertError = false;
@@ -369,7 +368,6 @@ bool BatchEngine::getInput(){
                                         mout->clearGroups();
                                         mout->clearAllGroups();
                                         mout->Treenames.clear();
-                                       mout->names.clear();
                                         mout->saveNextLabel = "";
                                         mout->printedHeaders = false;
                                         mout->commandInputsConvertError = false;
@@ -538,7 +536,6 @@ bool ScriptEngine::getInput(){
                                         mout->clearGroups();
                                         mout->clearAllGroups();
                                         mout->Treenames.clear();
-                                       mout->names.clear();
                                         mout->saveNextLabel = "";
                                         mout->printedHeaders = false;
                                         mout->commandInputsConvertError = false;
diff --git a/getcurrentcommand.cpp b/getcurrentcommand.cpp

index 12dcc82ff6a1bfb1a105d3d6deac903e855cea9b..ca832314d66dbe371e314db292ab6f441d2ee351 100644 (file)
--- a/getcurrentcommand.cpp
+++ b/getcurrentcommand.cpp
@@ -138,6 +138,8 @@ int GetCurrentCommand::execute(){
                                         m->setTaxonomyFile("");
                                 }else if (types[i] == "flow") {
                                         m->setFlowFile("");
+                }else if (types[i] == "biom") {
+                                       m->setBiomFile("");
                                 }else if (types[i] == "processors") {
                                         m->setProcessors("1");
                                 }else if (types[i] == "all") {
diff --git a/indicatorcommand.cpp b/indicatorcommand.cpp

index 01d8c2e8634d65c31f11515f89ecfa950b81a1ae..97f480e749b7042a07a39d0f48890ff137d49dd0 100644 (file)
--- a/indicatorcommand.cpp
+++ b/indicatorcommand.cpp
@@ -99,7 +99,6 @@ IndicatorCommand::IndicatorCommand(string option)  {
                         m->clearGroups();
                         m->clearAllGroups();
                         m->Treenames.clear();
-                       m->names.clear();
                         
                         vector<string> tempOutNames;
                         outputTypes["tree"] = tempOutNames;
@@ -236,11 +235,10 @@ int IndicatorCommand::execute(){
                         designMap->readDesignMap();
                         
                         //fill Groups - checks for "all" and for any typo groups
-                       SharedUtil* util = new SharedUtil();
+                       SharedUtil util;
                         vector<string> nameGroups = designMap->getNamesOfGroups();
-                       util->setGroups(Groups, nameGroups);
+                       util.setGroups(Groups, nameGroups);
                         designMap->setNamesOfGroups(nameGroups);
-                       delete util;
                         
                         //loop through the Groups and fill Globaldata's Groups with the design file info
                         vector<string> namesSeqs = designMap->getNamesSeqs(Groups);
@@ -320,8 +318,9 @@ int IndicatorCommand::execute(){
                                 else { for (int i = 0; i < lookupFloat.size(); i++) {  delete lookupFloat[i];  } }
                                 for (int i = 0; i < T.size(); i++) {  delete T[i];  }  delete treeMap; return 0; 
                         }
-                               
-                       T[0]->assembleTree();
+            
+                       map<string, string> nameMap;    
+                       T[0]->assembleTree(nameMap);
                                         
                         /***************************************************/
                         //    create ouptut tree - respecting pickedGroups //
@@ -329,12 +328,11 @@ int IndicatorCommand::execute(){
                         Tree* outputTree = new Tree(m->getNumGroups(), treeMap); 
                         
                         outputTree->getSubTree(T[0], m->getGroups());
-                       outputTree->assembleTree();
+                       outputTree->assembleTree(nameMap);
                                 
                         //no longer need original tree, we have output tree to use and label
                         for (int i = 0; i < T.size(); i++) {  delete T[i];  } 
                         
-                                       
                         if (m->control_pressed) { 
                                 if (designfile != "") { delete designMap; }
                                 if (sharedfile != "") {  for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } }
@@ -470,17 +468,17 @@ int IndicatorCommand::GetIndicatorSpecies(){
                                 
                         if (m->control_pressed) { out.close(); return 0; }
                         
-                       out << (j+1) << '\t' << indicatorValues[j] << '\t'; 
+                       out << m->currentBinLabels[j] << '\t' << indicatorValues[j] << '\t'; 
                         
                         if (pValues[j] > (1/(float)iters)) { out << pValues[j] << endl; } 
                         else { out << "<" << (1/(float)iters) << endl; }
                         
                         if (pValues[j] <= 0.05) {
-                               cout << "OTU" << j+1 << '\t' << indicatorValues[j]  << '\t';
+                               cout << m->currentBinLabels[j] << '\t' << indicatorValues[j]  << '\t';
                                 string pValueString = "<" + toString((1/(float)iters)); 
                                 if (pValues[j] > (1/(float)iters)) { pValueString = toString(pValues[j]); cout << pValues[j];} 
                                 else { cout << "<" << (1/(float)iters); }
-                               m->mothurOutJustToLog("OTU" + toString(j+1) + "\t" + toString(indicatorValues[j]) + "\t" + pValueString); 
+                               m->mothurOutJustToLog(m->currentBinLabels[j] + "\t" + toString(indicatorValues[j]) + "\t" + pValueString); 
                                 m->mothurOutEndLine(); 
                         }
                 }
@@ -516,7 +514,7 @@ int IndicatorCommand::GetIndicatorSpecies(Tree*& T){
                 
                 //print headings
                 out << "TreeNode\t";
-               for (int i = 0; i < numBins; i++) { out << "OTU" << (i+1) << "_IndValue" << '\t' << "pValue" << '\t'; }
+               for (int i = 0; i < numBins; i++) { out << m->currentBinLabels[i] << "_IndValue" << '\t' << "pValue" << '\t'; }
                 out << endl;
                 
                 m->mothurOutEndLine(); m->mothurOut("Node\tSpecies\tIndicatorValue\tpValue\n");
@@ -673,11 +671,11 @@ int IndicatorCommand::GetIndicatorSpecies(Tree*& T){
                                 }
                                 
                                 if (pValues[j] <= 0.05) {
-                                       cout << i+1 << "\tOTU" << j+1 << '\t' << indicatorValues[j]  << '\t';
+                                       cout << i+1 << '\t' << m->currentBinLabels[j] << '\t' << indicatorValues[j]  << '\t';
                                         string pValueString = "<" + toString((1/(float)iters)); 
                                         if (pValues[j] > (1/(float)iters)) { pValueString = toString(pValues[j]); cout << pValues[j];} 
                                         else { cout << "<" << (1/(float)iters); }
-                                       m->mothurOutJustToLog(toString(i) + "\tOTU" + toString(j+1) + "\t" + toString(indicatorValues[j]) + "\t" + pValueString); 
+                                       m->mothurOutJustToLog(toString(i) + "\t" + m->currentBinLabels[j] + "\t" + toString(indicatorValues[j]) + "\t" + pValueString); 
                                         m->mothurOutEndLine(); 
                                 }
                         }
diff --git a/makebiomcommand.cpp b/makebiomcommand.cpp

new file mode 100644 (file)

index 0000000..acd0500
--- /dev/null
+++ b/makebiomcommand.cpp
@@ -0,0 +1,643 @@
+//
+//  makebiomcommand.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 4/16/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "makebiomcommand.h"
+#include "sharedrabundvector.h"
+#include "inputdata.h"
+
+//taken from http://biom-format.org/documentation/biom_format.html
+/* Minimal Sparse 
+ {
+ "id":null,
+ "format": "Biological Observation Matrix 0.9.1",
+ "format_url": "http://biom-format.org",
+ "type": "OTU table",
+ "generated_by": "QIIME revision 1.4.0-dev",
+ "date": "2011-12-19T19:00:00",
+ "rows":[
+ {"id":"GG_OTU_1", "metadata":null},
+ {"id":"GG_OTU_2", "metadata":null},
+ {"id":"GG_OTU_3", "metadata":null},
+ {"id":"GG_OTU_4", "metadata":null},
+ {"id":"GG_OTU_5", "metadata":null}
+ ],
+ "columns": [
+ {"id":"Sample1", "metadata":null},
+ {"id":"Sample2", "metadata":null},
+ {"id":"Sample3", "metadata":null},
+ {"id":"Sample4", "metadata":null},
+ {"id":"Sample5", "metadata":null},
+ {"id":"Sample6", "metadata":null}
+ ],
+ "matrix_type": "sparse",
+ "matrix_element_type": "int",
+ "shape": [5, 6],
+ "data":[[0,2,1],
+ [1,0,5],
+ [1,1,1],
+ [1,3,2],
+ [1,4,3],
+ [1,5,1],
+ [2,2,1],
+ [2,3,4],
+ [2,4,2],
+ [3,0,2],
+ [3,1,1],
+ [3,2,1],
+ [3,5,1],
+ [4,1,1],
+ [4,2,1]
+ ]
+ }
+ */
+/* Minimal dense
+ {
+ "id":null,
+ "format": "Biological Observation Matrix 0.9.1",
+ "format_url": "http://biom-format.org",
+ "type": "OTU table",
+ "generated_by": "QIIME revision 1.4.0-dev",
+ "date": "2011-12-19T19:00:00",
+ "rows":[
+ {"id":"GG_OTU_1", "metadata":null},
+ {"id":"GG_OTU_2", "metadata":null},
+ {"id":"GG_OTU_3", "metadata":null},
+ {"id":"GG_OTU_4", "metadata":null},
+ {"id":"GG_OTU_5", "metadata":null}
+ ],
+ "columns": [
+ {"id":"Sample1", "metadata":null},
+ {"id":"Sample2", "metadata":null},
+ {"id":"Sample3", "metadata":null},
+ {"id":"Sample4", "metadata":null},
+ {"id":"Sample5", "metadata":null},
+ {"id":"Sample6", "metadata":null}
+ ],
+ "matrix_type": "dense",
+ "matrix_element_type": "int",
+ "shape": [5,6],
+ "data":  [[0,0,1,0,0,0],
+ [5,1,0,2,3,1],
+ [0,0,1,4,2,0],
+ [2,1,1,0,0,1],
+ [0,1,1,0,0,0]]
+ }
+ */
+//**********************************************************************************************************************
+vector<string> MakeBiomCommand::setParameters(){       
+       try {
+               CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
+        CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcontaxonomy);
+               CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
+               CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+        CommandParameter pmatrixtype("matrixtype", "Multiple", "sparse-dense", "sparse", "", "", "",false,false); parameters.push_back(pmatrixtype);
+
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeBiomCommand", "setParameters");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string MakeBiomCommand::getHelpString(){       
+       try {
+               string helpString = "";
+               helpString += "The make.biom command parameters are shared, contaxonomy, groups, matrixtype and label.  shared is required, unless you have a valid current file.\n";
+               helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
+               helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
+               helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n";
+        helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance.  ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled.\n";
+               helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n";
+               helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n";
+               helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
+               helpString += "The make.biom command outputs a .biom file.\n";
+               helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
+               return helpString;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeBiomCommand", "getHelpString");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+MakeBiomCommand::MakeBiomCommand(){    
+       try {
+               abort = true; calledHelp = true; 
+               setParameters();
+               vector<string> tempOutNames;
+               outputTypes["biom"] = tempOutNames;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+// Constructs the command from the user's option string.
+// "help" and "citation" print the respective text and mark the command as
+// aborted. Any other string is parsed into name/value pairs, each name is
+// validated against setParameters(), and the members sharedfile, outputDir,
+// contaxonomyfile, label/labels/allLines, groups/Groups and format are filled.
+// `abort` is set to true whenever a parameter is invalid or a required file
+// cannot be resolved; execute() checks it before doing any work.
+MakeBiomCommand::MakeBiomCommand(string option) {
+       try {
+               abort = false; calledHelp = false;   
+               allLines = 1;
+        
+               //allow user to run help
+               if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+               
+               else {
+                       vector<string> myArray = setParameters();
+                       
+                       OptionParser parser(option);
+                       //NOTE: this local map shadows the `parameters` container that setParameters() fills
+                       map<string,string> parameters = parser.getParameters();
+                       map<string,string>::iterator it;
+                       
+                       ValidParameters validParameter;
+                       
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //initialize outputTypes
+                       vector<string> tempOutNames;
+                       outputTypes["biom"] = tempOutNames;
+                       
+                       //if the user supplied an input directory, prepend it to any input file given without a path
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                               it = parameters.find("shared");
+                               //user has given a shared file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["shared"] = inputDir + it->second;           }
+                               }
+                
+                it = parameters.find("contaxonomy");
+                               //user has given a contaxonomy file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["contaxonomy"] = inputDir + it->second;              }
+                               }
+                       }
+            
+                       //get shared file
+                       sharedfile = validParameter.validFile(parameters, "shared", true);
+                       if (sharedfile == "not open") { sharedfile = ""; abort = true; }        
+                       else if (sharedfile == "not found") { 
+                               //if there is a current shared file, use it
+                               sharedfile = m->getSharedFile(); 
+                               if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }else { m->setSharedFile(sharedfile); }
+                       
+                       
+                       //output directory: use the one given, otherwise fall back to the path of the shared file
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(sharedfile);             }
+            
+            //contaxonomy is optional: "not found" means not supplied, "not open" aborts
+            contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true);
+                       if (contaxonomyfile == "not found") {  contaxonomyfile = "";  }
+                       else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
+
+            
+                       //check for optional parameter and set defaults
+                       // ...at some point we should add some additional type checking...
+                       label = validParameter.validFile(parameters, "label", false);                   
+                       if (label == "not found") { label = ""; }
+                       else { 
+                               //a dash separated list selects specific labels; "all" keeps every label
+                               if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
+                               else { allLines = 1;  }
+                       }
+                       
+                       groups = validParameter.validFile(parameters, "groups", false);                 
+                       if (groups == "not found") { groups = ""; }
+                       else { 
+                               m->splitAtDash(groups, Groups);
+                               m->setGroups(Groups);
+                       }
+                       
+            //a single contaxonomy file cannot be paired with more than one label
+            if ((contaxonomyfile != "") && (labels.size() > 1)) { m->mothurOut("[ERROR]: the contaxonomy parameter cannot be used with multiple labels."); m->mothurOutEndLine(); abort = true; }
+            
+                       //matrixtype defaults to sparse; anything other than sparse or dense aborts
+                       format = validParameter.validFile(parameters, "matrixtype", false);                             if (format == "not found") { format = "sparse"; }
+                       
+                       if ((format != "sparse") && (format != "dense")) {
+                               m->mothurOut(format + " is not a valid option for the matrixtype parameter. Options are sparse and dense."); m->mothurOutEndLine(); abort = true; 
+                       }
+               }
+        
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+int MakeBiomCommand::execute(){
+       try {
+        
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
+            
+               InputData input(sharedfile, "sharedfile");
+               vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+               string lastLabel = lookup[0]->getLabel();
+        
+        //if user did not specify a label, then use first one
+        if ((contaxonomyfile != "") && (labels.size() == 0)) {
+            allLines = 0;
+            labels.insert(lastLabel);
+        }
+               
+               //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+               set<string> processedLabels;
+               set<string> userLabels = labels;
+        
+               //as long as you are not at the end of the file or done wih the lines you want
+               while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+                       
+                       if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  return 0; }
+            
+                       if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){                  
+                
+                               m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                               getBiom(lookup);
+                               
+                               processedLabels.insert(lookup[0]->getLabel());
+                               userLabels.erase(lookup[0]->getLabel());
+                       }
+                       
+                       if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                               string saveLabel = lookup[0]->getLabel();
+                
+                               for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  
+                               lookup = input.getSharedRAbundVectors(lastLabel);
+                               m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                               
+                               getBiom(lookup);
+                               
+                               processedLabels.insert(lookup[0]->getLabel());
+                               userLabels.erase(lookup[0]->getLabel());
+                               
+                               //restore real lastlabel to save below
+                               lookup[0]->setLabel(saveLabel);
+                       }
+                       
+                       lastLabel = lookup[0]->getLabel();
+            
+                       //prevent memory leak and get next set
+                       for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
+                       lookup = input.getSharedRAbundVectors();                                
+               }
+               
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); }  return 0; }     
+        
+               //output error messages about any remaining user labels
+               set<string>::iterator it;
+               bool needToRun = false;
+               for (it = userLabels.begin(); it != userLabels.end(); it++) {  
+                       m->mothurOut("Your file does not include the label " + *it); 
+                       if (processedLabels.count(lastLabel) != 1) {
+                               m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+                               needToRun = true;
+                       }else {
+                               m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+                       }
+               }
+        
+               //run last label if you need to
+               if (needToRun == true)  {
+                       for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }  
+                       lookup = input.getSharedRAbundVectors(lastLabel);
+                       
+                       m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+            getBiom(lookup);
+                       
+                       for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+               }
+               
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); }  return 0; }     
+               
+        //set sabund file as new current sabundfile
+        string current = "";
+               itTypes = outputTypes.find("biom");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setBiomFile(current); }
+               }
+
+        
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeBiomCommand", "execute");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
+       try {
+        
+        string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + lookup[0]->getLabel() + ".biom";
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputNames.push_back(outputFileName); outputTypes["biom"].push_back(outputFileName);
+
+        string mothurString = "mothur" + toString(m->getVersion());
+        time_t rawtime;
+        struct tm * timeinfo;
+        time ( &rawtime );
+        timeinfo = localtime ( &rawtime );
+        string dateString = asctime (timeinfo);
+        int pos = dateString.find('\n');
+        if (pos != string::npos) { dateString = dateString.substr(0, pos);}
+        string spaces = "      ";
+        
+        //standard 
+        out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
+        out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
+        
+        vector<string> metadata = getMetaData(lookup);  
+        
+        if (m->control_pressed) {  out.close(); return 0; }
+        
+        //get row info
+        /*"rows":[
+                {"id":"GG_OTU_1", "metadata":null},
+                {"id":"GG_OTU_2", "metadata":null},
+                {"id":"GG_OTU_3", "metadata":null},
+                {"id":"GG_OTU_4", "metadata":null},
+                {"id":"GG_OTU_5", "metadata":null}
+                ],*/
+        out << spaces + "\"rows\":[\n";
+        string rowFront = spaces + spaces + "{\"id\":\"";
+        string rowBack = "\", \"metadata\":";
+        for (int i = 0; i < m->currentBinLabels.size()-1; i++) {
+            if (m->control_pressed) {  out.close(); return 0; }
+            out << rowFront << m->currentBinLabels[i] << rowBack << metadata[i] << "},\n";
+        }
+        out << rowFront << m->currentBinLabels[(m->currentBinLabels.size()-1)] << rowBack << metadata[(m->currentBinLabels.size()-1)] << "}\n" + spaces + "],\n";
+        
+        //get column info
+        /*"columns": [
+                    {"id":"Sample1", "metadata":null},
+                    {"id":"Sample2", "metadata":null},
+                    {"id":"Sample3", "metadata":null},
+                    {"id":"Sample4", "metadata":null},
+                    {"id":"Sample5", "metadata":null},
+                    {"id":"Sample6", "metadata":null}
+                    ],*/
+        
+        string colBack = "\", \"metadata\":null}";
+        out << spaces + "\"columns\":[\n";
+        for (int i = 0; i < lookup.size()-1; i++) {
+            if (m->control_pressed) {  out.close(); return 0; }
+            out << rowFront << lookup[i]->getGroup() << colBack << ",\n";
+        }
+        out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << "\n" + spaces + "],\n";
+        
+        out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n";
+        out <<  spaces + "\"shape\": [" << m->currentBinLabels.size() << "," << lookup.size() << "],\n";
+        out << spaces + "\"data\":  [";
+        
+        vector<string> dataRows;
+        if (format == "sparse") {
+            /*"data":[[0,2,1],
+             [1,0,5],
+             [1,1,1],
+             [1,3,2],
+             [1,4,3],
+             [1,5,1],
+             [2,2,1],
+             [2,3,4],
+             [2,4,2],
+             [3,0,2],
+             [3,1,1],
+             [3,2,1],
+             [3,5,1],
+             [4,1,1],
+             [4,2,1]
+             ]*/
+            string output = "";
+            for (int i = 0; i < lookup[0]->getNumBins(); i++) {
+                
+                if (m->control_pressed) { out.close(); return 0; }
+                
+                for (int j = 0; j < lookup.size(); j++) {
+                    string binInfo = "[" + toString(i) + "," + toString(j) + "," + toString(lookup[j]->getAbundance(i)) + "]";
+                    //only print non zero values
+                    if (lookup[j]->getAbundance(i) != 0) { dataRows.push_back(binInfo); }
+                }
+            }
+        }else {
+            
+            /* "matrix_type": "dense",
+             "matrix_element_type": "int",
+             "shape": [5,6],
+             "data":  [[0,0,1,0,0,0],
+             [5,1,0,2,3,1],
+             [0,0,1,4,2,0],
+             [2,1,1,0,0,1],
+             [0,1,1,0,0,0]]*/
+            
+            for (int i = 0; i < lookup[0]->getNumBins(); i++) {
+                
+                if (m->control_pressed) { out.close(); return 0; }
+                
+                string binInfo = "[";
+                for (int j = 0; j < lookup.size()-1; j++) {
+                    binInfo += toString(lookup[j]->getAbundance(i)) + ",";
+                }
+                binInfo += toString(lookup[lookup.size()-1]->getAbundance(i)) + "]";
+                dataRows.push_back(binInfo);
+            }
+        }
+        
+        for (int i = 0; i < dataRows.size()-1; i++) {
+            out << dataRows[i] << ",\n" + spaces  + spaces;
+        }
+        out << dataRows[dataRows.size()-1] << "]\n";
+        
+        out << "}\n";
+        out.close();
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "MakeBiomCommand", "getBiom");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup){
+       try {
+        vector<string> metadata;
+        
+        if (contaxonomyfile == "") { for (int i = 0; i < lookup[0]->getNumBins(); i++) {  metadata.push_back("null");  } }
+        else {
+            
+            //read constaxonomy file storing in a map, otulabel -> taxonomy
+            //constaxonomy file will most likely contain more labels than the shared file, because sharedfile could have been subsampled.
+            ifstream in;
+            m->openInputFile(contaxonomyfile, in);
+            
+            //grab headers
+            m->getline(in); m->gobble(in);
+            
+            string otuLabel, tax;
+            int size;
+            vector<string> otuLabels;
+            vector<string> taxs;
+            while (!in.eof()) {
+                
+                if (m->control_pressed) { in.close(); return metadata; }
+                
+                in >> otuLabel >> size >> tax; m->gobble(in);
+                
+                otuLabels.push_back(otuLabel);
+                taxs.push_back(tax);
+            }
+            in.close();
+            
+            //should the labels be Otu001 or PhyloType001
+            string firstBin = m->currentBinLabels[0];
+            string binTag = "Otu";
+            if ((firstBin.find("Otu")) == string::npos) { binTag = "PhyloType";  }
+            
+            //convert list file bin labels to shared file bin labels
+            //parse tax strings
+            //save in map
+            map<string, string> labelTaxMap;
+            string snumBins = toString(otuLabels.size());
+            for (int i = 0; i < otuLabels.size(); i++) {  
+                
+                if (m->control_pressed) { return metadata; }
+                
+                //if there is a bin label use it otherwise make one
+                string binLabel = binTag;
+                string sbinNumber = otuLabels[i];
+                if (sbinNumber.length() < snumBins.length()) { 
+                    int diff = snumBins.length() - sbinNumber.length();
+                    for (int h = 0; h < diff; h++) { binLabel += "0"; }
+                }
+                binLabel += sbinNumber;
+                
+                labelTaxMap[binLabel] = taxs[i];
+            }
+            
+            
+            //{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}
+            
+            //traverse the binLabels forming the metadata strings and saving them
+            //make sure to sanity check
+            map<string, string>::iterator it;
+            for (int i = 0; i < m->currentBinLabels.size(); i++) {
+                
+                if (m->control_pressed) { return metadata; }
+                
+                it = labelTaxMap.find(m->currentBinLabels[i]);
+                
+                if (it == labelTaxMap.end()) { m->mothurOut("[ERROR]: can't find taxonomy information for " + m->currentBinLabels[i] + ".\n"); m->control_pressed = true; }
+                else {
+                    vector<string> bootstrapValues;
+                    string data = "{\"taxonomy\":[";
+            
+                    vector<string> scores;
+                    vector<string> taxonomies = parseTax(it->second, scores);
+                    
+                    for (int j = 0; j < taxonomies.size()-1; j ++) { data += "\"" + taxonomies[j] + "\", "; }
+                    data += "\"" + taxonomies[taxonomies.size()-1] + "\"]";
+                    
+                    //add bootstrap values if available
+                    if (scores[0] != "null") {
+                        data += ", \"bootstrap\":[";
+                        
+                        for (int j = 0; j < scores.size()-1; j ++) { data += scores[j] + ", "; }
+                        data += scores[scores.size()-1] + "]";
+
+                    }
+                    data += "}";
+                    
+                    metadata.push_back(data);
+                }
+            }
+        }
+        
+        return metadata;
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "MakeBiomCommand", "getMetadata");
+               exit(1);
+       }
+
+}
+/**************************************************************************************************/
+//returns {Bacteria, Bacteroidetes, ..} and scores is filled with {100, 98, ...} or {null, null, null}
+vector<string> MakeBiomCommand::parseTax(string tax, vector<string>& scores) {
+       try {
+               
+               string taxon;
+        vector<string> taxs;
+               
+               while (tax.find_first_of(';') != -1) {
+                       
+                       if (m->control_pressed) { return taxs; }
+                       
+                       //get taxon
+                       taxon = tax.substr(0,tax.find_first_of(';'));
+            
+                       int pos = taxon.find_last_of('(');
+                       if (pos != -1) {
+                               //is it a number?
+                               int pos2 = taxon.find_last_of(')');
+                               if (pos2 != -1) {
+                                       string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1)));
+                                       if (m->isNumeric1(confidenceScore)) {
+                                               taxon = taxon.substr(0, pos); //rip off confidence 
+                        scores.push_back(confidenceScore);
+                                       }else{ scores.push_back("null"); }
+                               }
+                       }
+                       
+            //strip "" if they are there
+            pos = taxon.find("\"");
+            if (pos != string::npos) {
+                string newTax = "";
+                for (int k = 0; k < taxon.length(); k++) {
+                    if (taxon[k] != '\"') { newTax += taxon[k]; }
+                }
+                taxon = newTax;
+            }
+            
+            //look for bootstrap value
+                       taxs.push_back(taxon);
+            tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+               }
+               
+               return taxs;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeBiomCommand", "parseTax");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+
+
+
diff --git a/makebiomcommand.h b/makebiomcommand.h

new file mode 100644 (file)

index 0000000..9f80c2e
--- /dev/null
+++ b/makebiomcommand.h
@@ -0,0 +1,49 @@
+#ifndef Mothur_makebiomcommand_h
+#define Mothur_makebiomcommand_h
+
+//
+//  makebiomcommand.h
+//  Mothur
+//
+//  Created by Sarah Westcott on 4/16/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+
+#include "command.hpp"
+#include "sharedrabundvector.h"
+#include "inputdata.h"
+
+
+class MakeBiomCommand : public Command {
+       //implements the "make.biom" command; getDescription() summarises it as "creates a biom file"
+public:
+       MakeBiomCommand(string);   //NOTE(review): presumably receives the command's option string -- confirm against the .cpp
+       MakeBiomCommand();      
+       ~MakeBiomCommand(){}
+       
+       vector<string> setParameters();
+       string getCommandName()                 { return "make.biom";   }
+       string getCommandCategory()             { return "General";             }
+       string getHelpString(); 
+       string getCitation() { return "http://biom-format.org/documentation/biom_format.html, http://www.mothur.org/wiki/Make.biom"; }
+       string getDescription()         { return "creates a biom file"; }
+    
+       int execute(); 
+       void help() { m->mothurOut(getHelpString()); }  //prints the help string through mothurOut
+       
+private:
+       //contaxonomyfile is the constaxonomy file read by getMetaData; when it is blank every bin's metadata is "null"
+       string sharedfile, contaxonomyfile, groups, outputDir, format, label;
+       vector<string> outputNames, Groups;
+       set<string> labels;
+    
+       bool abort, allLines;
+       //helpers; each takes the vector of shared abundance vectors being processed
+    int getBiom(vector<SharedRAbundVector*>&);                    //writes the data rows (sparse triples or dense rows) and returns 0
+    vector<string> getMetaData(vector<SharedRAbundVector*>&);     //one metadata string per bin: "null" or a {"taxonomy":[...]} object
+    vector<string> parseTax(string tax, vector<string>& scores);  //splits a ';'-delimited taxonomy string into taxa; confidences go into scores
+};
+
+
+#endif
diff --git a/makefile b/makefile

index d8e6dcd603913805b0355d3f69173f25cf7803b2..b8d4e2c3e30844c436e44015696ec04d5c4264d8 100644 (file)
--- a/makefile
+++ b/makefile
@@ -15,8 +15,8 @@ USEREADLINE ?= yes
  CYGWIN_BUILD ?= no
  USECOMPRESSION ?= no
  MOTHUR_FILES="\"Enter_your_default_path_here\""
-RELEASE_DATE = "\"3/16/2012\""
-VERSION = "\"1.24.1\""
+RELEASE_DATE = "\"4/30/2012\""
+VERSION = "\"1.25.0\""
  FORTAN_COMPILER = gfortran
  FORTRAN_FLAGS = 
  
diff --git a/metastatscommand.cpp b/metastatscommand.cpp

index 4744424d426d61243894d4779fa60dcd20781be3..c5d349bc77b685b67a1e3251fa70f15602bb215d 100644 (file)
--- a/metastatscommand.cpp
+++ b/metastatscommand.cpp
@@ -483,25 +483,16 @@ int MetaStatsCommand::driver(int start, int num, vector<SharedRAbundVector*>& th
                                 outputNames.pop_back();
                         }else {
                  
-                ofstream outTemp;
-                string tempOut = outputDir + "data." + setA + "-" + setB + ".matrix";
-                m->openOutputFile(tempOut, outTemp);
-                for (int i = 0; i < subset.size(); i++) { outTemp << '\t' << subset[i]->getGroup(); }
-                outTemp << endl;
-                
-                
                                 //fill data
                                 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
                                         //data[j] = new double[subset.size()];
                                         data2[j].resize(subset.size(), 0.0);
-                    outTemp << "OTU" << (j+1);
+                   
                                         for (int i = 0; i < subset.size(); i++) {
                                                 data2[j][i] = (subset[i]->getAbundance(j));
-                        outTemp << '\t' << subset[i]->getAbundance(j);
                                         }
-                    outTemp << endl;
                                 }
-                               outTemp.close();
+                               
                                 m->mothurOut("Comparing " + setA + " and " + setB + "..."); m->mothurOutEndLine(); 
                                 //metastat_main(output, thisLookUp[0]->getNumBins(), subset.size(), threshold, iters, data, setACount);
                                 
diff --git a/mothurout.cpp b/mothurout.cpp

index 98f5ce09608855690a9fc7442814841a064db379..0431d36796dd32cf4ac404dee2843150a4b5b110 100644 (file)
--- a/mothurout.cpp
+++ b/mothurout.cpp
@@ -40,6 +40,7 @@ void MothurOut::printCurrentFiles()  {
                 if (taxonomyfile != "")         {  mothurOut("taxonomy=" + taxonomyfile); mothurOutEndLine();           }
                 if (treefile != "")                     {  mothurOut("tree=" + treefile); mothurOutEndLine();                           }
                 if (flowfile != "")                     {  mothurOut("flow=" + flowfile); mothurOutEndLine();                           }
+        if (biomfile != "")                    {  mothurOut("biom=" + biomfile); mothurOutEndLine();                           }
                 if (processors != "1")          {  mothurOut("processors=" + processors); mothurOutEndLine();           }
                 
         }
@@ -73,6 +74,7 @@ bool MothurOut::hasCurrentFiles()  {
                 if (taxonomyfile != "")         {  return true;                 }
                 if (treefile != "")                     {  return true;                 }
                 if (flowfile != "")                     {  return true;                 }
+        if (biomfile != "")                    {  return true;                 }
                 if (processors != "1")          {  return true;                 }
                 
                 return hasCurrent;
@@ -107,6 +109,7 @@ void MothurOut::clearCurrentFiles()  {
                 accnosfile = "";
                 taxonomyfile = "";      
                 flowfile = "";
+        biomfile = "";
                 processors = "1";
         }
         catch(exception& e) {
@@ -598,6 +601,48 @@ string MothurOut::getPathName(string longName){
  }
  /***********************************************************************/
  
+bool MothurOut::dirCheck(string& dirName){
+       try {
+        
+        string tag = "";
+        #ifdef USE_MPI
+            int pid; 
+            MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+               
+            tag = toString(pid);
+        #endif
+
+        //add / to name if needed
+        string lastChar = dirName.substr(dirName.length()-1);
+        #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+        if (lastChar != "/") { dirName += "/"; }
+        #else
+        if (lastChar != "\\") { dirName += "\\"; }     
+        #endif
+
+        //test to make sure directory exists
+        dirName = getFullPathName(dirName);
+        string outTemp = dirName + tag + "temp";
+        ofstream out;
+        out.open(outTemp.c_str(), ios::trunc);
+        if(!out) {
+            mothurOut(dirName + " directory does not exist or is not writable."); mothurOutEndLine(); 
+        }else{
+            out.close();
+            mothurRemove(outTemp);
+            return true;
+        }
+        
+        return false;
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "dirCheck");
+               exit(1);
+       }       
+    
+}
+/***********************************************************************/
+
  string MothurOut::hasPath(string longName){
         try {
                 string path = "";
@@ -972,7 +1017,8 @@ int MothurOut::appendFiles(string temp, string filename) {
                 
                 int numLines = 0;
                 if (ableToOpen == 0) { //you opened it
-                       while(char c = input.get()){
+                       while(!input.eof()){
+                char c = input.get();
                                 if(input.eof())         {       break;                  }
                                 else                            {       output << c;    if (c == '\n') {numLines++;} }
                         }
@@ -1402,7 +1448,8 @@ map<string, int> MothurOut::readNames(string namefile) {
                         if (control_pressed) { break; }
                         
                         string firstCol, secondCol;
-                       in >> firstCol >> secondCol; gobble(in);
+                       in >> firstCol;  gobble(in);
+            in >> secondCol; gobble(in);
                         
                         int num = getNumNames(secondCol);
                         
@@ -1919,6 +1966,25 @@ void MothurOut::splitAtComma(string& estim, vector<string>& container) {
                 exit(1);
         }       
  }
+/***********************************************************************/
+//This function splits up the various option parameters
+void MothurOut::splitAtChar(string& prefix, string& suffix, char c){
+       try {
+               prefix = suffix.substr(0,suffix.find_first_of(c));
+               if ((suffix.find_first_of(c)+2) <= suffix.length()) {  //checks to make sure you don't have comma at end of string
+                       suffix = suffix.substr(suffix.find_first_of(c)+1, suffix.length());
+                       string space = " ";
+                       while(suffix.at(0) == ' ')
+                               suffix = suffix.substr(1, suffix.length());
+               }
+        
+       }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "splitAtComma");
+               exit(1);
+       }       
+}
+
  /***********************************************************************/
  
  //This function splits up the various option parameters
diff --git a/mothurout.h b/mothurout.h

index e1c8222ae1938e8b63422cc9ee6567d309b668f5..9ce698be4e1963e6af9abee67673ded3e1743fde 100644 (file)
--- a/mothurout.h
+++ b/mothurout.h
@@ -65,7 +65,7 @@ class MothurOut {
         
                 vector<string> getAllGroups() { sort(namesOfGroups.begin(), namesOfGroups.end()); return namesOfGroups; }
                 vector<string> Treenames;
-               map<string, string> names;
+               //map<string, string> names;
                 vector<string> binLabelsInFile;
                 vector<string> currentBinLabels;
                 string saveNextLabel, argv, sharedHeaderMode;
@@ -73,6 +73,7 @@ class MothurOut {
                 
                 //functions from mothur.h
                 //file operations
+        bool dirCheck(string&); //completes path, appends appropriate / or \, makes sure dir is writable.
                 vector<unsigned long long> divideFile(string, int&);
                 int divideFile(string, int&, vector<string>&);
                 vector<unsigned long long> setFilePosEachLine(string, int&);
@@ -127,6 +128,7 @@ class MothurOut {
                 void splitAtDash(string&, set<string>&);
                 void splitAtDash(string&, vector<string>&);
                 void splitAtChar(string&, vector<string>&, char);
+        void splitAtChar(string&, string&, char);
                 int removeConfidences(string&);
                 
                 //math operation
@@ -161,6 +163,7 @@ class MothurOut {
                 string getAccnosFile()          { return accnosfile;            }
                 string getTaxonomyFile()        { return taxonomyfile;          }
                 string getFlowFile()            { return flowfile;                      }
+        string getBiomFile()           { return biomfile;                      }
                 string getProcessors()          { return processors;            }
                 
                 void setListFile(string f)                      { listfile = getFullPathName(f);                        }
@@ -183,6 +186,7 @@ class MothurOut {
                 void setAccnosFile(string f)            { accnosfile = getFullPathName(f);                      }
                 void setTaxonomyFile(string f)          { taxonomyfile = getFullPathName(f);            }
                 void setFlowFile(string f)                      { flowfile = getFullPathName(f);                        }
+        void setBiomFile(string f)                     { biomfile = getFullPathName(f);                        }
                 void setProcessors(string p)            { processors = p;                                                       }
                 
                 void printCurrentFiles();
@@ -216,6 +220,7 @@ class MothurOut {
                         taxonomyfile = "";
                         processors = "1";
                         flowfile = "";
+            biomfile = "";
                         gui = false;
                         printedHeaders = false;
                         commandInputsConvertError = false;
@@ -228,7 +233,7 @@ class MothurOut {
                 string defaultPath, outputDir;
                 string releaseDate, version;
         
-               string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile;
+               string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile, biomfile;
                 string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile;
  
                 vector<string> Groups;
diff --git a/optionparser.cpp b/optionparser.cpp

index 06a900d8137a031236d52261882c744bb169b440..0d6ed2d4d0bfa681a7aa30693bcb7ae30a9c77fb 100644 (file)
--- a/optionparser.cpp
+++ b/optionparser.cpp
@@ -91,6 +91,8 @@ map<string, string> OptionParser::getParameters() {
                                         it->second = m->getAccnosFile();
                                 }else if (it->first == "taxonomy") {
                                         it->second = m->getTaxonomyFile();
+                }else if (it->first == "biom") {
+                        it->second = m->getBiomFile();
                                 }else {
                                         m->mothurOut("[ERROR]: mothur does not save a current file for " + it->first); m->mothurOutEndLine();
                                 }
diff --git a/otuassociationcommand.cpp b/otuassociationcommand.cpp

index eeefb41ffa9781aacad02a594cd4d61859631cb0..0d41e63f3526bc9af4e3933d5f3337d62ebd0f2a 100644 (file)
--- a/otuassociationcommand.cpp
+++ b/otuassociationcommand.cpp
@@ -308,8 +308,7 @@ int OTUAssociationCommand::process(vector<SharedRAbundVector*>& lookup){
                                 else if (method == "kendall")   {       coef = linear.calcKendall(xy[i], xy[k], sig);   }                   
                                 else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; }
                         
-                               if (m->binLabelsInFile.size() != 0) { out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; }
-                else { out << i+1 << '\t' << k+1 << '\t' << coef << '\t' << sig << endl; }
+                out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl;
                         }
                 }
                 
@@ -437,8 +436,7 @@ int OTUAssociationCommand::process(vector<SharedRAbundFloatVector*>& lookup){
                                 else if (method == "kendall")   {       coef = linear.calcKendall(xy[i], xy[k], sig);   }                   
                                 else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; }
                                 
-                if (m->binLabelsInFile.size() != 0) { out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; }
-                else { out << i+1 << '\t' << k+1 << '\t' << coef << '\t' << sig << endl; }
+                out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; 
                         }
                 }
                 
diff --git a/pairwiseseqscommand.cpp b/pairwiseseqscommand.cpp

index 98b0fde32dbb4cf9bf86aad283fa1d6ad3a7ea44..a4ccbf8f33b34b5dd8eccebf91242249cc465f28 100644 (file)
--- a/pairwiseseqscommand.cpp
+++ b/pairwiseseqscommand.cpp
@@ -203,12 +203,15 @@ PairwiseSeqsCommand::PairwiseSeqsCommand(string option)  {
                         
                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
                         m->mothurConvert(temp, misMatch);  
+            if (misMatch > 0) { m->mothurOut("[ERROR]: mismatch must be negative.\n"); abort=true; }
                         
                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
                         m->mothurConvert(temp, gapOpen);  
+            if (gapOpen > 0) { m->mothurOut("[ERROR]: gapopen must be negative.\n"); abort=true; }
                         
                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
                         m->mothurConvert(temp, gapExtend); 
+            if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
                         
                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
                         m->setProcessors(temp);
diff --git a/parsimony.cpp b/parsimony.cpp

index d26bc270efcdd7fff7cb1292ddc2bfb786a48665..3b0f31759c0563031874aa16cb0d3d515a65653b 100644 (file)
--- a/parsimony.cpp
+++ b/parsimony.cpp
@@ -15,6 +15,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
         try {
                 processors = p;
                 outputDir = o;
+        TreeMap* tmap = t->getTreeMap();
                 
                 //if the users enters no groups then give them the score of all groups
                 vector<string> mGroups = m->getGroups();
@@ -56,7 +57,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
                 
         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 if(processors == 1){
-                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
                 }else{
                         lines.clear();
                         int numPairs = namesOfGroupCombos.size();
@@ -73,10 +74,10 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
                                 lines.push_back(linePair(startPos, numPairsPerProcessor));
                         }
                         
-                       data = createProcesses(t, namesOfGroupCombos);
+                       data = createProcesses(t, namesOfGroupCombos, tmap);
                 }
         #else
-               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
         #endif
                 
                 return data;
@@ -89,7 +90,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
  }
  /**************************************************************************************************/
  
-EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
+EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 int process = 1;
@@ -106,7 +107,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGr
                                 process++;
                         }else if (pid == 0){
                                 EstOutput myresults;
-                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num);
+                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
                                 
                                 if (m->control_pressed) { exit(0); }
                                 
@@ -126,7 +127,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGr
                         }
                 }
                 
-               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num);
+               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<processIDS.size();i++) { 
@@ -169,7 +170,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGr
         }
  }
  /**************************************************************************************************/
-EstOutput Parsimony::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num) { 
+EstOutput Parsimony::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) { 
         try {
                 
                 EstOutput results; results.resize(num);
diff --git a/parsimony.h b/parsimony.h

index b116aa2d1a55e0ba84571f1f4c9500c6be5d8fcc..7316d508dd52729c8ddec87c34a08c76586cd4d1 100644 (file)
--- a/parsimony.h
+++ b/parsimony.h
@@ -19,10 +19,9 @@
  class Parsimony : public TreeCalculator  {
         
         public:
-               Parsimony(TreeMap* t) : tmap(t) {};
+               Parsimony() {};
                 ~Parsimony() {};
                 EstOutput getValues(Tree*, int, string);
-               //EstOutput getValues(Tree*, string, string) { return data; }
                 
         private:
                 struct linePair {
@@ -33,12 +32,11 @@ class Parsimony : public TreeCalculator  {
                 vector<linePair> lines;
         
                 EstOutput data;
-               TreeMap* tmap;
                 int processors;
                 string outputDir;
         
-               EstOutput driver(Tree*, vector< vector<string> >, int, int); 
-               EstOutput createProcesses(Tree*, vector< vector<string> >);
+               EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*); 
+               EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
  };
  
  /***********************************************************************/
diff --git a/parsimonycommand.cpp b/parsimonycommand.cpp

index 2d46efc9efebcfbe7807292cb814cf58686f9ac4..50e1bfa7e590d6e2fa47b4c184700b440cdb9621 100644 (file)
--- a/parsimonycommand.cpp
+++ b/parsimonycommand.cpp
@@ -8,6 +8,7 @@
   */
  
  #include "parsimonycommand.h"
+#include "treereader.h"
  
  //**********************************************************************************************************************
  vector<string> ParsimonyCommand::setParameters(){      
@@ -125,12 +126,6 @@ ParsimonyCommand::ParsimonyCommand(string option)  {
                                 }
                         }
                         
-                       m->runParse = true;
-                       m->clearGroups();
-                       m->clearAllGroups();
-                       m->Treenames.clear();
-                       m->names.clear();
-                       
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
                         
                         randomtree = validParameter.validFile(parameters, "random", false);             if (randomtree == "not found") { randomtree = ""; }
@@ -203,68 +198,11 @@ int ParsimonyCommand::execute() {
                         
                         m->setTreeFile(treefile);
                         
-                       if (groupfile != "") {
-                               //read in group map info.
-                               tmap = new TreeMap(groupfile);
-                               tmap->readMap();
-                       }else{ //fake out by putting everyone in one group
-                               Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
-                               tmap = new TreeMap();
-                               
-                               for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
-                       }
-                       
-                       if (namefile != "") { readNamesFile(); }
-                       
-                       read = new ReadNewickTree(treefile);
-                       int readOk = read->read(tmap); 
-                       
-                       if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-                       
-                       read->AssembleTrees();
-                       T = read->getTrees();
-                       delete read;
-
-                       //make sure all files match
-                       //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
-                       int numNamesInTree;
-                       if (namefile != "")  {  
-                               if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
-                               else {   numNamesInTree = m->Treenames.size();  }
-                       }else {  numNamesInTree = m->Treenames.size();  }
-                       
-                       
-                       //output any names that are in group file but not in tree
-                       if (numNamesInTree < tmap->getNumSeqs()) {
-                               for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
-                                       //is that name in the tree?
-                                       int count = 0;
-                                       for (int j = 0; j < m->Treenames.size(); j++) {
-                                               if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
-                                               count++;
-                                       }
-                                       
-                                       if (m->control_pressed) { 
-                                               delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
-                                               for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                               m->clearGroups();
-                                               return 0;
-                                       }
-                                       
-                                       //then you did not find it so report it 
-                                       if (count == m->Treenames.size()) { 
-                                               //if it is in your namefile then don't remove
-                                               map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-                                               
-                                               if (it == nameMap.end()) {
-                                                       m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
-                                                       tmap->removeSeq(tmap->namesOfSeqs[i]);
-                                                       i--; //need this because removeSeq removes name from namesOfSeqs
-                                               }
-                                       }
-                               }
-                       }
-                               
+            TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+            T = reader->getTrees();
+            tmap = T[0]->getTreeMap();
+            delete reader;
+       
                         if(outputDir == "") { outputDir += m->hasPath(treefile); }
                         output = new ColumnFile(outputDir + m->getSimpleName(treefile)  +  ".parsimony", itersString);
                         outputNames.push_back(outputDir + m->getSimpleName(treefile)  +  ".parsimony");
@@ -284,24 +222,23 @@ int ParsimonyCommand::execute() {
                 }
                         
                 //set users groups to analyze
-               util = new SharedUtil();
+               SharedUtil util;
                 vector<string> mGroups = m->getGroups();
                 vector<string> tGroups = tmap->getNamesOfGroups();
-               util->setGroups(mGroups, tGroups, allGroups, numGroups, "parsimony");   //sets the groups the user wants to analyze
-               util->getCombos(groupComb, mGroups, numComp);
+               util.setGroups(mGroups, tGroups, allGroups, numGroups, "parsimony");    //sets the groups the user wants to analyze
+               util.getCombos(groupComb, mGroups, numComp);
                 m->setGroups(mGroups);
-               delete util;
                         
                 if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
                         
-               pars = new Parsimony(tmap);
+               Parsimony pars;
                 counter = 0;
         
                 Progress* reading;
                 reading = new Progress("Comparing to random:", iters);
                 
                 if (m->control_pressed) { 
-                       delete reading; delete pars; delete output;
+                       delete reading; delete output;
                         delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                         if (randomtree == "") {  outSum.close();  }
                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
@@ -323,10 +260,10 @@ int ParsimonyCommand::execute() {
                 if (randomtree == "") {
                         //get pscores for users trees
                         for (int i = 0; i < T.size(); i++) {
-                               userData = pars->getValues(T[i], processors, outputDir);  //data = AB, AC, BC, ABC.
+                               userData = pars.getValues(T[i], processors, outputDir);  //data = AB, AC, BC, ABC.
                                 
                                 if (m->control_pressed) { 
-                                       delete reading; delete pars; delete output;
+                                       delete reading; delete output;
                                         delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                                         if (randomtree == "") {  outSum.close();  }
                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
@@ -362,10 +299,10 @@ int ParsimonyCommand::execute() {
                                 randT->assembleRandomTree();
  
                                 //get pscore of random tree
-                               randomData = pars->getValues(randT, processors, outputDir);
+                               randomData = pars.getValues(randT, processors, outputDir);
                                 
                                 if (m->control_pressed) { 
-                                       delete reading; delete pars; delete output; delete randT;
+                                       delete reading;  delete output; delete randT;
                                         if (randomtree == "") {  outSum.close();  }
                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
                                         delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
@@ -403,23 +340,17 @@ int ParsimonyCommand::execute() {
                                 randT->assembleRandomTree();
                                 
                                 if (m->control_pressed) { 
-                                       delete reading; delete pars; delete output; delete randT;
-                                       delete tmap; 
-                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                       m->clearGroups();
-                                       return 0;
+                                       delete reading; delete output; delete randT; delete tmap; 
+                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;
                                 }
  
  
                                 //get pscore of random tree
-                               randomData = pars->getValues(randT, processors, outputDir);
+                               randomData = pars.getValues(randT, processors, outputDir);
                                 
                                 if (m->control_pressed) { 
-                                       delete reading; delete pars;  delete output; delete randT;
-                                       delete tmap; 
-                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                       m->clearGroups();
-                                       return 0;
+                                       delete reading; delete output; delete randT; delete tmap; 
+                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;
                                 }
                         
                                 for(int r = 0; r < numComp; r++) {
@@ -471,27 +402,21 @@ int ParsimonyCommand::execute() {
                 }
                 
                 if (m->control_pressed) { 
-                               delete reading; delete pars; delete output;
+                               delete reading; delete output;
                                 delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                                 if (randomtree == "") {  outSum.close();  }
                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                               m->clearGroups();
                                 return 0;
                 }
                 
                 //finish progress bar
                 reading->finish();
                 delete reading;
-
                 
                 printParsimonyFile();
                 if (randomtree == "") { printUSummaryFile(); }
-               
-               //reset groups parameter
-               m->clearGroups(); 
-               
-               delete pars; delete output; 
-               delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+                               
+        delete output; delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;}
                 
@@ -623,46 +548,6 @@ void ParsimonyCommand::getUserInput() {
                 exit(1);
         }
  }
-/*****************************************************************/
-int ParsimonyCommand::readNamesFile() {
-       try {
-               m->names.clear();
-               numUniquesInName = 0;
-               
-               ifstream in;
-               m->openInputFile(namefile, in);
-               
-               string first, second;
-               map<string, string>::iterator itNames;
-               
-               while(!in.eof()) {
-                       in >> first >> second; m->gobble(in);
-                       
-                       numUniquesInName++;
-                       
-                       itNames = m->names.find(first);
-                       if (itNames == m->names.end()) {  
-                               m->names[first] = second; 
-                               
-                               //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
-                               vector<string> dupNames;
-                               m->splitAtComma(second, dupNames);
-                               
-                               for (int i = 0; i < dupNames.size(); i++) {     
-                                       nameMap[dupNames[i]] = dupNames[i]; 
-                                       if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
-                               }
-                       }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
-               }
-               in.close();
-               
-               return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ParsimonyCommand", "readNamesFile");
-               exit(1);
-       }
-}
  /***********************************************************/
  
  
diff --git a/parsimonycommand.h b/parsimonycommand.h

index 81fa99ba06c691f9e0ee06f4b73521655b046e8c..917255696166a505ae5734e20f0aaab47573876c 100644 (file)
--- a/parsimonycommand.h
+++ b/parsimonycommand.h
@@ -36,15 +36,12 @@ public:
         void help() { m->mothurOut(getHelpString()); }
         
  private:
-       ReadTree* read;
-       SharedUtil* util;
         FileOutput* output;
         vector<Tree*> T;           //user trees
         Tree* randT;  //random tree
         Tree* copyUserTree; 
         TreeMap* tmap; 
         TreeMap* savetmap;
-       Parsimony* pars;
         vector<string> groupComb; // AB. AC, BC...
         string sumFile, randomtree, allGroups, outputDir, treefile, groupfile, namefile;
         int iters, numGroups, numComp, counter, processors, numUniquesInName;
diff --git a/pcrseqscommand.h b/pcrseqscommand.h

index 45ce6f3f153e6020f00c859b1928cc1d74ea2a55..420a5ebc3e4b0cb527a3c2c692797c0b627c3b46 100644 (file)
--- a/pcrseqscommand.h
+++ b/pcrseqscommand.h
@@ -45,7 +45,7 @@ private:
         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
      bool abort, keepprimer, keepdots;
         string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch;
-       int start, end, pdiffs, processors, length;
+       int start, end, processors, length;
         
      vector<string> revPrimer, outputNames;
         vector<string> primers;
diff --git a/phylodiversity.cpp b/phylodiversity.cpp

deleted file mode 100644 (file)

index d5ccaf9..0000000
--- a/phylodiversity.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- *  phylodiversity.cpp
- *  Mothur
- *
- *  Created by westcott on 4/30/10.
- *  Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "phylodiversity.h"
-
-/**************************************************************************************************
-EstOutput PhyloDiversity::getValues(Tree* t, vector<int> treeNodes, vector< vector<float> >& data) {
-    try {
-               
-               map<string, float> DScore;
-               float totalLength = 0.0;
-               data.clear();
-               
-               //initialize Dscore
-               for (int i=0; i<globaldata->Groups.size(); i++) {               DScore[globaldata->Groups[i]] = 0.0;    }
-       
-               ********************************************************
-               //calculate a D value for each group 
-               for(int v=0;v<treeNodes.size();v++){
-                               
-                       if (m->control_pressed) { return data; }
-                       
-                       //calc the branch length
-                       //while you aren't at root
-                       float sum = 0.0;
-                       int index = treeNodes[v];
-
-                       while(t->tree[index].getParent() != -1){
-                               
-                               //if you have a BL
-                               if(t->tree[index].getBranchLength() != -1){
-                                       sum += abs(t->tree[index].getBranchLength());
-                               }
-                               index = t->tree[index].getParent();
-                       }
-                               
-                       //get last breanch length added
-                       if(t->tree[index].getBranchLength() != -1){
-                               sum += abs(t->tree[index].getBranchLength());
-                       }
-                               
-                       //for each group in the groups update the total branch length accounting for the names file
-                       vector<string> groups = t->tree[treeNodes[v]].getGroup();
-                       for (int j = 0; j < groups.size(); j++) {
-                               int numSeqsInGroupJ = 0;
-                               map<string, int>::iterator it;
-                               it = t->tree[treeNodes[v]].pcount.find(groups[j]);
-                               if (it != t->tree[treeNodes[v]].pcount.end()) { //this leaf node contains seqs from group j
-                                       numSeqsInGroupJ = it->second;
-                               }
-
-                               //add branch length to total for group
-                               DScore[groups[j]] += (numSeqsInGroupJ * sum);
-                       }
-                       
-               }
-               
-       
-               for (int i=0; i<globaldata->Groups.size(); i++) {   
-                       float percent = DScore[globaldata->Groups[i]]; 
-                       data.push_back(percent);  
-                       
-               }
-               
-               return data;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "PhyloDiversity", "getValues");
-               exit(1);
-       }
-}
-**************************************************************************************************/
-
-
-
diff --git a/phylodiversity.h b/phylodiversity.h

deleted file mode 100644 (file)

index a789efa..0000000
--- a/phylodiversity.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef PHYLODIVERSITY_H
-#define PHYLODIVERSITY_H
-
-
-/*
- *  phylodiversity.h
- *  Mothur
- *
- *  Created by westcott on 4/30/10.
- *  Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "treemap.h"
-#include "mothurout.h"
-
-
-/***********************************************************************/
-
-class PhyloDiversity  {
-       
-       public:
-               PhyloDiversity(TreeMap* t) : tmap(t) {  m = MothurOut::getInstance(); }
-               ~PhyloDiversity() {};
-               
-               //int getValues(Tree*, vector<int>, vector< vector< float> >&);
-               
-               
-       private:
-               MothurOut* m;
-               TreeMap* tmap;
-};
-
-/***********************************************************************/
-
-
-#endif
-
diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp

index abf9591f4060482a158df9b008e0f357d6c6b069..3db101a89f3c60e93ebb7961ac75bdea5524f630 100644 (file)
--- a/phylodiversitycommand.cpp
+++ b/phylodiversitycommand.cpp
@@ -8,6 +8,7 @@
   */
  
  #include "phylodiversitycommand.h"
+#include "treereader.h"
  
  //**********************************************************************************************************************
  vector<string> PhyloDiversityCommand::setParameters(){ 
@@ -136,12 +137,6 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
                                 }
                         }
                         
-                       m->runParse = true;
-                       m->clearGroups();
-                       m->clearAllGroups();
-                       m->Treenames.clear();
-                       m->names.clear();
-                       
                         //check for required parameters
                         treefile = validParameter.validFile(parameters, "tree", true);
                         if (treefile == "not open") { treefile = ""; abort = true; }
@@ -218,74 +213,15 @@ int PhyloDiversityCommand::execute(){
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
                 m->setTreeFile(treefile);
-               
-               if (groupfile != "") {
-                       //read in group map info.
-                       tmap = new TreeMap(groupfile);
-                       tmap->readMap();
-               }else{ //fake out by putting everyone in one group
-                       Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
-                       tmap = new TreeMap();
-                       
-                       for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
-               }
-               
-               if (namefile != "") { readNamesFile(); }
-               
-               read = new ReadNewickTree(treefile);
-               int readOk = read->read(tmap); 
-               
-               if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-               
-               read->AssembleTrees();
-               vector<Tree*> trees = read->getTrees();
-               delete read;
-               
-               //make sure all files match
-               //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
-               int numNamesInTree;
-               if (namefile != "")  {  
-                       if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
-                       else {   numNamesInTree = m->Treenames.size();  }
-               }else {  numNamesInTree = m->Treenames.size();  }
-               
-               
-               //output any names that are in group file but not in tree
-               if (numNamesInTree < tmap->getNumSeqs()) {
-                       for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
-                               //is that name in the tree?
-                               int count = 0;
-                               for (int j = 0; j < m->Treenames.size(); j++) {
-                                       if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
-                                       count++;
-                               }
-                               
-                               if (m->control_pressed) { 
-                                       delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; }
-                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                       m->clearGroups();
-                                       return 0;
-                               }
-                               
-                               //then you did not find it so report it 
-                               if (count == m->Treenames.size()) { 
-                                       //if it is in your namefile then don't remove
-                                       map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-                                       
-                                       if (it == nameMap.end()) {
-                                               m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
-                                               tmap->removeSeq(tmap->namesOfSeqs[i]);
-                                               i--; //need this because removeSeq removes name from namesOfSeqs
-                                       }
-                               }
-                       }
-               }
-               
-               SharedUtil* util = new SharedUtil();
+        TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+        vector<Tree*> trees = reader->getTrees();
+        tmap = trees[0]->getTreeMap();
+        delete reader;
+
+               SharedUtil util;
                 vector<string> mGroups = m->getGroups();
                 vector<string> tGroups = tmap->getNamesOfGroups();
-               util->setGroups(mGroups, tGroups, "phylo.diversity");   //sets the groups the user wants to analyze
-               delete util;
+               util.setGroups(mGroups, tGroups, "phylo.diversity");    //sets the groups the user wants to analyze
                 
                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
                 for (int i = 0; i < mGroups.size(); i++) { if (mGroups[i] == "xxx") { mGroups.erase(mGroups.begin()+i);  break; }  }
@@ -711,47 +647,6 @@ vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< st
                 exit(1);
         }
  }
-/*****************************************************************/
-int PhyloDiversityCommand::readNamesFile() {
-       try {
-               m->names.clear();
-               numUniquesInName = 0;
-               
-               ifstream in;
-               m->openInputFile(namefile, in);
-               
-               string first, second;
-               map<string, string>::iterator itNames;
-               
-               while(!in.eof()) {
-                       in >> first >> second; m->gobble(in);
-                       
-                       numUniquesInName++;
-                       
-                       itNames = m->names.find(first);
-                       if (itNames == m->names.end()) {  
-                               m->names[first] = second; 
-                               
-                               //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
-                               vector<string> dupNames;
-                               m->splitAtComma(second, dupNames);
-                               
-                               for (int i = 0; i < dupNames.size(); i++) {     
-                                       nameMap[dupNames[i]] = dupNames[i]; 
-                                       if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
-                               }
-                       }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
-               }
-               in.close();
-               
-               return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "PhyloDiversityCommand", "readNamesFile");
-               exit(1);
-       }
-}
-
  //**********************************************************************************************************************
  
  
diff --git a/phylodiversitycommand.h b/phylodiversitycommand.h

index b44e6caed27037ddb177acdd1416df40550dee5c..5d0cccf249099ac51d86b58e41e5098793ef3f3c 100644 (file)
--- a/phylodiversitycommand.h
+++ b/phylodiversitycommand.h
@@ -12,9 +12,8 @@
  
  #include "command.hpp"
  #include "treemap.h"
-#include "readtree.h"
  #include "sharedutilities.h"
-
+#include "tree.h"
  
  class PhyloDiversityCommand : public Command {
         
@@ -33,14 +32,12 @@ class PhyloDiversityCommand : public Command {
                 int execute();
                 void help() { m->mothurOut(getHelpString()); }
  private:
-               ReadTree* read;
                 TreeMap* tmap;
                 float freq;
                 int iters, processors, numUniquesInName;  
                 bool abort, rarefy, summary, collect, scale;
                 string groups, outputDir, treefile, groupfile, namefile;
                 vector<string> Groups, outputNames; //holds groups to be used, and outputFile names
-               map<string, string> nameMap;
                 
                 int readNamesFile();
                 void printData(set<int>&, map< string, vector<float> >&, ofstream&, int);
diff --git a/phylosummary.cpp b/phylosummary.cpp

index 7d9fcac2558da8f50653a11ddafd997b58c1c286..2f565150aabeec35cc8a3c03a459d8b85090550b 100644 (file)
--- a/phylosummary.cpp
+++ b/phylosummary.cpp
@@ -25,7 +25,7 @@ PhyloSummary::PhyloSummary(string refTfile, string groupFile){
                 }
                                 
                 //check for necessary files
-               string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
+               string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
                 ifstream FileTest(taxFileNameTest.c_str());
                 
                 if (!FileTest) { 
diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp

index d9c37769b07d24f40efbfef9e076da33f5c86c29..725136bf56a2512355cf170634b625bfa5df1581 100644 (file)
--- a/prcseqscommand.cpp
+++ b/prcseqscommand.cpp
@@ -20,7 +20,6 @@ vector<string> PcrSeqsCommand::setParameters(){
                 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
                 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
                 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
-               CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
          CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots);
@@ -40,8 +39,15 @@ vector<string> PcrSeqsCommand::setParameters(){
  string PcrSeqsCommand::getHelpString(){        
         try {
                 string helpString = "";
-               helpString += "The pcr.seqs command reads a fasta file ...\n";
-               
+               helpString += "The pcr.seqs command reads a fasta file.\n";
+        helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
+               helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
+        helpString += "The start parameter allows you to provide a starting position to trim to.\n";
+        helpString += "The end parameter allows you to provide an ending position to trim from.\n";
+        helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file.  If nomatch=keep, then do nothing to sequence.\n";
+        helpString += "The processors parameter allows you to use multiple processors.\n";
+        helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
+        helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
                 return helpString;
@@ -159,8 +165,6 @@ PcrSeqsCommand::PcrSeqsCommand(string option)  {
                                 
                         }
              
-            //if the user changes the output directory command factory will send this info to us in the output parameter 
-                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
                         
                         //check for required parameters
                         fastafile = validParameter.validFile(parameters, "fasta", true);
@@ -171,7 +175,9 @@ PcrSeqsCommand::PcrSeqsCommand(string option)  {
                         }else if (fastafile == "not open") { fastafile = ""; abort = true; }    
                         else { m->setFastaFile(fastafile); }
                         
-            
+            //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(fastafile);      }
+
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
                         string temp;
@@ -204,10 +210,7 @@ PcrSeqsCommand::PcrSeqsCommand(string option)  {
                         if (taxfile == "not found"){    taxfile = "";           }
                         else if(taxfile == "not open"){ taxfile = ""; abort = true;     } 
              else { m->setTaxonomyFile(taxfile); }
-                       
-                       temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
-                       m->mothurConvert(temp, pdiffs);
-                       
+                                               
                         temp = validParameter.validFile(parameters, "start", false);    if (temp == "not found") { temp = "-1"; }
                         m->mothurConvert(temp, start);
              
diff --git a/preclustercommand.cpp b/preclustercommand.cpp

index f2fbc80088b1adf2927ee78d5b1e02fbf9691cb1..bcff0fcd820a497da04405bef0ddb42d7d60ded9 100644 (file)
--- a/preclustercommand.cpp
+++ b/preclustercommand.cpp
@@ -314,7 +314,17 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF
                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
                                 process++;
                         }else if (pid == 0){
+                outputNames.clear();
                                 num = driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
+                
+                string tempFile = toString(getpid()) + ".outputNames.temp";
+                ofstream outTemp;
+                m->openOutputFile(tempFile, outTemp);
+                
+                outTemp << outputNames.size();
+                for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
+                outTemp.close();
+                
                                 exit(0);
                         }else { 
                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
@@ -331,7 +341,23 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF
                         int temp = processIDS[i];
                         wait(&temp);
                 }
-               
+        
+        for (int i = 0; i < processIDS.size(); i++) {
+            string tempFile = toString(processIDS[i]) +  ".outputNames.temp";
+            ifstream intemp;
+            m->openInputFile(tempFile, intemp);
+            
+            int num;
+            intemp >> num;
+            for (int k = 0; k < num; k++) {
+                string name = "";
+                intemp >> name; m->gobble(intemp);
+                
+                outputNames.push_back(name); outputTypes["map"].push_back(name);
+            }
+            intemp.close();
+            m->mothurRemove(tempFile);
+        }
  #else
                 
                 //////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/rarefactcommand.cpp b/rarefactcommand.cpp

index dabcef486ee7541c183fc594f10da9a9f6624561..82ff7faef0d8886d21fc6b642f34c88c1b143ea1 100644 (file)
--- a/rarefactcommand.cpp
+++ b/rarefactcommand.cpp
@@ -474,7 +474,6 @@ vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames, map
                 
                 //find different types of files
                 map<string, map<string, string> > typesFiles;
-        map<string, string> temp; 
                 for (int i = 0; i < outputNames.size(); i++) {
                         string extension = m->getExtension(outputNames[i]);
                         
@@ -485,9 +484,15 @@ vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames, map
                         string newLine = labels.substr(0, labels.find_first_of('\t'));
                         
                         newLine += "\tGroup" + labels.substr(labels.find_first_of('\t'));
-                       
-            temp[outputNames[i]] = file2Group[i];
-                       typesFiles[extension] = temp;
+            
+            map<string, map<string, string> >::iterator itfind = typesFiles.find(extension);
+            if (itfind != typesFiles.end()) {
+                (itfind->second)[outputNames[i]] = file2Group[i];
+            }else {
+                map<string, string> temp;  
+                temp[outputNames[i]] = file2Group[i];
+                typesFiles[extension] = temp;
+            }
                         
                         string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
                         
@@ -507,7 +512,6 @@ vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames, map
                         string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first;
                         m->openOutputFileAppend(combineFileName, out);
                         newFileNames.push_back(combineFileName);
-                       
                         map<string, string> thisTypesFiles = it->second;
                 
                         //open each type summary file
@@ -518,7 +522,7 @@ vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames, map
                  
                  string thisfilename = itFileNameGroup->first;
                  string group = itFileNameGroup->second;
-                
+               
                                 ifstream temp;
                                 m->openInputFile(thisfilename, temp);
                                 
@@ -541,7 +545,6 @@ vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames, map
                                         count++;
                                                                         
                                         thisFilesLines.push_back(thisLine);
-                                       
                                         m->gobble(temp);
                                 }
                                 
@@ -562,7 +565,6 @@ vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames, map
                                 for (map<string, string>::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) {
                      
                                         string thisfilename = itFileNameGroup->first;
-                    
                                         map<int, int>::iterator itLine = lineToNumber.find(k);
                                         if (itLine != lineToNumber.end()) {
                                                 string output = toString(itLine->second);
diff --git a/readtree.cpp b/readtree.cpp

index 74b4268a8d75997edc28f4795866177057f9f813..6fa4c3da814347cc60c0807bd16cb373fff8c962 100644 (file)
--- a/readtree.cpp
+++ b/readtree.cpp
@@ -20,12 +20,12 @@ ReadTree::ReadTree() {
         }
  }
  /***********************************************************************/
-int ReadTree::AssembleTrees() {
+int ReadTree::AssembleTrees(map<string, string> nameMap) {
          try {
                  //assemble users trees
                  for (int i = 0; i < Trees.size(); i++) {
                          if (m->control_pressed) { return 0;  }
-                        Trees[i]->assembleTree();
+                        Trees[i]->assembleTree(nameMap);
                  }
                  return 0;
          }
diff --git a/readtree.h b/readtree.h

index b5b26ed0b81ee7d3ad5a87a3515e54881b5daf78..6b074de839070a86bd3e010bbf222a7ec4011dce 100644 (file)
--- a/readtree.h
+++ b/readtree.h
@@ -30,7 +30,7 @@ class ReadTree {
                 float readBranchLength(istream& f);
         
                 vector<Tree*> getTrees() { return Trees; }
-               int AssembleTrees();
+               int AssembleTrees(map<string, string>);
                 
         protected:
                 vector<Tree*> Trees;
diff --git a/removegroupscommand.cpp b/removegroupscommand.cpp

index 49674f57ea9c11a46bd987e587e3678b10fb1c4b..801029c06383e7811e5cfb14ea0c5cd629b66b67 100644 (file)
--- a/removegroupscommand.cpp
+++ b/removegroupscommand.cpp
@@ -465,7 +465,6 @@ int RemoveGroupsCommand::readShared(){
                 delete tempInput;
                 m->setGroups(groupsToKeep);
                 m->clearAllGroups();
-               m->names.clear();
                 m->saveNextLabel = "";
                 m->printedHeaders = false;
                 m->currentBinLabels.clear();
diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp

index 68d3bf583f9c7de2db85ebfa27a2fecdd7738922..1ea58c584054b9b4b5944db18d198fd49e78cc51 100644 (file)
--- a/seqsummarycommand.cpp
+++ b/seqsummarycommand.cpp
@@ -414,7 +414,7 @@ int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<in
                                         //make sure this sequence is in the namefile, else error 
                                         map<string, int>::iterator it = nameMap.find(current.getName());
                                         
-                                       if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+                                       if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
                                         else { num = it->second; }
                                 }
                                 
diff --git a/setcurrentcommand.cpp b/setcurrentcommand.cpp

index 5582abd03d6220da752943278cad50f4bceb1a76..96735405c5e148159ab762a720a93f88ec2354ee 100644 (file)
--- a/setcurrentcommand.cpp
+++ b/setcurrentcommand.cpp
@@ -15,6 +15,7 @@ vector<string> SetCurrentCommand::setParameters(){
                 
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pflow);
+        CommandParameter pbiom("biom", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pbiom);
                 CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pphylip);
                 CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcolumn);
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
@@ -52,7 +53,7 @@ string SetCurrentCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The set.current command allows you to set the current files saved by mothur.\n";
-               helpString += "The set.current command parameters are: clear, phylip, column, list, rabund, sabund, name, group, design, order, tree, shared, ordergroup, relabund, fasta, qfile, sff, oligos, accnos, taxonomy.\n";
+               helpString += "The set.current command parameters are: clear, phylip, column, list, rabund, sabund, name, group, design, order, tree, shared, ordergroup, relabund, fasta, qfile, sff, oligos, accnos, biom and taxonomy.\n";
                 helpString += "The clear paramter is used to indicate which file types you would like to clear values for, multiple types can be separated by dashes.\n";
                 helpString += "The set.current command should be in the following format: \n";
                 helpString += "set.current(fasta=yourFastaFile) or set.current(fasta=amazon.fasta, clear=name-accnos)\n";
@@ -272,6 +273,14 @@ SetCurrentCommand::SetCurrentCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("biom");
+                               //user has given a biom file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["biom"] = inputDir + it->second;             }
+                               }
                         }
                         
                         //check for parameters
@@ -374,6 +383,11 @@ SetCurrentCommand::SetCurrentCommand(string option)  {
                         if (flowfile == "not open") { m->mothurOut("Ignoring: " + parameters["flow"]); m->mothurOutEndLine(); flowfile = ""; }
                         else if (flowfile == "not found") {  flowfile = "";  }  
                         if (flowfile != "") { m->setFlowFile(flowfile); }
+            
+            biomfile = validParameter.validFile(parameters, "biom", true);
+                       if (biomfile == "not open") { m->mothurOut("Ignoring: " + parameters["biom"]); m->mothurOutEndLine(); biomfile = ""; }
+                       else if (biomfile == "not found") {  biomfile = "";  }  
+                       if (biomfile != "") { m->setBiomFile(biomfile); }
                         
                         processors = validParameter.validFile(parameters, "processors", false);
                         if (processors == "not found") {  processors = "1";  }  
@@ -444,6 +458,8 @@ int SetCurrentCommand::execute(){
                                         m->setTaxonomyFile("");
                                 }else if (types[i] == "flow") {
                                         m->setFlowFile("");
+                }else if (types[i] == "biom") {
+                                       m->setBiomFile("");
                                 }else if (types[i] == "processors") {
                                         m->setProcessors("1");
                                 }else if (types[i] == "all") {
diff --git a/setcurrentcommand.h b/setcurrentcommand.h

index 0033ed58cf25a274e9e4bbe2a17aa4e4cb298ef0..3949519e9ec3f04392d555539fd5409dcab59dea 100644 (file)
--- a/setcurrentcommand.h
+++ b/setcurrentcommand.h
@@ -38,7 +38,7 @@ private:
         string clearTypes;
         vector<string> types;
         
-       string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile;
+       string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile, biomfile;
         string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile;
  
         
diff --git a/setdircommand.cpp b/setdircommand.cpp

index 081a306d712a8ee10bb67cf1d72bbf0663d0ff07..55b752dba4f40e76f11a46d2608e75d43c0de506 100644 (file)
--- a/setdircommand.cpp
+++ b/setdircommand.cpp
@@ -101,14 +101,6 @@ int SetDirectoryCommand::execute(){
                 
                 commandFactory = CommandFactory::getInstance();
                 
-               string tag = "";
-#ifdef USE_MPI
-               int pid; 
-               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-               
-               tag = toString(pid);
-#endif
-               
                 m->mothurOut("Mothur's directories:"); m->mothurOutEndLine();
                 
                 //redirect output
@@ -120,27 +112,10 @@ int SetDirectoryCommand::execute(){
                         m->mothurOut("outputDir=" + output); m->mothurOutEndLine();  
                         commandFactory->setOutputDirectory(output);
                 }else {
-                       //add / to name if needed
-                       string lastChar = output.substr(output.length()-1);
-                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-                               if (lastChar != "/") { output += "/"; }
-                       #else
-                               if (lastChar != "\\") { output += "\\"; }       
-                       #endif
-                       
-                       //test to make sure directory exists
-                       output = m->getFullPathName(output);
-                       string outTemp = output + tag + "temp";
-                       ofstream out;
-                       out.open(outTemp.c_str(), ios::trunc);
-                       if(!out) {
-                               m->mothurOut(output + " directory does not exist or is not writable."); m->mothurOutEndLine(); 
-                       }else{
-                               out.close();
-                               m->mothurRemove(outTemp);
-                               m->mothurOut("outputDir=" + output); m->mothurOutEndLine();  
+            if (m->dirCheck(output)) {
+                m->mothurOut("outputDir=" + output); m->mothurOutEndLine();  
                                 commandFactory->setOutputDirectory(output);
-                       }
+            }
                 }
                 
                 //redirect input
@@ -152,28 +127,11 @@ int SetDirectoryCommand::execute(){
                         m->mothurOut("inputDir=" + input); m->mothurOutEndLine();  
                         commandFactory->setInputDirectory(input);
                 }else {
-                       //add / to name if needed
-                       string lastChar = input.substr(input.length()-1);
-                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-                               if (lastChar != "/") { input += "/"; }
-                       #else
-                               if (lastChar != "\\") { input += "\\"; }        
-                       #endif
-                       
-                       //test to make sure directory exists
-                       input = m->getFullPathName(input);
-                       string inTemp = input + tag + "temp";
-                       ofstream in;
-                       in.open(inTemp.c_str(), ios::trunc);
-                       if(!in) {
-                               m->mothurOut(input + " directory does not exist or is not writable."); m->mothurOutEndLine(); 
-                       }else{
-                               in.close();
-                               m->mothurRemove(inTemp);
-                               m->mothurOut("inputDir=" + input); m->mothurOutEndLine();  
+            if (m->dirCheck(input)) {
+                m->mothurOut("inputDir=" + input); m->mothurOutEndLine();  
                                 commandFactory->setInputDirectory(input); 
-                       }
-               }
+            }
+        }
                 
                 //set default
                 if (tempdefault == "clear") {  
@@ -194,28 +152,11 @@ int SetDirectoryCommand::execute(){
                         m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();  
                         m->setDefaultPath(tempdefault);
                 }else {
-                       //add / to name if needed
-                       string lastChar = tempdefault.substr(tempdefault.length()-1);
-                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-                               if (lastChar != "/") { tempdefault += "/"; }
-                       #else
-                               if (lastChar != "\\") { tempdefault += "\\"; }  
-                       #endif
-                       
-                       //test to make sure directory exists
-                       tempdefault = m->getFullPathName(tempdefault);
-                       string inTemp = tempdefault + tag + "temp";
-                       ofstream in;
-                       in.open(inTemp.c_str(), ios::trunc);
-                       if(!in) {
-                               m->mothurOut(tempdefault + " directory does not exist or is not writable."); m->mothurOutEndLine(); 
-                       }else{
-                               in.close();
-                               m->mothurRemove(inTemp);
-                               m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();  
-                               m->setDefaultPath(tempdefault);
-                       }
-               }
+            if (m->dirCheck(tempdefault)) {
+                m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();  
+                               m->setDefaultPath(tempdefault); 
+            }
+        }
  
                 return 0;
         }
diff --git a/sharedcommand.cpp b/sharedcommand.cpp

index 63f83e19a61fec97a6a64121da4f5ea973b5815a..8f05cfcc567a8f0e5d9cbdc867bf15bebd887f11 100644 (file)
--- a/sharedcommand.cpp
+++ b/sharedcommand.cpp
@@ -8,6 +8,8 @@
   */
  
  #include "sharedcommand.h"
+#include "sharedutilities.h"
+
  //********************************************************************************************************************
  //sorts lowest to highest
  inline bool compareSharedRabunds(SharedRAbundVector* left, SharedRAbundVector* right){
@@ -16,8 +18,9 @@ inline bool compareSharedRabunds(SharedRAbundVector* left, SharedRAbundVector* r
  //**********************************************************************************************************************
  vector<string> SharedCommand::setParameters(){ 
         try {
-               CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
+        CommandParameter pbiom("biom", "InputTypes", "", "", "BiomListGroup", "BiomListGroup", "none",false,false); parameters.push_back(pbiom);
+               CommandParameter plist("list", "InputTypes", "", "", "BiomListGroup", "BiomListGroup", "ListGroup",false,false); parameters.push_back(plist);
+               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "ListGroup",false,false); parameters.push_back(pgroup);
                 //CommandParameter pordergroup("ordergroup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pordergroup);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
@@ -37,10 +40,10 @@ vector<string> SharedCommand::setParameters(){
  string SharedCommand::getHelpString(){ 
         try {
                 string helpString = "";
-               helpString += "The make.shared command reads a list and group file and creates a shared file, as well as a rabund file for each group.\n";
-               helpString += "The make.shared command parameters are list, group, ordergroup, groups and label. list and group are required unless a current file is available.\n";
+               helpString += "The make.shared command reads a list and group file or a biom file and creates a shared file. If a list and group are provided a rabund file is created for each group.\n";
+               helpString += "The make.shared command parameters are list, group, biom, groups and label. list and group are required unless a current file is available or you provide a biom file.\n";
                 helpString += "The groups parameter allows you to indicate which groups you want to include, group names should be separated by dashes. ex. groups=A-B-C. Default is all groups in your groupfile.\n";
-               helpString += "The label parameter allows you to indicate which labels you want to include, label names should be separated by dashes. Default is all labels in your list file.\n";
+               helpString += "The label parameter is only valid with the list and group option and allows you to indicate which labels you want to include, label names should be separated by dashes. Default is all labels in your list file.\n";
                 //helpString += "The ordergroup parameter allows you to indicate the order of the groups in the sharedfile, by default the groups are listed alphabetically.\n";
                 return helpString;
         }
@@ -111,12 +114,20 @@ SharedCommand::SharedCommand(string option)  {
                                          if (path == "") {      parameters["group"] = inputDir + it->second;            }
                                  }
                          
-                                it = parameters.find("ordergroup");
+                                /*it = parameters.find("ordergroup");
                                  //user has given a template file
                                  if(it != parameters.end()){ 
                                          path = m->hasPath(it->second);
                                          //if the user has not given a path then, add inputdir. else leave path alone.
                                          if (path == "") {      parameters["ordergroup"] = inputDir + it->second;               }
+                                }*/
+                 
+                 it = parameters.find("biom");
+                                //user has given a biom file
+                                if(it != parameters.end()){ 
+                                        path = m->hasPath(it->second);
+                                        //if the user has not given a path then, add inputdir. else leave path alone.
+                                        if (path == "") {      parameters["biom"] = inputDir + it->second;             }
                                  }
                          }
                          
@@ -127,11 +138,13 @@ SharedCommand::SharedCommand(string option)  {
                          //check for required parameters
                          listfile = validParameter.validFile(parameters, "list", true);
                          if (listfile == "not open") { listfile = ""; abort = true; }
-                        else if (listfile == "not found") { 
-                                listfile = m->getListFile(); 
-                                if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
-                                else {         m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
-                        }else { m->setListFile(listfile); }    
+                        else if (listfile == "not found") { listfile = "";  }
+                        else { m->setListFile(listfile); }     
+            
+             biomfile = validParameter.validFile(parameters, "biom", true);
+             if (biomfile == "not open") { biomfile = ""; abort = true; }
+             else if (biomfile == "not found") { biomfile = "";  }
+             else { m->setBiomFile(biomfile); }                
                                                         
                          ordergroupfile = validParameter.validFile(parameters, "ordergroup", true);
                          if (ordergroupfile == "not open") { abort = true; }    
@@ -139,28 +152,37 @@ SharedCommand::SharedCommand(string option)  {
                                                  
                          groupfile = validParameter.validFile(parameters, "group", true);
                          if (groupfile == "not open") { groupfile = ""; abort = true; } 
-                        else if (groupfile == "not found") { 
-                                groupfile = m->getGroupFile(); 
-                                if (groupfile != "") { 
-                                        m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine();
-                                        groupMap = new GroupMap(groupfile);
-                                        
-                                        int error = groupMap->readMap();
-                                        if (error == 1) { abort = true; }
-                                        vector<string> allGroups = groupMap->getNamesOfGroups();
-                                        m->setAllGroups(allGroups);
-                                }
-                                else {         m->mothurOut("You have no current group file and the group parameter is required."); m->mothurOutEndLine(); abort = true; }
-                        }else {  
-                                groupMap = new GroupMap(groupfile);
-                        
-                                int error = groupMap->readMap();
-                                if (error == 1) { abort = true; }
-                                vector<string> allGroups = groupMap->getNamesOfGroups();
-                                m->setAllGroups(allGroups);
-                                m->setGroupFile(groupfile);
-                        }
+                        else if (groupfile == "not found") { groupfile = ""; }
+                        else {  m->setGroupFile(groupfile); }
                          
+            if ((biomfile == "") && (listfile == "")) { 
+                               //is there are current file available for either of these?
+                               //give priority to list, then biom
+                               listfile = m->getListFile(); 
+                               if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
+                               else { 
+                                       biomfile = m->getBiomFile(); 
+                                       if (biomfile != "") {  m->mothurOut("Using " + biomfile + " as input file for the biom parameter."); m->mothurOutEndLine(); }
+                                       else { 
+                                               m->mothurOut("No valid current files. You must provide a list or biom file before you can use the make.shared command."); m->mothurOutEndLine(); 
+                                               abort = true;
+                                       }
+                               }
+                       }
+                       else if ((biomfile != "") && (listfile != "")) { m->mothurOut("When executing a make.shared command you must enter ONLY ONE of the following: list or biom."); m->mothurOutEndLine(); abort = true; }
+                       
+                       if (listfile != "") {
+                               if (groupfile == "") { 
+                                       groupfile = m->getGroupFile(); 
+                                       if (groupfile != "") {  m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
+                                       else { 
+                                               m->mothurOut("You need to provide a groupfle if you are going to use the list format."); m->mothurOutEndLine(); 
+                                               abort = true; 
+                                       }       
+                               }
+                       }
+
+                        
                          string groups = validParameter.validFile(parameters, "groups", false);                 
                          if (groups == "not found") { groups = ""; }
                          else { 
@@ -190,238 +212,23 @@ int SharedCommand::execute(){
         try {
                 
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
-               
+                       
                 //getting output filename
-               filename = listfile;
+        string filename = "";
+               if (listfile != "") { filename = listfile; }
+        else { filename = biomfile; }
                 
                 if (outputDir == "") { outputDir += m->hasPath(filename); }
                 
                 filename = outputDir + m->getRootName(m->getSimpleName(filename));
                 filename = filename + "shared";
-               outputTypes["shared"].push_back(filename);
-               
-               m->openOutputFile(filename, out);
-               pickedGroups = false;
-               
-               //if hte user has not specified any groups then use them all
-               if (Groups.size() == 0) {
-                       Groups = groupMap->getNamesOfGroups(); m->setGroups(Groups);
-               }else { pickedGroups = true; }
-               
-               //fill filehandles with neccessary ofstreams
-               int i;
-               ofstream* temp;
-               for (i=0; i<Groups.size(); i++) {
-                       temp = new ofstream;
-                       filehandles[Groups[i]] = temp;
-               }
-               
-               //set fileroot
-               fileroot = outputDir + m->getRootName(m->getSimpleName(listfile));
-               
-               //clears file before we start to write to it below
-               for (int i=0; i<Groups.size(); i++) {
-                       m->mothurRemove((fileroot + Groups[i] + ".rabund"));
-                       outputNames.push_back((fileroot + Groups[i] + ".rabund"));
-                       outputTypes["rabund"].push_back((fileroot + Groups[i] + ".rabund"));
-               }
-               
-               //lookup.clear();
-               string errorOff = "no error";
-               //errorOff = "";
-               
-               //if user provided an order file containing the order the shared file should be in read it
-               if (ordergroupfile != "") { readOrderFile(); }
-               
-               input = new InputData(listfile, "shared");
-               SharedList = input->getSharedListVector();
-               string lastLabel = SharedList->getLabel();
-               vector<SharedRAbundVector*> lookup; 
-               
-               if (m->control_pressed) { 
-                       delete input; delete SharedList; delete groupMap; 
-                       for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
-                       out.close(); m->mothurRemove(filename); 
-                       for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));             }
-                       return 0; 
-               }
-               
-               //sanity check
-               int error = ListGroupSameSeqs();
-               
-               if ((!pickedGroups) && (SharedList->getNumSeqs() != groupMap->getNumSeqs())) {  //if the user has not specified any groups and their files don't match exit with error
-                       m->mothurOut("Your group file contains " + toString(groupMap->getNumSeqs()) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); m->mothurOutEndLine(); 
-                       
-                       out.close();
-                       m->mothurRemove(filename); //remove blank shared file you made
-                       
-                       createMisMatchFile();
-                       
-                       //delete memory
-                       for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
-                               delete it3->second;
-                       }
-               
-                       delete input; delete SharedList; delete groupMap; 
-                       
-                       return 0; 
-               }
-               
-               if (error == 1) { m->control_pressed = true; }
-               
-               //if user has specified groups make new groupfile for them
-               if (pickedGroups) { //make new group file
-                       string groups = "";
-                       if (m->getNumGroups() < 4) {
-                               for (int i = 0; i < m->getNumGroups(); i++) {
-                                       groups += (m->getGroups())[i] + ".";
-                               }
-                       }else { groups = "merge"; }
-               
-                       string newGroupFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + groups + "groups";
-                       outputTypes["group"].push_back(newGroupFile); 
-                       outputNames.push_back(newGroupFile);
-                       ofstream outGroups;
-                       m->openOutputFile(newGroupFile, outGroups);
-               
-                       vector<string> names = groupMap->getNamesSeqs();
-                       string groupName;
-                       for (int i = 0; i < names.size(); i++) {
-                               groupName = groupMap->getGroup(names[i]);
-                               if (isValidGroup(groupName, m->getGroups())) {
-                                       outGroups << names[i] << '\t' << groupName << endl;
-                               }
-                       }
-                       outGroups.close();
-               }
-               
-               //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
-               set<string> processedLabels;
-               set<string> userLabels = labels;        
-       
-               while((SharedList != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
-                       if (m->control_pressed) { 
-                               delete input; delete SharedList; delete groupMap;
-                               for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
-                               out.close(); m->mothurRemove(filename); 
-                               for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));             }
-                               return 0; 
-                       }
-               
-                       if(allLines == 1 || labels.count(SharedList->getLabel()) == 1){
-                                       
-                                       lookup = SharedList->getSharedRAbundVector();
-                                       
-                                       m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
-                                       if (pickedGroups) { //check for otus with no seqs in them
-                                               eliminateZeroOTUS(lookup);
-                                       }
-                                       
-                                       if (m->control_pressed) { 
-                                               delete input; delete SharedList; delete groupMap; 
-                                               for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
-                                               for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
-                                               out.close(); m->mothurRemove(filename); 
-                                               for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));             }
-                                               return 0; 
-                                       }
-                                       
-                                       if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
-                                       printSharedData(lookup); //prints info to the .shared file
-                                       for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
-                               
-                                       processedLabels.insert(SharedList->getLabel());
-                                       userLabels.erase(SharedList->getLabel());
-                       }
-                       
-                       if ((m->anyLabelsToProcess(SharedList->getLabel(), userLabels, errorOff) == true) && (processedLabels.count(lastLabel) != 1)) {
-                                       string saveLabel = SharedList->getLabel();
-                                       
-                                       delete SharedList;
-                                       SharedList = input->getSharedListVector(lastLabel); //get new list vector to process
-                                       
-                                       lookup = SharedList->getSharedRAbundVector();
-                                       m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
-                                       if (pickedGroups) { //check for otus with no seqs in them
-                                               eliminateZeroOTUS(lookup);
-                                       }
-                                       
-                                       
-                                       if (m->control_pressed) { 
-                                               delete input; delete SharedList; delete groupMap; 
-                                               for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
-                                               for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
-                                               out.close(); m->mothurRemove(filename); 
-                                               for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));             }
-                                               return 0; 
-                                       }
-                                       
-                                       if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
-                                       printSharedData(lookup); //prints info to the .shared file
-                                       for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
-                                       
-                                       processedLabels.insert(SharedList->getLabel());
-                                       userLabels.erase(SharedList->getLabel());
-                                       
-                                       //restore real lastlabel to save below
-                                       SharedList->setLabel(saveLabel);
-                       }
-                       
-               
-                       lastLabel = SharedList->getLabel();
-                               
-                       delete SharedList;
-                       SharedList = input->getSharedListVector(); //get new list vector to process
-               }
-               
-               //output error messages about any remaining user labels
-               set<string>::iterator it;
-               bool needToRun = false;
-               for (it = userLabels.begin(); it != userLabels.end(); it++) {  
-                       if (processedLabels.count(lastLabel) != 1) {
-                               needToRun = true;
-                       }
-               }
-               
-               //run last label if you need to
-               if (needToRun == true)  {
-                       if (SharedList != NULL) {       delete SharedList;      }
-                       SharedList = input->getSharedListVector(lastLabel); //get new list vector to process
-                                       
-                       lookup = SharedList->getSharedRAbundVector();
-                       m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
-                       if (pickedGroups) { //check for otus with no seqs in them
-                               eliminateZeroOTUS(lookup);
-                       }
-                       
-                       if (m->control_pressed) { 
-                               delete input;  delete groupMap;
-                                       for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;   }
-                                       out.close(); m->mothurRemove(filename); 
-                                       for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));             }
-                                       return 0; 
-                       }
-                       
-                       if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
-                       printSharedData(lookup); //prints info to the .shared file
-                       for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
-                       delete SharedList;
-               }
-               
-               out.close();
-               
-               for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
-                       delete it3->second;
-               }
-
-               delete input; delete groupMap;
-               
-               if (m->control_pressed) { 
-                               m->mothurRemove(filename); 
-                               for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));             }
-                               return 0; 
-               }
+               outputNames.push_back(filename); outputTypes["shared"].push_back(filename);
                 
+        if (listfile != "") {  createSharedFromListGroup(filename);  }
+        else {   createSharedFromBiom(filename);  }
+        
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); }  }
+        
                 //set rabund file as new current rabundfile
                 string current = "";
                 itTypes = outputTypes.find("rabund");
@@ -442,7 +249,6 @@ int SharedCommand::execute(){
                 m->mothurOutEndLine();
                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
-               m->mothurOut(filename); m->mothurOutEndLine();
                 m->mothurOutEndLine();
                 
                 return 0;
@@ -453,7 +259,718 @@ int SharedCommand::execute(){
         }
  }
  //**********************************************************************************************************************
-void SharedCommand::printSharedData(vector<SharedRAbundVector*> thislookup) {
+//Creates the shared file named filename from the biom file stored in biomfile.
+//The biom file is read one line at a time and getTag(line) selects how the line is handled:
+//  type, matrix_type, matrix_element_type - checked against the supported values; anything else
+//                                           is reported and sets m->control_pressed
+//  rows / columns - OTU names and group names are read with readRows; columns also passes the
+//                   groups through SharedUtil::setGroups and registers a
+//                   <fileroot><group>.rabund output for each remaining group
+//  shape - compared with the number of rows and columns that were read
+//  data - parsed by readData and written to filename with printSharedData
+//Returns 0. Problems are reported with m->mothurOut and flagged through m->control_pressed.
+//NOTE(review): getTag is called a second time on the same line to obtain the value - presumably
+//it consumes the text it returns; confirm against getTag.
+//NOTE(review): the ofstreams stored in filehandles are not released in this function - confirm
+//they are freed elsewhere.
+int SharedCommand::createSharedFromBiom(string filename) {
+    try {
+        ofstream out;
+        m->openOutputFile(filename, out);
+
+        /*{
+            "id":"/Users/SarahsWork/Desktop/release/temp.job2.shared-unique",
+            "format": "Biological Observation Matrix 0.9.1",
+            "format_url": "http://biom-format.org",
+            "type": "OTU table",
+            "generated_by": "mothur1.24.0",
+            "date": "Tue Apr 17 13:12:07 2012", */
+
+        ifstream in;
+        m->openInputFile(biomfile, in);
+
+        m->getline(in); m->gobble(in);  //grab first '{'
+
+        string matrixFormat = "";
+        int numRows = 0;
+        int numCols = 0;
+        int shapeNumRows = 0;
+        int shapeNumCols = 0;
+        vector<string> otuNames;
+        vector<string> groupNames;
+        while (!in.eof()) {
+
+            if (m->control_pressed) { break; }
+
+            string line = m->getline(in); m->gobble(in);
+
+            string tag = getTag(line);
+
+            if (tag == "type") {
+                //check to make sure this is an OTU table
+                string type = getTag(line);
+                if (type != "OTU table") { m->mothurOut("[ERROR]: " + type + " is not a valid biom type for mothur. Only type allowed is OTU table.\n"); m->control_pressed = true; }
+            }else if (tag == "matrix_type") {
+                //get type and check type
+                matrixFormat = getTag(line);
+                if ((matrixFormat != "sparse") && (matrixFormat != "dense")) { m->mothurOut("[ERROR]: " + matrixFormat + " is not a valid biom matrix_type for mothur. Types allowed are sparse and dense.\n"); m->control_pressed = true; }
+            }else if (tag == "matrix_element_type") {
+                //get type and check type
+                string matrixElementType = getTag(line);
+                if (matrixElementType != "int") { m->mothurOut("[ERROR]: " + matrixElementType + " is not a valid matrix_element_type for mothur. Only type allowed is int.\n"); m->control_pressed = true; }
+            }else if (tag == "rows") {
+                //read otu names
+                otuNames = readRows(line, in, numRows);
+            }else if (tag == "columns") {
+                //read sample names
+                groupNames = readRows(line, in, numCols);
+
+                //if users selected groups, then remove the groups not wanted.
+                SharedUtil util;
+                vector<string> Groups = m->getGroups();
+                vector<string> allGroups = groupNames;
+                util.setGroups(Groups, allGroups);
+                m->setGroups(Groups);
+
+                //fill filehandles with necessary ofstreams
+                for (int i = 0; i < Groups.size(); i++) {
+                    ofstream* temp = new ofstream;
+                    filehandles[Groups[i]] = temp;
+                }
+
+                //set fileroot
+                fileroot = outputDir + m->getRootName(m->getSimpleName(biomfile));
+
+                //clears file before we start to write to it below
+                for (int i = 0; i < Groups.size(); i++) {
+                    string rabundName = fileroot + Groups[i] + ".rabund";
+                    m->mothurRemove(rabundName);
+                    outputNames.push_back(rabundName);
+                    outputTypes["rabund"].push_back(rabundName);
+                }
+
+            }else if (tag == "shape") {
+                getDims(line, shapeNumRows, shapeNumCols);
+
+                //check shape
+                if (shapeNumCols != numCols) {
+                    m->mothurOut("[ERROR]: shape indicates " + toString(shapeNumCols) + " columns, but I only read " + toString(numCols) + " columns.\n"); m->control_pressed = true;
+                }
+
+                if (shapeNumRows != numRows) {
+                    m->mothurOut("[ERROR]: shape indicates " + toString(shapeNumRows) + " rows, but I only read " + toString(numRows) + " rows.\n"); m->control_pressed = true;
+                }
+            }else if (tag == "data") {
+                m->currentBinLabels = otuNames;
+
+                //read data
+                vector<SharedRAbundVector*> lookup = readData(matrixFormat, line, in, groupNames, otuNames.size());
+
+                //only write output when parsing succeeded; lookup[0] is only valid when a group remains
+                if (!m->control_pressed) {
+                    if (lookup.size() != 0) {
+                        m->mothurOutEndLine(); m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                        lookup[0]->printHeaders(out);
+                        printSharedData(lookup, out);
+                    }else {
+                        m->mothurOut("[ERROR]: no groups found to build the shared file from. Make sure the columns section comes before the data section and that your groups are in the biom file.\n"); m->control_pressed = true;
+                    }
+                }
+
+                //free the vectors allocated by readData
+                //NOTE(review): assumes printSharedData does not keep or free them - confirm
+                for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+                lookup.clear();
+            }
+        }
+        in.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "createSharedFromBiom");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Writes one parsed data row (nums) into lookup.
+//Returns "" on success, otherwise the error message to report; nothing is stored on error.
+//  dense rows hold one abundance per group for OTU otuCount; otuCount is advanced after a store.
+//  any other format is treated as sparse: rows hold [otuNum, sampleNum, abundance].
+//Row width and indices are checked before they are used to index nums, lookup or groupNames.
+static string storeBiomRow(const string& matrixFormat, const vector<int>& nums, vector<SharedRAbundVector*>& lookup, vector<string>& groupNames, int numOTUs, int& otuCount) {
+    if (matrixFormat == "dense") {
+        //sanity check - the row must have one value per group and fit in the numOTUs bins
+        if ((nums.size() != lookup.size()) || (otuCount >= numOTUs)) {
+            return "[ERROR]: trouble parsing OTU data.  OTU " + toString(otuCount) + " causing errors.\n";
+        }
+
+        //set abundances for this otu
+        //nums contains [abundSample0, abundSample1, abundSample2, ...] for current OTU
+        for (int j = 0; j < lookup.size(); j++) { lookup[j]->set(otuCount, nums[j], groupNames[j]); }
+
+        otuCount++;
+        return "";
+    }
+
+    //sanity check
+    if (nums.size() != 3) { return "[ERROR]: trouble parsing OTU data.\n"; }
+
+    //nums contains [otuNum, sampleNum, abundance]
+    if ((nums[0] < 0) || (nums[0] >= numOTUs)) { return "[ERROR]: trouble parsing OTU data.\n"; }
+    if ((nums[1] < 0) || (nums[1] >= (int)lookup.size())) { return "[ERROR]: trouble parsing OTU data.\n"; }
+
+    lookup[nums[1]]->set(nums[0], nums[2], groupNames[nums[1]]);
+    return "";
+}
+//**********************************************************************************************************************
+//Parses the biom "data" matrix into one SharedRAbundVector per entry of groupNames.
+//  matrixFormat - "dense" or anything else, which is handled as sparse
+//  line         - text of the data line that has already been read from the file
+//  in           - the biom file; parsing continues from it character by character after line
+//  groupNames   - sample names; vector i gets group groupNames[i]
+//  numOTUs      - number of bins each vector is created with
+//Every vector is labelled "dummy". Vectors whose group is not in m->getGroups() are deleted,
+//and if any were deleted eliminateZeroOTUS is run on the rest.
+//The returned vectors are allocated with new and are not freed here.
+//On a parse error a message is printed and m->control_pressed is set; the vectors built so
+//far are then returned without the group filtering.
+vector<SharedRAbundVector*> SharedCommand::readData(string matrixFormat, string line, ifstream& in, vector<string>& groupNames, int numOTUs) {
+    try {
+
+        vector<SharedRAbundVector*> lookup;
+
+        //creates new sharedRAbunds
+        for (int i = 0; i < groupNames.size(); i++) {
+            SharedRAbundVector* temp = new SharedRAbundVector(numOTUs); //sets all abunds to 0
+            temp->setLabel("dummy");
+            temp->setGroup(groupNames[i]);
+            lookup.push_back(temp);
+        }
+
+        bool dataStart = false;
+        bool inBrackets = false;
+        string num = "";
+        vector<int> nums;
+        int otuCount = 0;
+        for (int i = 0; i < line.length(); i++) {
+
+            if (m->control_pressed) { return lookup; }
+
+            //look for opening [ to indicate data is starting
+            if ((line[i] == '[') && (!dataStart)) { dataStart = true; i++;  if (!(i < line.length())) { break; } }
+            else if ((line[i] == ']') && dataStart && (!inBrackets)) { break; } //we are done reading data
+
+            if (dataStart) {
+                if ((line[i] == '[') && (!inBrackets)) { inBrackets = true; i++;  if (!(i < line.length())) { break; } }
+                else if ((line[i] == ']') && (inBrackets)) {
+                    inBrackets = false;
+                    int temp = 0;
+                    m->mothurConvert(num, temp);
+                    nums.push_back(temp);
+                    num = "";
+
+                    //save info to vectors
+                    string error = storeBiomRow(matrixFormat, nums, lookup, groupNames, numOTUs, otuCount);
+                    if (error != "") { m->mothurOut(error); m->control_pressed = true; }
+                    nums.clear();
+                }
+
+                if (inBrackets) {
+                    if (line[i] == ',') {
+                        int temp = 0;
+                        m->mothurConvert(num, temp);
+                        nums.push_back(temp);
+                        num = "";
+                    }else { if (!isspace((unsigned char)line[i])) { num += line[i]; }  } //cast keeps isspace defined for chars above 127
+                }
+            }
+        }
+
+        //same as above just reading from file.
+        while (!in.eof()) {
+
+            char c = in.get(); m->gobble(in);
+
+            if (m->control_pressed) { return lookup; }
+
+            //look for opening [ to indicate data is starting
+            if ((c == '[') && (!dataStart)) { dataStart = true; c = in.get();  if (in.eof()) { break; } }
+            else if ((c == ']') && dataStart && (!inBrackets)) { break; } //we are done reading data
+
+            if (dataStart) {
+                if ((c == '[') && (!inBrackets)) { inBrackets = true; c = in.get();  if (in.eof()) { break; }  }
+                else if ((c == ']') && (inBrackets)) {
+                    inBrackets = false;
+                    int temp = 0;
+                    m->mothurConvert(num, temp);
+                    nums.push_back(temp);
+                    num = "";
+
+                    //save info to vectors
+                    string error = storeBiomRow(matrixFormat, nums, lookup, groupNames, numOTUs, otuCount);
+                    if (error != "") { m->mothurOut(error); m->control_pressed = true; }
+                    nums.clear();
+                }
+
+                if (inBrackets) {
+                    if (c == ',') {
+                        int temp = 0;
+                        m->mothurConvert(num, temp);
+                        nums.push_back(temp);
+                        num = "";
+                    }else { if (!isspace((unsigned char)c)) { num += c; }  } //cast keeps isspace defined for chars above 127
+                }
+            }
+        }
+
+        SharedUtil util;
+
+        bool remove = false;
+        for (int i = 0; i < lookup.size(); i++) {
+            //if this sharedrabund is not from a group the user wants then delete it.
+            if (util.isValidGroup(lookup[i]->getGroup(), m->getGroups()) == false) {
+                remove = true;
+                delete lookup[i]; lookup[i] = NULL;
+                lookup.erase(lookup.begin()+i);
+                i--;
+            }
+        }
+
+        if (remove) { eliminateZeroOTUS(lookup); }
+
+        return lookup;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "readData");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Rebuilds thislookup so that it only contains the bins in which at least one of the vectors
+//has a non-zero abundance, and sets m->currentBinLabels to the labels of the bins that are kept.
+//A kept bin i is labelled m->currentBinLabels[i] when that entry exists; otherwise it is
+//labelled "Otu" followed by i+1 padded with zeros to the width of the bin count.
+//The old vectors are deleted and replaced by newly allocated ones.
+//Returns 0. An empty thislookup is returned untouched, and if m->control_pressed is set while
+//filtering the new vectors are discarded and thislookup is left as it was.
+int SharedCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
+    try {
+
+        //nothing to filter, and thislookup[0] below would not exist
+        if (thislookup.size() == 0) { return 0; }
+
+        vector<SharedRAbundVector*> newLookup;
+        for (int i = 0; i < thislookup.size(); i++) {
+            SharedRAbundVector* temp = new SharedRAbundVector();
+            temp->setLabel(thislookup[i]->getLabel());
+            temp->setGroup(thislookup[i]->getGroup());
+            newLookup.push_back(temp);
+        }
+
+        //for each bin
+        vector<string> newBinLabels;
+        int numBins = thislookup[0]->getNumBins();
+        string snumBins = toString(numBins);
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
+
+            //look at each sharedRabund and make sure they are not all zero
+            bool allZero = true;
+            for (int j = 0; j < thislookup.size(); j++) {
+                if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
+            }
+
+            //bins that are zero in every vector are dropped
+            if (allZero) { continue; }
+
+            for (int j = 0; j < thislookup.size(); j++) {
+                newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
+            }
+
+            //if there is a bin label use it otherwise make one
+            string binLabel = "Otu";
+            string sbinNumber = toString(i+1);
+            if (sbinNumber.length() < snumBins.length()) {
+                int diff = snumBins.length() - sbinNumber.length();
+                for (int h = 0; h < diff; h++) { binLabel += "0"; }
+            }
+            binLabel += sbinNumber;
+            if (i < m->currentBinLabels.size()) {  binLabel = m->currentBinLabels[i]; }
+
+            newBinLabels.push_back(binLabel);
+        }
+
+        for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
+
+        thislookup = newLookup;
+        m->currentBinLabels = newBinLabels;
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "eliminateZeroOTUS");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+int SharedCommand::getDims(string line, int& shapeNumRows, int& shapeNumCols) {
+       try {
+        //get shape
+        bool inBar = false;
+        string num = "";
+        
+        for (int i = 0; i < line.length(); i++) {
+            
+            //you want to ignore any ; until you reach the next '
+            if ((line[i] == '[') && (!inBar)) {  inBar = true; i++;  if (!(i < line.length())) { break; } } 
+            else if ((line[i] == ']') && (inBar)) {  
+                inBar= false;  
+                m->mothurConvert(num, shapeNumCols);
+                break;
+            } 
+            
+            if (inBar) {  
+                if (line[i] == ',') {
+                    m->mothurConvert(num, shapeNumRows);
+                    num = "";
+                }else { if (!isspace(line[i])) { num += line[i]; }  }
+            }
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SharedCommand", "getDims");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<string> SharedCommand::readRows(string line, ifstream& in, int& numRows) { //returns the "id" of each {...} entry of the array; numRows is incremented once per entry and never reset here
+       try {
+        /*"rows":[
+         {"id":"Otu01", "metadata":{"taxonomy":["Bacteria", "Bacteroidetes", "Bacteroidia", "Bacteroidales", "Porphyromonadaceae", "unclassified"], "bootstrap":[100, 100, 100, 100, 100, 100]}},
+         {"id":"Otu02", "metadata":{"taxonomy":["Bacteria", "Bacteroidetes", "Bacteroidia", "Bacteroidales", "Rikenellaceae", "Alistipes"], "bootstrap":[100, 100, 100, 100, 100, 100]}},
+         ...
+         ],*/
+        vector<string> names;
+        int countOpenBrace = 0; //number of '[' seen so far
+        int countClosedBrace = 0; //number of ']' seen so far
+        int openParen = 0; //number of '{' seen in the current entry
+        int closeParen = 0; //number of '}' seen in the current entry
+        string nextRow = ""; //text of the current entry without any of the [ ] { } characters
+        bool end = false; //set once every '[' has been matched by a ']'
+        
+        for (int i = 0; i < line.length(); i++) { //first scan what was already read into line
+            
+            if (m->control_pressed) { return names; }
+            
+            if (line[i] == '[')         { countOpenBrace++;     }
+            else if (line[i] == ']')    { countClosedBrace++;   }
+            else if (line[i] == '{')    { openParen++;          }
+            else if (line[i] == '}')    { closeParen++;         }
+            else if (openParen != 0)    { nextRow += line[i];   }  //you are reading the row info
+            
+            //you have reached the end of the rows info
+            if ((countOpenBrace == countClosedBrace) && (countClosedBrace != 0)) { end = true; break; }
+            if ((openParen == closeParen) && (closeParen != 0)) { //process row, its outermost '{' has just been closed
+                numRows++;
+                vector<string> items;
+                m->splitAtChar(nextRow, items, ','); //parse by comma, will return junk for metadata but we aren't using that anyway
+                string part = items[0]; items.clear(); //first comma separated piece, expected to be "id":"name"
+                m->splitAtChar(part, items, ':'); //split part we want containing the ids
+                string name = items[1]; //NOTE(review): no size check, assumes the piece always contains a ':' - confirm
+                
+                //remove "" if needed
+                int pos = name.find("\""); //NOTE(review): find's size_t result is stored in an int before the npos comparison
+                if (pos != string::npos) {
+                    string newName = "";
+                    for (int k = 0; k < name.length(); k++) {
+                        if (name[k] != '\"') { newName += name[k]; }
+                    }
+                    name = newName;
+                }
+                names.push_back(name);
+                nextRow = "";
+                openParen = 0; //start counting braces afresh for the next entry
+                closeParen = 0;
+            }
+        }
+        
+        //keep reading from the stream when line ran out before the array was closed
+        if (!end) {
+            while (!in.eof()) {
+                
+                if (m->control_pressed) { break; }
+                
+                char c = in.get(); m->gobble(in); //NOTE(review): gobble runs after every character, presumably skipping whitespace - confirm
+                
+                if (c == '[')               { countOpenBrace++;     }
+                else if (c == ']')          { countClosedBrace++;   }
+                else if (c == '{')          { openParen++;          }
+                else if (c == '}')          { closeParen++;         }
+                else if (openParen != 0)    { nextRow += c;         }  //you are reading the row info
+                
+                
+                //you have reached the end of the rows info
+                if ((countOpenBrace == countClosedBrace) && (countClosedBrace != 0)) { end = true; break; }
+                if ((openParen == closeParen) && (closeParen != 0)) { //process row, same steps as in the loop over line above
+                    numRows++;
+                    vector<string> items;
+                    m->splitAtChar(nextRow, items, ','); //parse by comma, will return junk for metadata but we aren't using that anyway
+                    string part = items[0]; items.clear();
+                    m->splitAtChar(part, items, ':'); //split part we want containing the ids
+                    string name = items[1];
+                    
+                    //remove "" if needed
+                    int pos = name.find("\"");
+                    if (pos != string::npos) {
+                        string newName = "";
+                        for (int k = 0; k < name.length(); k++) {
+                            if (name[k] != '\"') { newName += name[k]; }
+                        }
+                        name = newName;
+                    }
+                    names.push_back(name);
+                    nextRow = "";
+                    openParen = 0;
+                    closeParen = 0;
+                }  
+            }
+        }
+        
+        return names;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SharedCommand", "readRows");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+//designed for things like "type": "OTU table", returns map type -> OTU table
+string SharedCommand::getTag(string& line) {
+       try {
+        bool inQuotes = false;
+        string tag = "";
+        char c = '\"';
+        
+        for (int i = 0; i < line.length(); i++) {
+        
+            //you want to ignore any ; until you reach the next '
+                       if ((line[i] == c) && (!inQuotes)) {  inQuotes = true;  } 
+                       else if ((line[i] == c) && (inQuotes)) {  
+                inQuotes= false;  
+                line = line.substr(i+1);
+                return tag;
+            } 
+            
+                       if (inQuotes) {  if (line[i] != c) { tag += line[i]; }  }
+        }
+        
+        return tag;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SharedCommand", "getInfo");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int SharedCommand::createSharedFromListGroup(string filename) {
+       try {
+        ofstream out;
+        m->openOutputFile(filename, out);
+        
+        GroupMap* groupMap = new GroupMap(groupfile);
+        
+        int groupError = groupMap->readMap();
+        if (groupError == 1) { delete groupMap; return 0; }
+        vector<string> allGroups = groupMap->getNamesOfGroups();
+        m->setAllGroups(allGroups);
+        
+        pickedGroups = false;
+        
+        //if hte user has not specified any groups then use them all
+        if (Groups.size() == 0) {
+            Groups = groupMap->getNamesOfGroups(); m->setGroups(Groups);
+        }else { pickedGroups = true; }
+        
+        //fill filehandles with neccessary ofstreams
+        int i;
+        ofstream* temp;
+        for (i=0; i<Groups.size(); i++) {
+            temp = new ofstream;
+            filehandles[Groups[i]] = temp;
+        }
+        
+        //set fileroot
+        fileroot = outputDir + m->getRootName(m->getSimpleName(listfile));
+        
+        //clears file before we start to write to it below
+        for (int i=0; i<Groups.size(); i++) {
+            m->mothurRemove((fileroot + Groups[i] + ".rabund"));
+            outputNames.push_back((fileroot + Groups[i] + ".rabund"));
+            outputTypes["rabund"].push_back((fileroot + Groups[i] + ".rabund"));
+        }
+        
+        string errorOff = "no error";
+        
+        //if user provided an order file containing the order the shared file should be in read it
+        //if (ordergroupfile != "") { readOrderFile(); }
+        
+        InputData input(listfile, "shared");
+        SharedListVector* SharedList = input.getSharedListVector();
+        string lastLabel = SharedList->getLabel();
+        vector<SharedRAbundVector*> lookup; 
+        
+        if (m->control_pressed) { 
+            delete SharedList; delete groupMap; 
+            for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+            out.close(); m->mothurRemove(filename); 
+            for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));                }
+            return 0; 
+        }
+        
+        //sanity check
+        vector<string> groupMapNamesSeqs = groupMap->getNamesSeqs();
+        int error = ListGroupSameSeqs(groupMapNamesSeqs, SharedList);
+        
+        if ((!pickedGroups) && (SharedList->getNumSeqs() != groupMap->getNumSeqs())) {  //if the user has not specified any groups and their files don't match exit with error
+            m->mothurOut("Your group file contains " + toString(groupMap->getNumSeqs()) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); m->mothurOutEndLine(); 
+            
+            out.close();
+            m->mothurRemove(filename); //remove blank shared file you made
+            
+            createMisMatchFile(SharedList, groupMap);
+            
+            //delete memory
+            for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+                delete it3->second;
+            }
+            
+            delete SharedList; delete groupMap; 
+            
+            return 0; 
+        }
+        
+        if (error == 1) { m->control_pressed = true; }
+        
+        //if user has specified groups make new groupfile for them
+        if (pickedGroups) { //make new group file
+            string groups = "";
+            if (m->getNumGroups() < 4) {
+                for (int i = 0; i < m->getNumGroups(); i++) {
+                    groups += (m->getGroups())[i] + ".";
+                }
+            }else { groups = "merge"; }
+            
+            string newGroupFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + groups + "groups";
+            outputTypes["group"].push_back(newGroupFile); 
+            outputNames.push_back(newGroupFile);
+            ofstream outGroups;
+            m->openOutputFile(newGroupFile, outGroups);
+            
+            vector<string> names = groupMap->getNamesSeqs();
+            string groupName;
+            for (int i = 0; i < names.size(); i++) {
+                groupName = groupMap->getGroup(names[i]);
+                if (isValidGroup(groupName, m->getGroups())) {
+                    outGroups << names[i] << '\t' << groupName << endl;
+                }
+            }
+            outGroups.close();
+        }
+        
+        //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+        set<string> processedLabels;
+        set<string> userLabels = labels;       
+        
+        while((SharedList != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+            if (m->control_pressed) { 
+                delete SharedList; delete groupMap;
+                for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+                out.close(); m->mothurRemove(filename); 
+                for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));            }
+                return 0; 
+            }
+            
+            if(allLines == 1 || labels.count(SharedList->getLabel()) == 1){
+                
+                lookup = SharedList->getSharedRAbundVector();
+                
+                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                if (pickedGroups) { //check for otus with no seqs in them
+                    eliminateZeroOTUS(lookup);
+                }
+                
+                if (m->control_pressed) { 
+                    delete SharedList; delete groupMap; 
+                    for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                    for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+                    out.close(); m->mothurRemove(filename); 
+                    for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));                }
+                    return 0; 
+                }
+                
+                if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
+                printSharedData(lookup, out); //prints info to the .shared file
+                for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                
+                processedLabels.insert(SharedList->getLabel());
+                userLabels.erase(SharedList->getLabel());
+            }
+            
+            if ((m->anyLabelsToProcess(SharedList->getLabel(), userLabels, errorOff) == true) && (processedLabels.count(lastLabel) != 1)) {
+                string saveLabel = SharedList->getLabel();
+                
+                delete SharedList;
+                SharedList = input.getSharedListVector(lastLabel); //get new list vector to process
+                
+                lookup = SharedList->getSharedRAbundVector();
+                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                if (pickedGroups) { //check for otus with no seqs in them
+                    eliminateZeroOTUS(lookup);
+                }
+                
+                
+                if (m->control_pressed) { 
+                    delete SharedList; delete groupMap; 
+                    for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                    for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+                    out.close(); m->mothurRemove(filename); 
+                    for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));                }
+                    return 0; 
+                }
+                
+                if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
+                printSharedData(lookup, out); //prints info to the .shared file
+                for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                
+                processedLabels.insert(SharedList->getLabel());
+                userLabels.erase(SharedList->getLabel());
+                
+                //restore real lastlabel to save below
+                SharedList->setLabel(saveLabel);
+            }
+            
+            
+            lastLabel = SharedList->getLabel();
+            
+            delete SharedList;
+            SharedList = input.getSharedListVector(); //get new list vector to process
+        }
+        
+        //output error messages about any remaining user labels
+        set<string>::iterator it;
+        bool needToRun = false;
+        for (it = userLabels.begin(); it != userLabels.end(); it++) {  
+            if (processedLabels.count(lastLabel) != 1) {
+                needToRun = true;
+            }
+        }
+        
+        //run last label if you need to
+        if (needToRun == true)  {
+            if (SharedList != NULL) {  delete SharedList;      }
+            SharedList = input.getSharedListVector(lastLabel); //get new list vector to process
+            
+            lookup = SharedList->getSharedRAbundVector();
+            m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+            if (pickedGroups) { //check for otus with no seqs in them
+                eliminateZeroOTUS(lookup);
+            }
+            
+            if (m->control_pressed) { 
+                delete groupMap;
+                for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;   }
+                out.close(); m->mothurRemove(filename); 
+                for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));            }
+                return 0; 
+            }
+            
+            if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
+            printSharedData(lookup, out); //prints info to the .shared file
+            for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+            delete SharedList;
+        }
+        
+        out.close();
+        
+        for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+            delete it3->second;
+        }
+        
+        delete groupMap;
+               
+        if (m->control_pressed) { 
+            m->mothurRemove(filename); 
+            for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));                }
+            return 0; 
+        }
+
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SharedCommand", "createSharedFromListGroup");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+void SharedCommand::printSharedData(vector<SharedRAbundVector*> thislookup, ofstream& out) {
         try {
                 
                 if (order.size() == 0) { //user has not specified an order so do aplabetically
@@ -517,50 +1034,7 @@ void SharedCommand::printSharedData(vector<SharedRAbundVector*> thislookup) {
         }
  }
  //**********************************************************************************************************************
-int SharedCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
-       try {
-               
-               vector<SharedRAbundVector*> newLookup;
-               for (int i = 0; i < thislookup.size(); i++) {
-                       SharedRAbundVector* temp = new SharedRAbundVector();
-                       temp->setLabel(thislookup[i]->getLabel());
-                       temp->setGroup(thislookup[i]->getGroup());
-                       newLookup.push_back(temp);
-               }
-               
-               //for each bin
-               for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
-                       if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
-               
-                       //look at each sharedRabund and make sure they are not all zero
-                       bool allZero = true;
-                       for (int j = 0; j < thislookup.size(); j++) {
-                               if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
-                       }
-                       
-                       //if they are not all zero add this bin
-                       if (!allZero) {
-                               for (int j = 0; j < thislookup.size(); j++) {
-                                       newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
-                               }
-                               //if there is a bin label use it otherwise make one
-                       }
-                       //else{  cout << "bin # " << i << " is all zeros" << endl;  }
-               }
-       
-               for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
-               thislookup = newLookup;
-               
-               return 0;
- 
-       }
-       catch(exception& e) {
-               m->errorOut(e, "SharedCommand", "eliminateZeroOTUS");
-               exit(1);
-       }
-}
-//**********************************************************************************************************************
-int SharedCommand::createMisMatchFile() {
+int SharedCommand::createMisMatchFile(SharedListVector* SharedList, GroupMap* groupMap) {
         try {
                 ofstream outMisMatch;
                 string outputMisMatchName = outputDir + m->getRootName(m->getSimpleName(listfile));
@@ -658,12 +1132,9 @@ int SharedCommand::createMisMatchFile() {
         }
  }
  //**********************************************************************************************************************
-int SharedCommand::ListGroupSameSeqs() {
+int SharedCommand::ListGroupSameSeqs(vector<string>& groupMapsSeqs, SharedListVector* SharedList) {
         try {
-               
                 int error = 0; 
-               
-               vector<string> groupMapsSeqs = groupMap->getNamesSeqs();
                 
                 set<string> groupNamesSeqs;
                 for(int i = 0; i < groupMapsSeqs.size(); i++) {
diff --git a/sharedcommand.h b/sharedcommand.h

index 861632ce7d6a8026c14b463387b9d8dbb320522d..5d0daa2fdc484f55b78613541e1e82ef547a7d09 100644 (file)
--- a/sharedcommand.h
+++ b/sharedcommand.h
@@ -38,20 +38,22 @@ public:
         void help() { m->mothurOut(getHelpString()); }  
         
  private:
-       void printSharedData(vector<SharedRAbundVector*>);
-       int createMisMatchFile();
+       void printSharedData(vector<SharedRAbundVector*>, ofstream&);
+       int createMisMatchFile(SharedListVector*, GroupMap*);
         int readOrderFile();
         bool isValidGroup(string, vector<string>);
         int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
-       int ListGroupSameSeqs();
+       int ListGroupSameSeqs(vector<string>&, SharedListVector*);
+    int createSharedFromListGroup(string);
+    int createSharedFromBiom(string);
+    string getTag(string&);
+    vector<string> readRows(string, ifstream&, int&); 
+    int getDims(string, int&, int&);
+    vector<SharedRAbundVector*> readData(string, string, ifstream&, vector<string>&, int);
         
-       SharedListVector* SharedList;
-       InputData* input;
-       GroupMap* groupMap;
         vector<string> Groups, outputNames, order;
         set<string> labels;
-       ofstream out;
-       string filename, fileroot, outputDir, listfile, groupfile, ordergroupfile;
+       string fileroot, outputDir, listfile, groupfile, biomfile, ordergroupfile;
         bool firsttime, pickedGroups, abort, allLines;
         map<string, ofstream*> filehandles;
         map<string, ofstream*>::iterator it3;
diff --git a/sharedrabundfloatvector.cpp b/sharedrabundfloatvector.cpp

index 9f6c15687edc77d0fbd77df7de286cbe98b6339e..b6d916a1900edc643291df4432728d3619a711a6 100644 (file)
--- a/sharedrabundfloatvector.cpp
+++ b/sharedrabundfloatvector.cpp
@@ -48,7 +48,7 @@ SharedRAbundFloatVector::SharedRAbundFloatVector(ifstream& f) : DataVector(), ma
                 //are we at the beginning of the file??
                 if (m->saveNextLabel == "") {  
                         f >> label; 
-                       
+            
                         //is this a shared file that has headers
                         if (label == "label") { 
                                 //gets "group"
@@ -67,20 +67,40 @@ SharedRAbundFloatVector::SharedRAbundFloatVector(ifstream& f) : DataVector(), ma
                                         if (m->control_pressed) { break; }
                                         string temp;
                                         iStringStream >> temp;  m->gobble(iStringStream);
-                                       
+                    
                                         m->binLabelsInFile.push_back(temp);
                                 }
                                 
-                               f >> label;
-                       }
-               }else { label = m->saveNextLabel; }
+                               f >> label >> groupN >> num;
+                       }else {
+                //read in first row since you know there is at least 1 group.
+                f >> groupN >> num;
+                
+                //make binlabels because we don't have any
+                string snumBins = toString(num);
+                m->binLabelsInFile.clear();
+                for (int i = 0; i < num; i++) {  
+                    //if there is a bin label use it otherwise make one
+                    string binLabel = "Otu";
+                    string sbinNumber = toString(i+1);
+                    if (sbinNumber.length() < snumBins.length()) { 
+                        int diff = snumBins.length() - sbinNumber.length();
+                        for (int h = 0; h < diff; h++) { binLabel += "0"; }
+                    }
+                    binLabel += sbinNumber;
+                    m->binLabelsInFile.push_back(binLabel);
+                }
+            }
+               }else { 
+            label = m->saveNextLabel; 
+            
+            //read in first row since you know there is at least 1 group.
+            f >> groupN >> num;
+        }
                 
                 //reset labels, currentLabels may have gotten changed as otus were eliminated because of group choices or sampling
                 m->currentBinLabels = m->binLabelsInFile;
                 
-               //read in first row since you know there is at least 1 group.
-               f >> groupN >> num;
-
                 holdLabel = label;
                 
                 //add new vector to lookup
diff --git a/sharedrabundvector.cpp b/sharedrabundvector.cpp

index 70b09603be0f4cd85081f9f1b530c5ec0b61040e..3901650cef965dc7a3c0126ecb54a8fb425e434d 100644 (file)
--- a/sharedrabundvector.cpp
+++ b/sharedrabundvector.cpp
@@ -95,16 +95,36 @@ SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0),
                                         m->binLabelsInFile.push_back(temp);
                                 }
                                 
-                               f >> label;
-                       }
-               }else { label = m->saveNextLabel; }
+                               f >> label >> groupN >> num;
+                       }else {
+                //read in first row since you know there is at least 1 group.
+                f >> groupN >> num;
+                
+                //make binlabels because we don't have any
+                string snumBins = toString(num);
+                m->binLabelsInFile.clear();
+                for (int i = 0; i < num; i++) {  
+                    //if there is a bin label use it otherwise make one
+                    string binLabel = "Otu";
+                    string sbinNumber = toString(i+1);
+                    if (sbinNumber.length() < snumBins.length()) { 
+                        int diff = snumBins.length() - sbinNumber.length();
+                        for (int h = 0; h < diff; h++) { binLabel += "0"; }
+                    }
+                    binLabel += sbinNumber;
+                    m->binLabelsInFile.push_back(binLabel);
+                }
+            }
+               }else { 
+            label = m->saveNextLabel; 
+            
+            //read in first row since you know there is at least 1 group.
+            f >> groupN >> num;
+        }
                 
                 //reset labels, currentLabels may have gotten changed as otus were eliminated because of group choices or sampling
                 m->currentBinLabels = m->binLabelsInFile;
                 
-               //read in first row since you know there is at least 1 group.
-               f >> groupN >> num;
-               
                 holdLabel = label;
                 
                 //add new vector to lookup
diff --git a/shhhercommand.cpp b/shhhercommand.cpp

index d8f538b56109e2faa290cb060c314186e8be318a..2214c5ea8b4b5f124393067ff2d59dcc3acf931c 100644 (file)
--- a/shhhercommand.cpp
+++ b/shhhercommand.cpp
@@ -128,8 +128,10 @@ ShhherCommand::ShhherCommand(string option) {
                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
                                 }
                         }
-                       
-                       
+            
+            //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
+            
                         //check for required parameters
                         flowFileName = validParameter.validFile(parameters, "flow", true);
                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
@@ -146,7 +148,9 @@ ShhherCommand::ShhherCommand(string option) {
                         }
                         else{
                                 ofstream temp;
-
+                
+                string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
+                
                                 //flow.files = 9 character offset
                                 compositeFASTAFileName = outputDir + flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
                                 m->openOutputFile(compositeFASTAFileName, temp);
@@ -214,17 +218,10 @@ ShhherCommand::ShhherCommand(string option) {
                  if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
              }
              else{
+                outputDir += m->hasPath(flowFileName);
                  flowFileVector.push_back(flowFileName);
              }
-
-                       
-                       //if the user changes the output directory command factory will send this info to us in the output parameter 
-                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
-                               outputDir = ""; 
-                               outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
-                       }
-                       
-                       
+               
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
                         string temp;
diff --git a/subsample.cpp b/subsample.cpp

index e6dd845adf9b64bec6c932f4cf72de6d8b9fe909..b1e78a44a0a2e5b5cd31e7c38e27710603ee1578 100644 (file)
--- a/subsample.cpp
+++ b/subsample.cpp
@@ -8,6 +8,114 @@
  
  #include "subsample.h"
  
+//**********************************************************************************************************************
+Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size) {
+    try {
+        Tree* newTree = NULL;
+        
+        vector<string> subsampledSeqs = getSample(tmap, size);
+        map<string, string> sampledNameMap = deconvolute(whole, subsampledSeqs); 
+        
+        //remove seqs not in sample from treemap
+        for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
+            //is that name in the subsample?
+            int count = 0;
+            for (int j = 0; j < subsampledSeqs.size(); j++) {
+                if (tmap->namesOfSeqs[i] == subsampledSeqs[j]) { break; } //found it
+                count++;
+            }
+
+            if (m->control_pressed) { return newTree; }
+            
+            //if you didn't find it, remove it 
+            if (count == subsampledSeqs.size()) { 
+                tmap->removeSeq(tmap->namesOfSeqs[i]);
+                i--; //need this because removeSeq removes name from namesOfSeqs
+            }
+        }
+        
+        //create new tree
+        int numUniques = sampledNameMap.size();
+        if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); }
+        
+        newTree = new Tree(numUniques, tmap); //numNodes, treemap
+        newTree->getSubTree(T, subsampledSeqs, sampledNameMap);
+        
+        return newTree;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "getSample-Tree");
+        exit(1);
+    }
+}      
+//**********************************************************************************************************************
+//assumes whole maps dupName -> uniqueName
+map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
+    try {
+        map<string, string> nameMap;
+        
+        //whole will be empty if user gave no name file, so we don't need to make a new one
+        if (whole.size() == 0) { return nameMap; }
+        
+        vector<string> newWanted;
+        for (int i = 0; i < wanted.size(); i++) {
+            
+            if (m->control_pressed) { break; }
+            
+            string dupName = wanted[i];
+            
+            map<string, string>::iterator itWhole = whole.find(dupName);
+            if (itWhole != whole.end()) {
+                string repName = itWhole->second;
+                
+                //do we already have this rep?
+                map<string, string>::iterator itName = nameMap.find(repName);
+                if (itName != nameMap.end()) { //add this seqs to dups list
+                    (itName->second) += "," + dupName;
+                }else { //first sighting of this seq
+                    nameMap[repName] = dupName;
+                    newWanted.push_back(repName);
+                }
+            }else { m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true; }
+        }
+        
+        wanted = newWanted;
+        return nameMap;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SubSample", "deconvolute");
+               exit(1);
+       }
+}      
+//**********************************************************************************************************************
+vector<string> SubSample::getSample(TreeMap* tMap, int size) {
+    try {
+        vector<string> sample;
+        
+        vector<string> Groups = tMap->getNamesOfGroups();    
+        for (int i = 0; i < Groups.size(); i++) {
+            
+            if (m->control_pressed) { break; }
+            
+            vector<string> thisGroup; thisGroup.push_back(Groups[i]);
+            vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
+            int thisSize = thisGroupsSeqs.size();
+            
+            if (thisSize >= size) {    
+                
+                random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
+                
+                for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); }
+            }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
+        } 
+        
+        return sample;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SubSample", "getSample-TreeMap");
+               exit(1);
+       }
+}      
  //**********************************************************************************************************************
  vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
         try {
@@ -64,7 +172,7 @@ vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int
                 
         }
         catch(exception& e) {
-               m->errorOut(e, "SubSample", "getSample");
+               m->errorOut(e, "SubSample", "getSample-shared");
                 exit(1);
         }
  }      
diff --git a/subsample.h b/subsample.h

index 09c7dcdf0901f8a229ee3c89633ca1186d03a7ef..aaf52447b026127b27141bb8794a77a80f60c7f4 100644 (file)
--- a/subsample.h
+++ b/subsample.h
@@ -11,6 +11,8 @@
  
  #include "mothurout.h"
  #include "sharedrabundvector.h"
+#include "treemap.h"
+#include "tree.h"
  
  //subsampling overwrites the sharedRabunds.  If you need to reuse the original use the getSamplePreserve function.
  
@@ -22,12 +24,17 @@ class SubSample {
          ~SubSample() {}
      
          vector<string> getSample(vector<SharedRAbundVector*>&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first.
-    
+        
+        Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
      
      private:
      
          MothurOut* m;
          int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
+    
+        vector<string> getSample(TreeMap*, int); //returns vector containing the names of the seqs in the subsample, the requested number of seqs drawn at random from each group. 
+        map<string, string> deconvolute(map<string, string> wholeSet, vector<string>& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap.
+
  
  };
  
diff --git a/tree.cpp b/tree.cpp

index d9d71aea2dd1859d6e198302325fb51d2fdf9367..d9b4a9c1e8f4e31475aa5127adb38d339d1a8631 100644 (file)
--- a/tree.cpp
+++ b/tree.cpp
@@ -16,7 +16,7 @@ Tree::Tree(int num, TreeMap* t) : tmap(t) {
                 
                 numLeaves = num;  
                 numNodes = 2*numLeaves - 1;
-               
+        
                 tree.resize(numNodes);
         }
         catch(exception& e) {
@@ -28,9 +28,6 @@ Tree::Tree(int num, TreeMap* t) : tmap(t) {
  Tree::Tree(string g) { //do not use tree generated by this its just to extract the treenames, its a chicken before the egg thing that needs to be revisited.
         try {
                 m = MothurOut::getInstance();
-               
-               tmap = NULL;
-               
                 parseTreeFile();  m->runParse = false;  
         }
         catch(exception& e) {
@@ -89,11 +86,122 @@ Tree::Tree(TreeMap* t) : tmap(t) {
                 exit(1);
         }
  }
-
+/*****************************************************************/
+Tree::Tree(TreeMap* t, vector< vector<double> >& sims) : tmap(t) {
+       try {
+               m = MothurOut::getInstance();
+               
+               if (m->runParse == true) {  parseTreeFile();  m->runParse = false;  }
+               numLeaves = m->Treenames.size();
+               numNodes = 2*numLeaves - 1;
+               
+               tree.resize(numNodes);
+        
+               //initialize groupNodeInfo
+               for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) {
+                       groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0);
+               }
+               
+               //initialize tree with correct number of nodes, name and group info.
+               for (int i = 0; i < numNodes; i++) {
+                       //initialize leaf nodes
+                       if (i <= (numLeaves-1)) {
+                               tree[i].setName(m->Treenames[i]);
+                               
+                               //save group info
+                               string group = tmap->getGroup(m->Treenames[i]);
+                               
+                               vector<string> tempGroups; tempGroups.push_back(group);
+                               tree[i].setGroup(tempGroups);
+                               groupNodeInfo[group].push_back(i); 
+                               
+                               //set pcount and pGroup for groupname to 1.
+                               tree[i].pcount[group] = 1;
+                               tree[i].pGroups[group] = 1;
+                               
+                               //Treemap knows name, group and index to speed up search
+                               tmap->setIndex(m->Treenames[i], i);
+                
+                //initialize non leaf nodes
+                       }else if (i > (numLeaves-1)) {
+                               tree[i].setName("");
+                               vector<string> tempGroups;
+                               tree[i].setGroup(tempGroups);
+                       }
+               }
+        
+        //build tree from matrix
+        //initialize indexes
+        map<int, int> indexes;  //maps row in simMatrix to vector index in the tree
+        for (int g = 0; g < numLeaves; g++) {  indexes[g] = g; }
+               
+               //do merges and create tree structure by setting parents and children
+               //there are numLeaves - 1 merges to do
+               for (int i = 0; i < (numLeaves - 1); i++) {
+                       float largest = -1000.0;
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       int row, column;
+                       //find largest value in sims matrix by searching lower triangle
+                       for (int j = 1; j < sims.size(); j++) {
+                               for (int k = 0; k < j; k++) {
+                                       if (sims[j][k] > largest) {  largest = sims[j][k]; row = j; column = k;  }
+                               }
+                       }
+            
+                       //set non-leaf node info and update leaves to know their parents
+                       //non-leaf
+                       tree[numLeaves + i].setChildren(indexes[row], indexes[column]);
+                       
+                       //parents
+                       tree[indexes[row]].setParent(numLeaves + i);
+                       tree[indexes[column]].setParent(numLeaves + i);
+                       
+                       //blength = distance / 2;
+                       float blength = ((1.0 - largest) / 2);
+                       
+                       //branchlengths
+                       tree[indexes[row]].setBranchLength(blength - tree[indexes[row]].getLengthToLeaves());
+                       tree[indexes[column]].setBranchLength(blength - tree[indexes[column]].getLengthToLeaves());
+                       
+                       //set your length to leaves to your child's length to leaves plus its branch length
+                       tree[numLeaves + i].setLengthToLeaves(tree[indexes[row]].getLengthToLeaves() + tree[indexes[row]].getBranchLength());
+                       
+                       
+                       //update index 
+                       indexes[row] = numLeaves+i;
+                       indexes[column] = numLeaves+i;
+                       
+                       //remove highest value that caused the merge.
+                       sims[row][column] = -1000.0;
+                       sims[column][row] = -1000.0;
+                       
+                       //merge values in simsMatrix
+                       for (int n = 0; n < sims.size(); n++)   {
+                               //row becomes merge of 2 groups
+                               sims[row][n] = (sims[row][n] + sims[column][n]) / 2;
+                               sims[n][row] = sims[row][n];
+                               //delete column
+                               sims[column][n] = -1000.0;
+                               sims[n][column] = -1000.0;
+                       }
+               }
+               
+               //adjust tree to make sure root to tip length is .5
+               int root = findRoot();
+               tree[root].setBranchLength((0.5 - tree[root].getLengthToLeaves()));
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "Tree", "Tree");
+               exit(1);
+       }
+}
  /*****************************************************************/
  Tree::~Tree() {}
  /*****************************************************************/
-void Tree::addNamesToCounts() {
+void Tree::addNamesToCounts(map<string, string> nameMap) {
         try {
                 //ex. seq1      seq2,seq3,se4
                 //              seq1 = pasture
@@ -116,12 +224,12 @@ void Tree::addNamesToCounts() {
  
                         string name = tree[i].getName();
                 
-                       map<string, string>::iterator itNames = m->names.find(name);
+                       map<string, string>::iterator itNames = nameMap.find(name);
                 
-                       if (itNames == m->names.end()) { m->mothurOut(name + " is not in your name file, please correct."); m->mothurOutEndLine(); exit(1);  }
+                       if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file, please correct."); m->mothurOutEndLine(); exit(1);  }
                         else {
                                 vector<string> dupNames;
-                               m->splitAtComma(m->names[name], dupNames);
+                               m->splitAtComma(nameMap[name], dupNames);
                                 
                                 map<string, int>::iterator itCounts;
                                 int maxPars = 1;
@@ -217,12 +325,13 @@ void Tree::setIndex(string searchName, int index) {
         }
  }
  /*****************************************************************/
-int Tree::assembleTree() {
+int Tree::assembleTree(map<string, string> nameMap) {
         try {
-               //float A = clock();
+               //save for later
+        names = nameMap;
  
                 //if user has given a names file we want to include that info in the pgroups and pcount info.
-               if(m->names.size() != 0) {  addNamesToCounts();  }
+               if(nameMap.size() != 0) {  addNamesToCounts(nameMap);  }
                 
                 //build the pGroups in non leaf nodes to be used in the parsimony calcs.
                 for (int i = numLeaves; i < numNodes; i++) {
@@ -231,8 +340,7 @@ int Tree::assembleTree() {
                         tree[i].pGroups = (mergeGroups(i));
                         tree[i].pcount = (mergeGcounts(i));
                 }
-               //float B = clock();
-               //cout << "assembleTree\t" << (B-A) / CLOCKS_PER_SEC << endl;
+               
                 return 0;
         }
         catch(exception& e) {
@@ -240,7 +348,7 @@ int Tree::assembleTree() {
                 exit(1);
         }
  }
-/*****************************************************************/
+/*****************************************************************
  int Tree::assembleTree(string n) {
         try {
                 
@@ -261,9 +369,16 @@ int Tree::assembleTree(string n) {
         }
  }
  /*****************************************************************/
-void Tree::getSubTree(Tree* copy, vector<string> Groups) {
+//assumes leaf node names are in groups and no names file - used by indicator command
+void Tree::getSubTree(Tree* Ctree, vector<string> Groups) {
         try {
-                       
+        
+        //copy Tree since we are going to destroy it
+        Tree* copy = new Tree(tmap);
+        copy->getCopy(Ctree);
+        map<string, string> empty;
+        copy->assembleTree(empty);
+        
                 //we want to select some of the leaf nodes to create the output tree
                 //go through the input Tree starting at parents of leaves
                 for (int i = 0; i < numNodes; i++) {
@@ -408,12 +523,40 @@ void Tree::getSubTree(Tree* copy, vector<string> Groups) {
                         //you found the root
                         if (copy->tree[i].getParent() == -1) { root = i; break; }
                 }
-               
+        
                 int nextSpot = numLeaves;
                 populateNewTree(copy->tree, root, nextSpot);
+        
+        delete copy;
         }
         catch(exception& e) {
-               m->errorOut(e, "Tree", "getCopy");
+               m->errorOut(e, "Tree", "getSubTree");
+               exit(1);
+       }
+}
+/*****************************************************************/
+//assumes nameMap contains unique names as key or is empty. 
+//assumes numLeaves defined in tree constructor equals size of seqsToInclude and seqsToInclude only contains unique seqs.
+int Tree::getSubTree(Tree* copy, vector<string> seqsToInclude, map<string, string> nameMap) {
+       try {
+        
+        if (numLeaves != seqsToInclude.size()) { m->mothurOut("[ERROR]: numLeaves does not equal numUniques, cannot create subtree.\n"); m->control_pressed = true; return 0; }
+        
+        getSubTree(copy, seqsToInclude);
+        if (nameMap.size() != 0) {  addNamesToCounts(nameMap);  }
+        
+        //build the pGroups in non leaf nodes to be used in the parsimony calcs.
+               for (int i = numLeaves; i < numNodes; i++) {
+                       if (m->control_pressed) { return 1; }
+            
+                       tree[i].pGroups = (mergeGroups(i));
+                       tree[i].pcount = (mergeGcounts(i));
+               }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "Tree", "getSubTree");
                 exit(1);
         }
  }
@@ -627,7 +770,6 @@ map<string,int> Tree::mergeGcounts(int position) {
         }
  }
  /**************************************************************************************************/
-
  void Tree::randomLabels(vector<string> g) {
         try {
         
@@ -676,37 +818,7 @@ void Tree::randomLabels(vector<string> g) {
                 exit(1);
         }
  }
-/**************************************************************************************************
-
-void Tree::randomLabels(string groupA, string groupB) {
-       try {
-               int numSeqsA = globaldata->gTreemap->seqsPerGroup[groupA];
-               int numSeqsB = globaldata->gTreemap->seqsPerGroup[groupB];
-
-               vector<string> randomGroups(numSeqsA+numSeqsB, groupA);
-               for(int i=numSeqsA;i<randomGroups.size();i++){
-                       randomGroups[i] = groupB;
-               }
-               random_shuffle(randomGroups.begin(), randomGroups.end());
-                               
-               int randomCounter = 0;                          
-               for(int i=0;i<numLeaves;i++){
-                       if(tree[i].getGroup() == groupA || tree[i].getGroup() == groupB){
-                               tree[i].setGroup(randomGroups[randomCounter]);
-                               tree[i].pcount.clear();
-                               tree[i].pcount[randomGroups[randomCounter]] = 1;
-                               tree[i].pGroups.clear();
-                               tree[i].pGroups[randomGroups[randomCounter]] = 1;
-                               randomCounter++;
-                       }
-               }
-       }               
-       catch(exception& e) {
-               m->errorOut(e, "Tree", "randomLabels");
-               exit(1);
-       }
-}
-**************************************************************************************************/
+/**************************************************************************************************/
  void Tree::randomBlengths()  {
         try {
                 for(int i=numNodes-1;i>=0;i--){
@@ -725,21 +837,23 @@ void Tree::randomBlengths()  {
  /*************************************************************************************************/
  void Tree::assembleRandomUnifracTree(vector<string> g) {
         randomLabels(g);
-       assembleTree("noNameCounts");
+    map<string, string> empty;
+       assembleTree(empty);
  }
  /*************************************************************************************************/
  void Tree::assembleRandomUnifracTree(string groupA, string groupB) {
-
         vector<string> temp; temp.push_back(groupA); temp.push_back(groupB);
         randomLabels(temp);
-       assembleTree("noNameCounts");
+    map<string, string> empty;
+       assembleTree(empty);
  }
  
  /*************************************************************************************************/
  //for now it's just random topology but may become random labels as well later that why this is such a simple function now...
  void Tree::assembleRandomTree() {
         randomTopology();
-       assembleTree();
+    map<string, string> empty;
+       assembleTree(empty);
  }
  /**************************************************************************************************/
  
@@ -792,6 +906,18 @@ void Tree::print(ostream& out) {
         }
  }
  /*****************************************************************/
+void Tree::print(ostream& out, map<string, string> nameMap) {
+       try {
+               int root = findRoot();
+               printBranch(root, out, nameMap);
+               out << ";" << endl;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Tree", "print");
+               exit(1);
+       }
+}
+/*****************************************************************/
  void Tree::print(ostream& out, string mode) {
         try {
                 int root = findRoot();
@@ -844,10 +970,82 @@ int Tree::findRoot() {
         }
  }
  /*****************************************************************/
-void Tree::printBranch(int node, ostream& out, string mode) {
+void Tree::printBranch(int node, ostream& out, map<string, string> names) {
  try {
  
  // you are not a leaf
+               if (tree[node].getLChild() != -1) {
+                       out << "(";
+                       printBranch(tree[node].getLChild(), out, names);
+                       out << ",";
+                       printBranch(tree[node].getRChild(), out, names);
+                       out << ")";
+                       
+            //if there is a branch length then print it
+            if (tree[node].getBranchLength() != -1) {
+                out << ":" << tree[node].getBranchLength();
+            }
+                       
+               }else { //you are a leaf
+            map<string, string>::iterator itNames = names.find(tree[node].getName());
+            
+            string outputString = "";
+            if (itNames != names.end()) { 
+                
+                vector<string> dupNames;
+                m->splitAtComma((itNames->second), dupNames);
+                
+                if (dupNames.size() == 1) {
+                    outputString += tree[node].getName();
+                    if (tree[node].getBranchLength() != -1) {
+                        outputString += ":" + toString(tree[node].getBranchLength());
+                    }
+                }else {
+                    outputString += "(";
+                    
+                    for (int u = 0; u < dupNames.size()-1; u++) {
+                        outputString += dupNames[u];
+                        
+                        if (tree[node].getBranchLength() != -1) {
+                            outputString += ":" + toString(0.0);
+                        }
+                        outputString += ",";
+                    }
+                    
+                    outputString += dupNames[dupNames.size()-1];
+                    if (tree[node].getBranchLength() != -1) {
+                        outputString += ":" + toString(0.0);
+                    }
+                    
+                    outputString += ")";
+                    if (tree[node].getBranchLength() != -1) {
+                        outputString += ":" + toString(tree[node].getBranchLength());
+                    }
+                }
+            }else { 
+                outputString = tree[node].getName();
+                //if there is a branch length then print it
+                if (tree[node].getBranchLength() != -1) {
+                    outputString += ":" + toString(tree[node].getBranchLength());
+                }
+                
+                m->mothurOut("[ERROR]: " + tree[node].getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); 
+            }
+                
+            out << outputString;
+               }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Tree", "printBranch");
+               exit(1);
+       }
+}
+/*****************************************************************/
+void Tree::printBranch(int node, ostream& out, string mode) {
+    try {
+        
+        // you are not a leaf
                 if (tree[node].getLChild() != -1) {
                         out << "(";
                         printBranch(tree[node].getLChild(), out, mode);
@@ -872,11 +1070,6 @@ try {
                                 if (tree[node].getBranchLength() != -1) {
                                         out << ":" << tree[node].getBranchLength();
                                 }
-                       }else if (mode == "deunique") {
-                               //if there is a branch length then print it
-                               if (tree[node].getBranchLength() != -1) {
-                                       out << ":" << tree[node].getBranchLength();
-                               }
                         }
                 }else { //you are a leaf
                         string leafGroup = tmap->getGroup(tree[node].getName());
@@ -902,53 +1095,6 @@ try {
                                 if (tree[node].getBranchLength() != -1) {
                                         out << ":" << tree[node].getBranchLength();
                                 }
-                       }else if (mode == "deunique") {
-                               map<string, string>::iterator itNames = m->names.find(tree[node].getName());
-                               
-                               string outputString = "";
-                               if (itNames != m->names.end()) { 
-                                       
-                                       vector<string> dupNames;
-                                       m->splitAtComma((itNames->second), dupNames);
-                                       
-                                       if (dupNames.size() == 1) {
-                                               outputString += tree[node].getName();
-                                               if (tree[node].getBranchLength() != -1) {
-                                                       outputString += ":" + toString(tree[node].getBranchLength());
-                                               }
-                                       }else {
-                                               outputString += "(";
-                                               
-                                               for (int u = 0; u < dupNames.size()-1; u++) {
-                                                       outputString += dupNames[u];
-                                                       
-                                                       if (tree[node].getBranchLength() != -1) {
-                                                               outputString += ":" + toString(0.0);
-                                                       }
-                                                       outputString += ",";
-                                               }
-                                               
-                                               outputString += dupNames[dupNames.size()-1];
-                                               if (tree[node].getBranchLength() != -1) {
-                                                       outputString += ":" + toString(0.0);
-                                               }
-                                               
-                                               outputString += ")";
-                                               if (tree[node].getBranchLength() != -1) {
-                                                       outputString += ":" + toString(tree[node].getBranchLength());
-                                               }
-                                       }
-                               }else { 
-                                       outputString = tree[node].getName();
-                                       //if there is a branch length then print it
-                                       if (tree[node].getBranchLength() != -1) {
-                                               outputString += ":" + toString(tree[node].getBranchLength());
-                                       }
-                                       
-                                       m->mothurOut("[ERROR]: " + tree[node].getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); 
-                               }
-                                       
-                               out << outputString;
                         }
                 }
                 
diff --git a/tree.h b/tree.h

index 2d9d4f815f90bd889c10b886853ca415840060f5..0660e8a181632ae09668d1484236276e5898ed39 100644 (file)
--- a/tree.h
+++ b/tree.h
@@ -19,13 +19,18 @@ public:
         Tree(string);  //do not use tree generated by this constructor its just to extract the treenames, its a chicken before the egg thing that needs to be revisited.
         Tree(int, TreeMap*); 
         Tree(TreeMap*);         //to generate a tree from a file
+    Tree(TreeMap*, vector< vector<double> >&); //create tree from sim matrix
         ~Tree();
         
+    TreeMap* getTreeMap() { return tmap; }
         void getCopy(Tree*);  //makes tree a copy of the one passed in.
         void getSubTree(Tree*, vector<string>);  //makes tree a that contains only the names passed in.
+    int getSubTree(Tree* originalToCopy, vector<string> seqToInclude, map<string, string> nameMap);  //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. 
+    
         void assembleRandomTree();
         void assembleRandomUnifracTree(vector<string>);
         void assembleRandomUnifracTree(string, string);
+    
         void createNewickFile(string);
         int getIndex(string);
         void setIndex(string, int);
@@ -35,11 +40,11 @@ public:
         void printTree();
         void print(ostream&);
         void print(ostream&, string);
+    void print(ostream&, map<string, string>);
         int findRoot();  //return index of root node
         
         //this function takes the leaf info and populates the non leaf nodes
-       int assembleTree();     
-       int assembleTree(string);       
+       int assembleTree(map<string, string>);  
         
         vector<Node> tree;              //the first n nodes are the leaves, where n is the number of sequences.
         map< string, vector<int> > groupNodeInfo;       //maps group to indexes of leaf nodes with that group, different groups may contain same node because of names file.
@@ -50,16 +55,18 @@ private:
         ofstream out;
         string filename;
         
+    map<string, string> names;
         map<string, int>::iterator it, it2;
         map<string, int> mergeGroups(int);  //returns a map with a groupname and the number of times that group was seen in the children
         map<string,int> mergeGcounts(int);
         
-       void addNamesToCounts();
+       void addNamesToCounts(map<string, string>);
         void randomTopology();
         void randomBlengths();
         void randomLabels(vector<string>);
         //void randomLabels(string, string);
-       void printBranch(int, ostream&, string);  //recursively print out tree
+       void printBranch(int, ostream&, map<string, string>);  //recursively print out tree
+    void printBranch(int, ostream&, string);
         void parseTreeFile();   //parses through tree file to find names of nodes and number of them
                                                         //this is required in case user has sequences in the names file that are
                                                         //not included in the tree. 
diff --git a/treegroupscommand.cpp b/treegroupscommand.cpp

index 0150a7a4cd49eb490ba7a208ce4f74fb691b3e92..2542431f111f85b5bc344d93388662364045a509 100644 (file)
--- a/treegroupscommand.cpp
+++ b/treegroupscommand.cpp
@@ -27,7 +27,7 @@ vector<string> TreeGroupCommand::setParameters(){
                 CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson", "jclass-thetayc", "", "", "",true,false); parameters.push_back(pcalc);
                 
          CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
-        CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput);
+//CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
@@ -482,76 +482,15 @@ int TreeGroupCommand::execute(){
  Tree* TreeGroupCommand::createTree(vector< vector<double> >& simMatrix){
         try {
                 //create tree
-               t = new Tree(tmap);
+               t = new Tree(tmap, simMatrix);
          
-        //initialize index
-        map<int, int> index;  //maps row in simMatrix to vector index in the tree
-        for (int g = 0; g < numGroups; g++) {  index[g] = g;   }
+        if (m->control_pressed) { delete t; t = NULL; return t; }
                 
-               //do merges and create tree structure by setting parents and children
-               //there are numGroups - 1 merges to do
-               for (int i = 0; i < (numGroups - 1); i++) {
-                       float largest = -1000.0;
-                       
-                       if (m->control_pressed) { delete t; t = NULL; return t; }
-                       
-                       int row, column;
-                       //find largest value in sims matrix by searching lower triangle
-                       for (int j = 1; j < simMatrix.size(); j++) {
-                               for (int k = 0; k < j; k++) {
-                                       if (simMatrix[j][k] > largest) {  largest = simMatrix[j][k]; row = j; column = k;  }
-                               }
-                       }
+        //assemble tree
+        map<string, string> empty;
+               t->assembleTree(empty);
  
-                       //set non-leaf node info and update leaves to know their parents
-                       //non-leaf
-                       t->tree[numGroups + i].setChildren(index[row], index[column]);
-                       
-                       //parents
-                       t->tree[index[row]].setParent(numGroups + i);
-                       t->tree[index[column]].setParent(numGroups + i);
-                       
-                       //blength = distance / 2;
-                       float blength = ((1.0 - largest) / 2);
-                       
-                       //branchlengths
-                       t->tree[index[row]].setBranchLength(blength - t->tree[index[row]].getLengthToLeaves());
-                       t->tree[index[column]].setBranchLength(blength - t->tree[index[column]].getLengthToLeaves());
-                       
-                       //set your length to leaves to your childs length plus branchlength
-                       t->tree[numGroups + i].setLengthToLeaves(t->tree[index[row]].getLengthToLeaves() + t->tree[index[row]].getBranchLength());
-                       
-                       
-                       //update index 
-                       index[row] = numGroups+i;
-                       index[column] = numGroups+i;
-                       
-                       //remove highest value that caused the merge.
-                       simMatrix[row][column] = -1000.0;
-                       simMatrix[column][row] = -1000.0;
-                       
-                       //merge values in simsMatrix
-                       for (int n = 0; n < simMatrix.size(); n++)      {
-                               //row becomes merge of 2 groups
-                               simMatrix[row][n] = (simMatrix[row][n] + simMatrix[column][n]) / 2;
-                               simMatrix[n][row] = simMatrix[row][n];
-                               //delete column
-                               simMatrix[column][n] = -1000.0;
-                               simMatrix[n][column] = -1000.0;
-                       }
-               }
-               
-               //adjust tree to make sure root to tip length is .5
-               int root = t->findRoot();
-               t->tree[root].setBranchLength((0.5 - t->tree[root].getLengthToLeaves()));
-               
-               //assemble tree
-               t->assembleTree();
-               
-               if (m->control_pressed) { delete t; t = NULL; return t; }
-               
                 return t;
-       
         }
         catch(exception& e) {
                 m->errorOut(e, "TreeGroupCommand", "createTree");
@@ -1004,11 +943,13 @@ int TreeGroupCommand::process(vector<SharedRAbundVector*> thisLookup) {
                  if (m->control_pressed) { for (int k = 0; k < trees.size(); k++) { delete trees[k]; } }
                  
                  Consensus consensus;
-                Tree* conTree = consensus.getTree(trees, tmap);
+                //clear old tree names if any
+                m->Treenames.clear(); m->Treenames = m->getGroups(); //may have changed if subsample eliminated groups
+                Tree* conTree = consensus.getTree(trees);
                  
                  //create a new filename
                  string conFile = outputDir + m->getRootName(m->getSimpleName(inputfile)) + treeCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".cons.tre";                              
-                outputNames.push_back(conFile); outputTypes["tree"].push_back(outputFile); 
+                outputNames.push_back(conFile); outputTypes["tree"].push_back(conFile); 
                  ofstream outTree;
                  m->openOutputFile(conFile, outTree);
                  
diff --git a/treemap.cpp b/treemap.cpp

index 1fc5c01b796a67bd948a70147fa4dead9320ea08..c228162aab1d511af3fc5318aaf3e5eb5b39610c 100644 (file)
--- a/treemap.cpp
+++ b/treemap.cpp
@@ -19,6 +19,43 @@
  
  /************************************************************/
   TreeMap::~TreeMap(){}
+/************************************************************/
+int TreeMap::readMap(string gf) {  //reads group file gf into this map. returns 0 if ok, 1 if a duplicate sequence name was seen or control_pressed was set.
+    
+    groupFileName = gf;  //remember which file this map was read from
+       m->openInputFile(gf, fileHandle);
+    
+    string seqName, seqGroup;
+    int error = 0;
+    
+    while(fileHandle){  //loop until the stream state goes bad
+        fileHandle >> seqName;       m->gobble(fileHandle);    //read from first column
+        fileHandle >> seqGroup;                        //read from second column
+        
+        if (m->control_pressed) {  fileHandle.close();  return 1; }  //stop early, records read so far stay in the map
+        
+        setNamesOfGroups(seqGroup);  //called for every record, even if the name turns out to be a duplicate
+        
+        map<string, GroupIndex>::iterator itCheck = treemap.find(seqName);
+        if (itCheck != treemap.end()) { error = 1; m->mothurOut("[WARNING]: Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }  //duplicate: warn, keep the first record
+        else {
+            namesOfSeqs.push_back(seqName);
+            treemap[seqName].groupname = seqGroup;     //store data in map
+            
+            it2 = seqsPerGroup.find(seqGroup);
+            if (it2 == seqsPerGroup.end()) { //if it's a new group
+                seqsPerGroup[seqGroup] = 1;
+            }else {//it's a group we already have
+                seqsPerGroup[seqGroup]++;
+            }                          
+        }
+        
+        m->gobble(fileHandle);  //presumably eats trailing whitespace so the loop ends cleanly at eof - confirm
+    }
+    fileHandle.close();
+    
+    return error;
+}
  
  /************************************************************/
  int TreeMap::readMap() {
@@ -26,7 +63,7 @@ int TreeMap::readMap() {
                 int error = 0;
                 
                 while(fileHandle){
-                       fileHandle >> seqName;          //read from first column
+                       fileHandle >> seqName;           m->gobble(fileHandle); //read from first column
                         fileHandle >> seqGroup;                 //read from second column
                         
                         if (m->control_pressed) {  fileHandle.close();  return 1; }
@@ -229,6 +266,60 @@ void TreeMap::makeSim(ListVector* list) {
                 exit(1);
         }
  }
+/************************************************************/
+int TreeMap::getCopy(TreeMap& copy){  //makes this map a copy of the one passed in. always returns 0.
+       try {
+         
+        namesOfGroups = copy.getNamesOfGroups();
+               numGroups = copy.getNumGroups();
+        namesOfSeqs = copy.namesOfSeqs;
+        seqsPerGroup = copy.seqsPerGroup;
+        treemap = copy.treemap;  //note: fileHandle and groupFileName are not touched by this function
+        
+        return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TreeMap", "getCopy");
+               exit(1);
+       }
+}
+/************************************************************/
+vector<string> TreeMap::getNamesSeqs(){  //returns the name of every sequence in treemap, in the map's key order
+       try {
+        
+               vector<string> names;
+               
+        for(it = treemap.begin(); it != treemap.end(); it++){  //it is not declared here - presumably a member iterator, confirm in treemap.h
+            names.push_back(it->first);
+               }
+               
+               return names;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TreeMap", "getNamesSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+vector<string> TreeMap::getNamesSeqs(vector<string> picked){  //returns the names of the sequences whose group is in picked
+       try {
+               
+               vector<string> names;
+               
+               for(it = treemap.begin(); it != treemap.end(); it++){
+                       //if this sequence belongs to one of the groups in the picked vector, add it
+                       if (m->inUsersGroups(it->second.groupname, picked)) {
+                               names.push_back(it->first);
+                       }
+               }
+               
+               return names;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TreeMap", "getNamesSeqs");
+               exit(1);
+       }
+}
  
  /************************************************************/
  
diff --git a/treemap.h b/treemap.h

index 7ed8d04f0886367f5a17a5baeec4848e75687358..57822e02d4b8c3a57f7d0ddf26947c470bf9400b 100644 (file)
--- a/treemap.h
+++ b/treemap.h
@@ -10,7 +10,6 @@
   */
  
  #include "mothur.h"
-#include "groupmap.h"
  #include "listvector.hpp"
  
  /* This class is used by the read.tree command to build the tree container. */
@@ -20,15 +19,14 @@ struct GroupIndex {
         int             vectorIndex;
  };
  
-class GroupMap;
-class ListVector;
-
  class TreeMap {
  public:
         TreeMap() { m = MothurOut::getInstance(); }
         TreeMap(string);
         ~TreeMap();
+    
         int readMap();
+    int readMap(string);
         int getNumGroups();
         int getNumSeqs();
         void setIndex(string, int);  //sequencename, index
@@ -42,13 +40,19 @@ public:
                 sort(namesOfGroups.begin(), namesOfGroups.end());
                 return namesOfGroups;
         }
-       vector<string> namesOfSeqs;
-    map<string,int> seqsPerGroup;      //groupname, number of seqs in that group.
-       map<string, GroupIndex> treemap; //sequence name and <groupname, vector index>
-       void print(ostream&);
+    
+    void print(ostream&);
         void makeSim(vector<string>);  //takes groupmap info and fills treemap for use by tree.shared command.
         void makeSim(ListVector*);  //takes listvector info and fills treemap for use by tree.shared command.   
-       
+    vector<string> getNamesSeqs();
+       vector<string> getNamesSeqs(vector<string>); //get names of seqs belonging to a group or set of groups
+    int getCopy(TreeMap&);
+    
+    vector<string> namesOfSeqs;
+    map<string,int> seqsPerGroup;      //groupname, number of seqs in that group.
+       map<string, GroupIndex> treemap; //sequence name and <groupname, vector index>
+
+    
  private:
         vector<string> namesOfGroups;
         ifstream fileHandle;
diff --git a/treereader.cpp b/treereader.cpp

new file mode 100644 (file)

index 0000000..b385d21
--- /dev/null
+++ b/treereader.cpp
@@ -0,0 +1,158 @@
+//
+//  treereader.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 4/11/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "treereader.h"
+#include "readtree.h"
+
+/***********************************************************************/
+
+TreeReader::TreeReader(string tf) : treefile(tf)  {  //tree file only: no group file and no name file
+    try {
+        m = MothurOut::getInstance();
+        namefile = "";
+        groupfile = "";
+        readTrees();  //return value is ignored
+    }
+       catch(exception& e) {
+               m->errorOut(e, "TreeReader", "TreeReader");
+               exit(1);
+       }
+}
+/***********************************************************************/
+
+TreeReader::TreeReader(string tf, string gf) : treefile(tf),  groupfile(gf)  {  //tree file and group file, no name file
+    try {
+        m = MothurOut::getInstance();
+        namefile = "";
+        readTrees();  //return value is ignored
+    }
+       catch(exception& e) {
+               m->errorOut(e, "TreeReader", "TreeReader");
+               exit(1);
+       }
+}
+/***********************************************************************/
+TreeReader::TreeReader(string tf, string gf, string nf) : treefile(tf),  groupfile(gf), namefile(nf)  {  //tree, group and name files. gf or nf may be "" (readTrees checks for that)
+    try {
+        m = MothurOut::getInstance();
+        readTrees();  //return value is ignored
+    }
+       catch(exception& e) {
+               m->errorOut(e, "TreeReader", "TreeReader");
+               exit(1);
+       }
+}
+/***********************************************************************/
+bool TreeReader::readTrees()  {  //fills tmap, reads the name file if given, reads the trees, then drops group file names that are not in the tree. returns true if ok, 0 (false) on a failed read or control_pressed.
+    try {
+        
+        tmap = new TreeMap();
+        if (groupfile != "") {      tmap->readMap(groupfile);        }  //readMap's error return is not checked here
+               else{ //fake out by putting everyone in one group
+                       Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
+                       for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
+               }
+               
+        int numUniquesInName = 0;
+               if (namefile != "") { numUniquesInName = readNamesFile(); }  //on a repeated name readNamesFile resets namefile to "", so the checks below fall back to the tree names
+               
+               ReadTree* read = new ReadNewickTree(treefile);
+               int readOk = read->read(tmap); 
+               
+               if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine();  delete read; m->control_pressed=true; return 0; }  //failed read also sets control_pressed
+               
+               read->AssembleTrees(names);
+               trees = read->getTrees();
+               delete read;
+        
+               //make sure all files match
+               //if you provide a namefile we will use the number of names in the namefile, as long as the number of uniques matches the tree names size.
+               int numNamesInTree;
+               if (namefile != "")  {  
+                       if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }  //nameMap has one entry per name listed in the namefile's second column
+                       else {   numNamesInTree = m->Treenames.size();  }
+               }else {  numNamesInTree = m->Treenames.size();  }
+               
+               
+               //output any names that are in group file but not in tree
+               if (numNamesInTree < tmap->getNumSeqs()) {
+                       for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
+                               //is that name in the tree?
+                               int count = 0;
+                               for (int j = 0; j < m->Treenames.size(); j++) {
+                                       if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
+                                       count++;
+                               }
+                               
+                               if (m->control_pressed) { for (int i = 0; i < trees.size(); i++) { delete trees[i]; } return 0; }  //inner i shadows the loop index, harmless since we return. NOTE(review): tmap is not deleted on this path - confirm who owns it
+                               
+                               //then you did not find it so report it 
+                               if (count == m->Treenames.size()) { 
+                                       //if it is in your namefile then don't remove
+                                       map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
+                                       
+                                       if (it == nameMap.end()) {
+                                               m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
+                                               tmap->removeSeq(tmap->namesOfSeqs[i]);
+                                               i--; //need this because removeSeq removes name from namesOfSeqs
+                                       }
+                               }
+                       }
+               }
+        
+        return true;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "TreeReader", "readTrees");
+               exit(1);
+       }
+}
+/*****************************************************************/
+int TreeReader::readNamesFile() {  //fills names (unique -> list) and nameMap (each listed name -> unique) from namefile. returns the number of records read, or 1 after discarding the name file because of a repeated name.
+       try {
+               nameMap.clear();
+        names.clear();
+               int numUniquesInName = 0;
+               
+               ifstream in;
+               m->openInputFile(namefile, in);
+               
+               string first, second;
+               map<string, string>::iterator itNames;
+               
+               while(!in.eof()) {  //NOTE(review): eof is only tested before the read, a failed read inside the loop is not detected
+                       in >> first >> second; m->gobble(in);
+                       
+                       numUniquesInName++;
+                       
+                       itNames = nameMap.find(first);  //nameMap holds every name seen so far in a second column
+                       if (itNames == nameMap.end()) {  
+                               names[first] = second; 
+                               
+                               //keep every name listed in the namefile so that readTrees does not remove them as extra seqs
+                               vector<string> dupNames;
+                               m->splitAtComma(second, dupNames);
+                               
+                               for (int i = 0; i < dupNames.size(); i++) {     
+                                       nameMap[dupNames[i]] = first; 
+                                       if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }  //no group file: put the names after the first one in the faked out group. presumably the first is already there from the tree names - confirm
+                               }
+                       }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); nameMap.clear(); names.clear(); namefile = ""; return 1; }                    //namefile is reset to "" and 1 (not a count) is returned
+               }
+               in.close();
+               
+               return numUniquesInName;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TreeReader", "readNamesFile");
+               exit(1);
+       }
+}
+/***********************************************************************/
+
+
diff --git a/treereader.h b/treereader.h

new file mode 100644 (file)

index 0000000..fb9c791
--- /dev/null
+++ b/treereader.h
@@ -0,0 +1,44 @@
+#ifndef Mothur_treereader_h
+#define Mothur_treereader_h
+
+//
+//  treereader.h
+//  Mothur
+//
+//  Created by Sarah Westcott on 4/11/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "mothurout.h"
+#include "tree.h"
+
+class TreeReader {  //reads the trees in a tree file, with an optional group file and name file. all reading is done in the constructors.
+    
+public:
+    
+    TreeReader(string tf);  //tree file only
+       TreeReader(string tf, string gf);  //tree file and group file
+    TreeReader(string tf, string gf, string nf);  //tree file, group file and name file
+       ~TreeReader() {}        //NOTE(review): does not delete tmap or the trees - presumably the caller owns them, confirm
+    
+    vector<Tree*> getTrees()            { return trees;     }
+    map<string, string> getNames()      { return nameMap;   } //dups -> unique (note: returns the member called nameMap)
+    map<string, string> getNameMap()    { return names;     } //unique -> dups list (note: returns the member called names)
+    
+    
+private:
+    MothurOut* m;
+       vector<Tree*> trees;
+    TreeMap* tmap;
+    map<string, string> nameMap; //dupName -> uniqueName
+    map<string, string> names;  //uniqueName -> second column of the namefile (comma separated list of names)
+    
+    string treefile, groupfile, namefile;  //groupfile and namefile are "" when not provided
+    
+    bool readTrees();
+    int readNamesFile();
+};
+
+
+
+#endif
diff --git a/trialSwap2.cpp b/trialSwap2.cpp

index c580436b3b3d542150502d2b771f426bc4aed8a7..a0c3210c33dd49c97274b4ad555efe2a75f9fede 100644 (file)
--- a/trialSwap2.cpp
+++ b/trialSwap2.cpp
@@ -4,647 +4,7 @@
  //The sum_of_squares, havel_hakimi and calc_c_score algorithms have been adapted from I. Miklos and J. Podani. 2004. Randomization of presence-absence matrices: comments and new algorithms. Ecology 85:86-92.
  
  
-/**************************************************************************************************
-int TrialSwap2::intrand(int n){
-    try {
-        double z;
-        
-        z = (double)random() * (double)n / (double)RAND_MAX;
-        if(z>=n)
-            z=n-1;
-        if(z<0)
-            z=0;
-        return((int)floor(z));
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "intrand");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-/* completely random matrix, all column and row totals are variable, matrix size is the same
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim1(vector<vector<int> > &co_matrix){ 
-    try {
-        vector<int> randRow;
-        vector<vector<int> > tmpmatrix;
-        int nrows = co_matrix.size();
-        int ncols = co_matrix[0].size();
-        
-        //clear co_matrix
-        //     for(i=0;i<nrows;i++)
-        //     {
-        //         co_matrix.clear();
-        //     }
-        
-        //cout << "building matrix" << endl;
-        for(int i=0;i<nrows;i++){
-            if (m->control_pressed) { break; }
-            
-            for(int j=0;j<ncols;j++){
-                double randNum = rand() / double(RAND_MAX);
-                //cout << randNum << endl;
-                
-                if(randNum > 0.5) {
-                    randRow.push_back(1);
-                }else{
-                    randRow.push_back(0);
-                }
-            }
-            tmpmatrix.push_back(randRow);
-            randRow.clear();
-            //cout << endl;
-        }
-        co_matrix = tmpmatrix;
-        
-        return 0;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim1");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-/*
- *row sums fixed, columns equiprobable 
- */
-void TrialSwap2::sim2(vector<vector<int> > &co_matrix)
-{ 
-    try {
-        
-        for(int i=0;i<co_matrix.size();i++)
-        {
-            if (m->control_pressed) { break; }
-            random_shuffle( co_matrix[i].begin(), co_matrix[i].end() ); 
-        }
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim2");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-int TrialSwap2::sim2plus(vector<int> rowtotal, vector<vector<int> > &co_matrix)
-{
-    try {
-        int nrows = co_matrix.size();
-        int ncols = co_matrix[0].size();
-        double cellprob = 1.0/ncols;
-        vector<double> cellprobvec;
-        vector<int> tmprow;
-        vector<vector<int> > tmpmatrix;
-        //double randNum;
-        
-        double start = 0.0;
-        
-        for(int i=0; i<ncols; i++)
-        {
-            if (m->control_pressed) { return 0; }
-            cellprobvec.push_back(start + cellprob);
-            start = cellprobvec[i];
-        }
-        
-        for(int i=0; i<nrows; i++)
-        {
-            tmprow.assign(ncols, 0);
-            
-            while( accumulate( tmprow.begin(), tmprow.end(), 0 ) < rowtotal[i])
-            {
-                if (m->control_pressed) { return 0; }
-                double randNum = rand() / double(RAND_MAX);
-                //cout << randNum << endl;
-                if(randNum <= cellprobvec[0])
-                {
-                    tmprow[0] = 1;
-                    continue;
-                }
-                for(int j=1;j<ncols;j++)
-                {
-                    //cout << range[j] << endl;
-                    if(randNum <= cellprobvec[j] && randNum > cellprobvec[j-1] && tmprow[j] != 1)
-                    {
-                        tmprow[j] = 1;
-                    }
-                }
-            }
-            tmpmatrix.push_back(tmprow);
-            tmprow.clear();
-        }
-        co_matrix = tmpmatrix;
-        tmpmatrix.clear();
-        cellprobvec.clear();
-        
-        return 0;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim2plus");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-/*
- * same as sim2 but using initmatrix which is the initial co-occurrence matrix before transposition
- * may have to be changed depending on what matrix 'seed' is used. One way to use is to transpose
- * every null matrix before using an index and use the random matrix as a seed for the next null.
- */
-/**************************************************************************************************/
-void TrialSwap2::sim3(vector<vector<int> > &initmatrix)
-{
-    try {
-        for(int i=0;i<initmatrix.size();i++)
-        {
-            if (m->control_pressed) { break; }
-            random_shuffle( initmatrix[i].begin(), initmatrix[i].end() ); 
-        }
-        
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim3");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-/*
- *
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim4(vector<int> columntotal, vector<int> rowtotal, vector<vector<int> > &co_matrix)
-{   
-    try {
-        vector<double> colProb;
-        vector<int> tmprow;//(ncols, 7);
-        vector<vector<int> > tmpmatrix;
-        vector<double> range;
-        vector<double> randNums;
-        int ncols = columntotal.size();
-        int nrows = rowtotal.size();
-        tmprow.clear();
-        
-        double colSum = accumulate( columntotal.begin(), columntotal.end(), 0 );
-        //cout << "col sum: " << colSum << endl;
-        for(int i=0;i<ncols;i++)
-        {
-            if (m->control_pressed) { return 0; }
-            colProb.push_back(columntotal[i]/colSum);
-        }
-        
-        double start = 0.0;
-        
-        for(int i=0;i<ncols;i++)
-        {
-            if (m->control_pressed) { return 0; }
-            range.push_back(start + colProb[i]);
-            start = range[i];
-        }
-        
-        for(int i=0;i<nrows;i++)
-        {
-            tmprow.assign(ncols, 0);
-            if (m->control_pressed) { return 0; }
-            
-            while ( accumulate( tmprow.begin(), tmprow.end(), 0 ) < rowtotal[i])
-            {
-                if (m->control_pressed) { return 0; }
-                
-                double randNum = rand() / double(RAND_MAX);
-                if(randNum <= range[0])
-                {
-                    tmprow[0] = 1;
-                    continue;
-                }
-                for(int j=1;j<ncols;j++)
-                {
-                    if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
-                    {
-                        tmprow[j] = 1;
-                    }
-                    
-                }
-            }
-            tmpmatrix.push_back(tmprow);
-            tmprow.clear();
-        }
-        
-        co_matrix = tmpmatrix;
-        
-        return 0;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim4");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-/*
- * inverse of sim4, MUST BE TRANSPOSED BEFORE CO-OCCURRENCE ANALYSIS
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim5(vector<int> initcolumntotal,vector<int> initrowtotal, vector<vector<int> > &initmatrix)
-{
-    try {
-        vector<double> colProb;
-        vector<int> tmprow;//(ncols, 7);
-        vector<vector<int> > tmpmatrix;
-        vector<double> range;
-        vector<double> randNums;
-        int ncols = initcolumntotal.size();
-        int nrows = initrowtotal.size();
-        
-        tmprow.clear();
-        
-        double colSum = accumulate( initcolumntotal.begin(), initcolumntotal.end(), 0 );
-        //cout << "col sum: " << colSum << endl;
-        for(int i=0;i<ncols;i++)
-        {
-            if (m->control_pressed) { return 0; }
-            colProb.push_back(initcolumntotal[i]/colSum);
-        }
-        
-        double start = 0.0;
-        
-        for(int i=0;i<ncols;i++)
-        {
-            if (m->control_pressed) { return 0; }
-            range.push_back(start + colProb[i]);
-            start = range[i];
-        }
-        
-        for(int i=0;i<nrows;i++)
-        {
-            tmprow.assign(ncols, 0);
-            if (m->control_pressed) { return 0; }
-            
-            while ( accumulate( tmprow.begin(), tmprow.end(), 0 ) < initrowtotal[i])
-            {
-                if (m->control_pressed) { return 0; }
-                
-                double randNum = rand() / double(RAND_MAX);
-                if(randNum <= range[0])
-                {
-                    tmprow[0] = 1;
-                    continue;
-                }
-                for(int j=1;j<ncols;j++)
-                {
-                    if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
-                    {
-                        tmprow[j] = 1;
-                    }
-                    
-                }
-            }
-            tmpmatrix.push_back(tmprow);
-            tmprow.clear();
-        }
-        
-        initmatrix = tmpmatrix;
-        return 0;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim5");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-/*
- *
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim6(vector<int> columntotal, vector<vector<int> > &co_matrix)
-{
-    try {
-        vector<vector<int> > tmpmatrix;
-        vector<double> colProb;
-        vector<int> tmprow;
-        vector<double> range;
-        int ncols = columntotal.size();
-        int nrows = co_matrix.size();
-        
-        int colSum = accumulate( columntotal.begin(), columntotal.end(), 0 );
-        
-        for(int i=0;i<ncols;i++)
-        {
-            if (m->control_pressed) { return 0; }
-            colProb.push_back(columntotal[i]/double (colSum));
-        }
-        
-        double start = 0.0;
-        
-        for(int i=0;i<ncols;i++)
-        {
-            if (m->control_pressed) { return 0; }
-            range.push_back(start + colProb[i]);
-            start = range[i];
-        }
-        
-        for(int i=0;i<nrows;i++)
-        {
-            if (m->control_pressed) { return 0; }
-            tmprow.assign(ncols, 0);
-            int tmprowtotal;
-            tmprowtotal = (rand() / double (RAND_MAX)) * 10;
-            while ( tmprowtotal > ncols) {
-                if (m->control_pressed) { return 0; }
-                tmprowtotal = (rand() / double (RAND_MAX)) * 10;
-            }
-            //cout << tmprowtotal << endl;
-            //cout << accumulate( tmprow.begin(), tmprow.end(), 0 ) << endl;
-            
-            while ( accumulate( tmprow.begin(), tmprow.end(), 0 ) < tmprowtotal)
-            {
-                if (m->control_pressed) { return 0; }
-                double randNum = rand() / double(RAND_MAX);
-                //cout << randNum << endl;
-                if(randNum <= range[0])
-                {
-                    tmprow[0] = 1;
-                    continue;
-                }
-                for(int j=1;j<ncols;j++)
-                {
-                    //cout << range[j] << endl;
-                    if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
-                    {
-                        tmprow[j] = 1;
-                    }
-                    
-                }
-                
-                
-            }
-            
-            tmpmatrix.push_back(tmprow);
-            tmprow.clear();
-        }
-        
-        co_matrix = tmpmatrix;
-        tmpmatrix.clear();
-        
-        return 0;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim6");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-/*
- * MUST BE TRANSPOSED BEFORE CO-OCCURRENCE ANALYSIS
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim7(vector<int> initrowtotal, vector<vector<int> > &co_matrix)
-{
-    try {
-        vector<vector<double> > probmatrix;
-        vector<vector<int> > tmpmatrix;
-        vector<double> colProb;
-        vector<double> probrow;
-        vector<int> tmprow;
-        vector<double> range;
-        double nc;
-        int ncols = co_matrix[0].size(); int nrows = co_matrix.size(); 
-        
-        tmpmatrix.assign(nrows, vector<int>(ncols, 0.));
-        
-        int rowsum = accumulate( initrowtotal.begin(), initrowtotal.end(), 0 );
-        
-        nc = rowsum * ncols;
-        //cout << nc << endl;
-        
-        //assign null matrix based on probabilities
-        
-        double start = 0.0; // don't reset start -- probs should be from 0-1 thoughout the entire matrix 
-        
-        for(int i=0;i<nrows;i++)
-        {
-            if (m->control_pressed) { return 0; }
-            //cout << initrowtotal[i]/double(nc) << endl;
-            double cellprob = initrowtotal[i]/double(nc);
-            //cout << cellprob << endl;
-            for(int j=0;j<ncols;j++)
-            {
-                
-                probrow.push_back(start + cellprob);
-                //cout << probrow[j] << endl;
-                //cout << start << endl;
-                start = start + cellprob;
-            }
-            probmatrix.push_back(probrow);
-            probrow.clear();
-        }
-        
-        
-        //while(tmprowsum < rowsum)
-        //for(int k=0;k<rowsum;k++)
-        int k = 0;
-        while(k < rowsum)
-        {
-            if (m->control_pressed) { return 0; }
-        done:
-            //cout << k << endl;
-            //tmprowsum = accumulate( tmprowtotal.begin(), tmprowtotal.end(), 0 );
-            double randNum = rand() / double(RAND_MAX);
-            //cout << randNum << "+" << endl;
-            //special case for the first entry
-            if(randNum <= probmatrix[0][0] && tmpmatrix[0][0] != 1)
-            {
-                tmpmatrix[0][0] = 1;
-                k++;
-                //cout << k << endl;
-                continue;
-            }
-            
-            
-            for(int i=0;i<nrows;i++)
-            {
-                if (m->control_pressed) { return 0; }
-                for(int j=0;j<ncols;j++)
-                {
-                    //cout << probmatrix[i][j] << endl;
-                    if(randNum <= probmatrix[i][j] && randNum > probmatrix[i][j-1] && tmpmatrix[i][j] != 1)
-                    {
-                        tmpmatrix[i][j] = 1;
-                        k++;
-                        //cout << k << endl;
-                        goto done;
-                    }
-                    //else
-                    //k = k-1;
-                }
-                
-            }
-            
-        }
-        
-        co_matrix = tmpmatrix;
-        return 0;
-    //build probibility matrix
-    /* for(int i=0;i<nrows;i++)
-     {
-     for(int j=0;j<ncols;j++)
-     {
-     probrow.push_back(rowtotal[i]/nc);
-     }
-     probmatrix.pushback(probrow);
-     probrow.clear;
-     }
-     */
-    
-    /* int colSum = accumulate( initcolumntotal.begin(), initcolumntotal.end(), 0 );
-        
-        for(int i=0;i<ncols;i++)
-        {
-            colProb.push_back(initcolumntotal[i]/double (colSum));
-        }
-        
-        double start = 0.0;
-        
-        for(int i=0;i<ncols;i++)
-        {
-            range.push_back(start + colProb[i]);
-            start = range[i];
-        }
-        
-        for(int i=0;i<nrows;i++)
-        {
-            tmprow.assign(ncols, 0);
-            int tmprowtotal;
-            tmprowtotal = (rand() / double (RAND_MAX)) * 10;
-            while ( tmprowtotal > ncols)
-                tmprowtotal = (rand() / double (RAND_MAX)) * 10;
-            //cout << tmprowtotal << endl;
-            //cout << accumulate( tmprow.begin(), tmprow.end(), 0 ) << endl;
-            
-            while ( accumulate( tmprow.begin(), tmprow.end(), 0 ) < tmprowtotal)
-            {
-                double randNum = rand() / double(RAND_MAX);
-                //cout << randNum << endl;
-                if(randNum <= range[0])
-                {
-                    tmprow[0] = 1;
-                    continue;
-                }
-                for(int j=1;j<ncols;j++)
-                {
-                    //cout << range[j] << endl;
-                    if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
-                    {
-                        tmprow[j] = 1;
-                    }
-                }
-            }
-            
-            tmpmatrix.push_back(tmprow);
-            tmprow.clear();
-        }
-
-        initmatrix = tmpmatrix;
-     */
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim7");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-/*
- *
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim8(vector<int> columntotal, vector<int> rowtotal, vector<vector<int> > &co_matrix)
-{   
-    try {
-        double prob; 
-        double start = 0.0;
-        int ncols = columntotal.size(); int nrows = rowtotal.size(); 
-        double probarray[nrows * ncols];
-        double randnum;
-        int grandtotal; 
-        int total = 0;
-        
-        //double colSum = accumulate( columntotal.begin(), columntotal.end(), 0 );
-        double rowSum = accumulate( rowtotal.begin(), rowtotal.end(), 0 );
-        
-        if (m->control_pressed) { return 0; }
-        
-        //cout << "rowsum: " << rowSum << endl;
-        
-        grandtotal = rowSum;
-        
-        //create probability matrix with each site being between 0 and 1
-        for (int i=0;i<nrows;i++) {
-            if (m->control_pressed) { return 0; }
-            for (int j=0;j<ncols;j++) {
-                prob = (rowtotal[i] * columntotal[j])/(rowSum*rowSum);
-                if (prob == 0.0)
-                    probarray[ncols * i + j] = -1;
-                else
-                    probarray[ncols * i + j] = start + prob;
-                //probmatrixrow.pushback(start + prob);
-                start += prob;
-            }
-        }
-        //cout << "prbarray" << endl;
-        //for(int i=0;i<(nrows*ncols);i++)
-        //cout << probarray[i] << " ";
-        //cout << endl;
-        
-        //generate random muber between 0 and 1 and interate through probarray until found
-        while (total < grandtotal)  {
-            if (m->control_pressed) { return 0; }
-            randnum = rand() / double(RAND_MAX);
-            //cout << "rand num: " << randnum << endl;
-            if((randnum <= probarray[0]) && (probarray[0] != 2) ) {
-                probarray[0] = 2;
-                total++;
-                continue;
-            }
-            for(int i=1;i<(nrows*ncols);i++) {
-                if (m->control_pressed) { return 0; }
-                if((randnum <= probarray[i]) && (randnum > probarray[i-1]) && (probarray[i] != 2) ) {
-                    probarray[i] = 2;
-                    total++;
-                    break;
-                }
-                else
-                    continue;
-            }
-        }
-        //cout << "prbarray" << endl;
-        //for(int i=0;i<(nrows*ncols);i++)
-        //cout << probarray[i] << " ";
-        //cout << endl;
-        for(int i=0;i<nrows;i++) {
-            if (m->control_pressed) { return 0; }
-            for(int j=0;j<ncols;j++) {
-                if(probarray[ncols * i + j] == 2)
-                    co_matrix[i][j] = 1;
-                else
-                    co_matrix[i][j] = 0;
-            }
-        }
-        return 0;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "sim8");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-double TrialSwap2::calc_c_score (vector<vector<int> > &co_matrix,vector<int>  rowtotal)
+double TrialSwap2::calc_c_score (vector<vector<int> > &co_matrix, vector<int> rowtotal, int ncols, int nrows)
  {
      try {
          double cscore = 0.0;
@@ -652,10 +12,10 @@ double TrialSwap2::calc_c_score (vector<vector<int> > &co_matrix,vector<int>  ro
          double D;
          double normcscore = 0.0;
          int nonzeros = 0;
-        int ncols = co_matrix[0].size(); int nrows = rowtotal.size(); 
+        //int ncols = co_matrix[0].size(); int nrows = rowtotal.size();
          vector<vector<double> > s; s.resize(nrows);
          for (int i = 0; i < nrows; i++) { s[i].resize(nrows,0.0); }//only fill half the matrix
-
+        
          
          for(int i=0;i<nrows-1;i++)
          {
@@ -685,8 +45,8 @@ double TrialSwap2::calc_c_score (vector<vector<int> > &co_matrix,vector<int>  ro
                  if(maxD != 0)
                  {
                      normcscore += D/maxD;
-                    nonzeros++;    
-                }            
+                    nonzeros++;
+                }
              }
          }
          
@@ -695,18 +55,18 @@ double TrialSwap2::calc_c_score (vector<vector<int> > &co_matrix,vector<int>  ro
          
          return cscore;
      }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "calc_c_score");
-               exit(1);
-       }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "calc_c_score");
+        exit(1);
+    }
  }
  /**************************************************************************************************/
-int TrialSwap2::calc_checker (vector<vector<int> > &co_matrix, vector<int>  rowtotal)
+int TrialSwap2::calc_checker (vector<vector<int> > &co_matrix, vector<int> rowtotal, int ncols, int nrows)
  {
      try {
          int cunits=0;
          //int s[nrows][ncols];
-        int ncols = co_matrix[0].size(); int nrows = rowtotal.size(); 
+        //int ncols = co_matrix[0].size(); int nrows = rowtotal.size();
          vector<vector<int> > s; s.resize(nrows);
          for (int i = 0; i < nrows; i++) { s[i].resize(nrows,0); }//only fill half the matrix
          
@@ -735,28 +95,28 @@ int TrialSwap2::calc_checker (vector<vector<int> > &co_matrix, vector<int>  rowt
              }
          }
          
-        return cunits;   
+        return cunits;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "calc_checker");
+        exit(1);
      }
-       catch(exception& e) {
-               m->errorOut(e, "TrialSwap2", "calc_checker");
-               exit(1);
-       }
  }
  /**************************************************************************************************/
-double TrialSwap2::calc_vratio (vector<int> rowtotal, vector<int> columntotal)
+double TrialSwap2::calc_vratio (int nrows, int ncols, vector<int> rowtotal, vector<int> columntotal)
  {
      try {
-        int nrows = rowtotal.size();
-        int ncols = columntotal.size();
+        //int nrows = rowtotal.size();
+        //int ncols = columntotal.size();
          int sumCol = accumulate(columntotal.begin(), columntotal.end(), 0 );
-       // int sumRow = accumulate(rowtotal.begin(), rowtotal.end(), 0 );
+        // int sumRow = accumulate(rowtotal.begin(), rowtotal.end(), 0 );
          
          double colAvg = (double) sumCol / (double) ncols;
- //       double rowAvg = (double) sumRow / (double) nrows;
+        // double rowAvg = (double) sumRow / (double) nrows;
          
          double p = 0.0;
          
- //       double totalRowVar = 0.0;
+        // double totalRowVar = 0.0;
          double rowVar = 0.0;
          double colVar = 0.0;
          
@@ -765,7 +125,7 @@ double TrialSwap2::calc_vratio (vector<int> rowtotal, vector<int> columntotal)
              if (m->control_pressed) { return 0; }
              p = (double) rowtotal[i]/(double) ncols;
              rowVar += p * (1.0-p);
-        } 
+        }
          
          for(int i=0;i<ncols;i++)
          {
@@ -781,29 +141,44 @@ double TrialSwap2::calc_vratio (vector<int> rowtotal, vector<int> columntotal)
          m->errorOut(e, "TrialSwap2", "calc_vratio");
          exit(1);
      }
-         
+    
  }
  /**************************************************************************************************/
-int TrialSwap2::calc_combo (vector<vector<int> > &initmatrix)
+int TrialSwap2::calc_combo (int nrows, int ncols, vector<vector<int> > &nullmatrix)
  {
      try {
-        int initrows = initmatrix.size();
+        //need to transpose so we can compare rows (row-major order)
+        int tmpnrows = nrows;
+        vector<vector<int> > tmpmatrix;
+        
+        vector<int> tmprow;
+        if(!tmpmatrix.empty())
+            tmpmatrix.clear();
+        for (int i=0;i<ncols;i++)
+        {
+            for (int j=0;j<nrows;j++)
+            {
+                tmprow.push_back(nullmatrix[j][i]);
+            }
+            
+            tmpmatrix.push_back(tmprow);
+            tmprow.clear();
+        }
+        
          int unique = 0;
          int match = 0;
-        int matches = 0;
-        for(int i=0;i<initrows;i++)
+        for(int j=0;j<ncols;j++)
          {
              match = 0;
-            for(int j=i+1;j<=initrows;j++)
+            for(int i=j+1;i<=ncols;i++)
              {
-                if (m->control_pressed) { return 0; }
-                if( (initmatrix[i] == initmatrix[j])) 
+                //comparing matrix rows
+                if( (tmpmatrix[j] == tmpmatrix[i]))
                  {
                      match++;
-                    matches++;
                      break;
                  }
-            }        
+            }
              
              //on the last iteration of a previously matched row it will add itself because it doesn't match any following rows, so that combination is counted
              if (match == 0)
@@ -815,31 +190,27 @@ int TrialSwap2::calc_combo (vector<vector<int> > &initmatrix)
          m->errorOut(e, "TrialSwap2", "calc_combo");
          exit(1);
      }
-} 
+}
  /**************************************************************************************************/
-int TrialSwap2::swap_checkerboards (vector<vector<int> > &co_matrix)
+int TrialSwap2::swap_checkerboards (vector<vector<int> > &co_matrix, int ncols, int nrows)
  {
      try {
-        int ncols = co_matrix[0].size(); int nrows = co_matrix.size(); 
+        //int ncols = co_matrix[0].size(); int nrows = co_matrix.size();
          int i, j, k, l;
          i = m->getRandomIndex(nrows-1);
          while((j = m->getRandomIndex(nrows-1) ) == i ) {;if (m->control_pressed) { return 0; }}
          k = m->getRandomIndex(ncols-1);
          while((l = m->getRandomIndex(ncols-1)) == k ) {;if (m->control_pressed) { return 0; }}
-                
-        //cout << co_matrix[i][k] << " " << co_matrix[j][l] << endl;
-        //cout << co_matrix[i][l] << " " << co_matrix[j][k] << endl;
-        //cout << co_matrix[i][l] << " " << co_matrix[j][k] << endl;
-        //cout << co_matrix[i][l] << " " << co_matrix[j][k] << endl;
+        
          if((co_matrix[i][k]*co_matrix[j][l]==1 && co_matrix[i][l]+co_matrix[j][k]==0)||(co_matrix[i][k]+co_matrix[j][l]==0 && co_matrix[i][l]*co_matrix[j][k]==1)) //checking for checkerboard value and swap
          {
              co_matrix[i][k]=1-co_matrix[i][k];
              co_matrix[i][l]=1-co_matrix[i][l];
              co_matrix[j][k]=1-co_matrix[j][k];
              co_matrix[j][l]=1-co_matrix[j][l];
-            //cout << "swapped!" << endl;
+            
          }
-        //cout << "i: " << i << " j: " << j << " k: " << " l: " << l << endl;
+        
          return 0;
      }
      catch(exception& e) {
@@ -902,11 +273,11 @@ double TrialSwap2::t_test (double initialscore, int runs, double nullMean, vecto
          
          m->mothurOut("nullMean: " + toString(nullMean)); m->mothurOutEndLine();
          
-        m->mothurOut("sum: " + toString(sum));  m->mothurOutEndLine();
+        m->mothurOut("sum: " + toString(sum)); m->mothurOutEndLine();
          
          sampleSD = sqrt( (1/runs) * sum );
          
-        m->mothurOut("samplSD: " + toString(sampleSD));  m->mothurOutEndLine();
+        m->mothurOut("samplSD: " + toString(sampleSD)); m->mothurOutEndLine();
          
          t = (nullMean - initialscore) / (sampleSD / sqrt(runs));
          
@@ -921,15 +292,15 @@ double TrialSwap2::t_test (double initialscore, int runs, double nullMean, vecto
  int TrialSwap2::print_matrix(vector<vector<int> > &matrix, int nrows, int ncols)
  {
      try {
-         m->mothurOut("matrix:");  m->mothurOutEndLine();
+        m->mothurOut("matrix:"); m->mothurOutEndLine();
          
          for (int i = 0; i < nrows; i++)
          {
              if (m->control_pressed) { return 0; }
              for (int j = 0; j < ncols; j++)
              {
-                m->mothurOut(toString(matrix[i][j]));            
-            }    
+                m->mothurOut(toString(matrix[i][j]));
+            }
              m->mothurOutEndLine();
          }
          return 0;
@@ -940,83 +311,6 @@ int TrialSwap2::print_matrix(vector<vector<int> > &matrix, int nrows, int ncols)
      }
  }
  /**************************************************************************************************/
-int TrialSwap2::transpose_matrix (vector<vector<int> > &initmatrix, vector<vector<int> > &co_matrix)//, int nrows, int nocols)
-{    
-    try {
-        int ncols = initmatrix.size(); int nrows = initmatrix[0].size(); 
-        int tmpnrows = nrows;
-        //vector<vector<int> > tmpvec;
-        vector<int> tmprow;
-        if(!co_matrix.empty())
-            co_matrix.clear();
-        for (int i=0;i<nrows;i++)
-        {       
-            if (m->control_pressed) { return 0; }
-            for (int j=0;j<ncols;j++)
-            {
-                tmprow.push_back(initmatrix[j][i]);
-            }
-            /*if (accumulate( tmprow.begin(), tmprow.end(), 0 ) == 0)
-             {
-             tmpnrows--;
-             }
-             else */
-            co_matrix.push_back(tmprow);
-            tmprow.clear();
-        }
-        nrows = tmpnrows;
-        return 0;
-    }
-    catch(exception& e) {
-        m->errorOut(e, "TrialSwap2", "transpose_matrix");
-        exit(1);
-    }
-}
-/**************************************************************************************************/
-int TrialSwap2::update_row_col_totals(vector<vector<int> > &co_matrix, vector<int> &rowtotal, vector<int> &columntotal)
-{
-    try {
-        //rowtotal.clear();
-        //columntotal.clear();
-        //generate (rowtotal.begin(), rowtotal.end(), 0);
-        //generate (columntotal.begin(), columntotal.end(), 0);
-        int nrows = co_matrix.size();
-        int ncols = co_matrix[0].size();
-        vector<int> tmpcolumntotal; tmpcolumntotal.resize(ncols, 0);
-        vector<int> tmprowtotal; tmprowtotal.resize(nrows, 0);
-        
-        int rowcount = 0;
-        
-        for (int i = 0; i < nrows; i++)
-        {
-            if (m->control_pressed) { return 0; }
-            for (int j = 0; j < ncols; j++)
-            {
-                if (co_matrix[i][j] == 1)
-                {
-                    rowcount++;
-                    tmpcolumntotal[j]++;
-                }           
-            }    
-            tmprowtotal[i] = rowcount;
-            rowcount = 0;
-        }
-        columntotal = tmpcolumntotal;
-        rowtotal = tmprowtotal;
-        /*cout << "rowtotal: ";
-        for(int i = 0; i<nrows; i++) { cout << rowtotal[i]; }
-        cout << "  ";
-        cout << " coltotal: ";
-        for(int i = 0; i<ncols; i++) { cout << columntotal[i]; }
-        cout << endl;*/
-        return 0;
-    }
-    catch(exception& e) {
-        m->errorOut(e, "TrialSwap2", "update_row_col_totals");
-        exit(1);
-    }
-}
-/**************************************************************************************************/
  
  
  
diff --git a/trialswap2.h b/trialswap2.h

index 6e68e95d440c22b3d63a0040aad5f9c3d51ef6ae..924938b1a16297015a3bdb14d07db0f91fc7b5bc 100644 (file)
--- a/trialswap2.h
+++ b/trialswap2.h
@@ -16,29 +16,17 @@
  class TrialSwap2 {
      
  public:
-       TrialSwap2(){  m = MothurOut::getInstance(); };
+    TrialSwap2(){ m = MothurOut::getInstance(); };
      ~TrialSwap2(){};
      
      double calc_pvalue_lessthan (vector<double>, double);
      double calc_pvalue_greaterthan (vector<double>, double);
-    int swap_checkerboards (vector<vector<int> > &);
-    int calc_combo (vector<vector<int> > &);
-    double calc_vratio (vector<int>, vector<int>);
-    int calc_checker (vector<vector<int> > &,vector<int>);
-    double calc_c_score (vector<vector<int> > &,vector<int>);
-    
-    int sim1 (vector<vector<int> > &);
-    void sim2(vector<vector<int> >&);
-    int sim2plus(vector<int>, vector<vector<int> > &);
-    void sim3(vector<vector<int> > &);
-    int sim4(vector<int>, vector<int>, vector<vector<int> > &);
-    int sim5(vector<int>, vector<int>, vector<vector<int> > &);
-    int sim6(vector<int>, vector<vector<int> > &);
-    int sim7(vector<int>, vector<vector<int> > &);
-    int sim8(vector<int>, vector<int>, vector<vector<int> > &);
-    int transpose_matrix (vector<vector<int> > &, vector<vector<int> > &);
-    int update_row_col_totals(vector<vector<int> > &, vector<int>&, vector<int>&);
-
+    int swap_checkerboards (vector<vector<int> > &, int, int);
+    int calc_combo (int, int, vector<vector<int> > &);
+    double calc_vratio (int, int, vector<int>, vector<int>);
+    int calc_checker (vector<vector<int> > &, vector<int>, int, int);
+    double calc_c_score (vector<vector<int> > &, vector<int>, int, int);
+    
      
  private:
      MothurOut* m;
@@ -47,9 +35,8 @@ private:
      int print_matrix(vector<vector<int> > &, int, int);
      
      
-
+    
  };
-
  #endif
  
  
diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp

index f00743c546d22879f930bf25c741cdbcdd3e7fe5..9f8fafb24e14a807baa8dd5a981c93556c46e3b7 100644 (file)
--- a/trimseqscommand.cpp
+++ b/trimseqscommand.cpp
@@ -437,6 +437,17 @@ int TrimSeqsCommand::execute(){
                                         
                                         Sequence currSeq(in); m->gobble(in);
                                         out << currSeq.getName() << '\t' << it->second << endl;
+                    
+                    if (nameFile != "") {
+                        map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+                        if (itName != nameMap.end()) { 
+                            vector<string> thisSeqsNames; 
+                            m->splitAtChar(itName->second, thisSeqsNames, ',');
+                            for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+                                out << thisSeqsNames[k] << '\t' << it->second << endl;
+                            }
+                        }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }                                                  
+                    }
                                 }
                                 in.close();
                                 out.close();
@@ -1179,9 +1190,8 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
                  cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
                  if (qfilename != "") {  qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
              }
-        
-            if(qfilename == "")        {       qLines = lines; } //files with duds
          }
+            if(qfilename == "")        {       qLines = lines; } //files with duds
                         return 1;
                 
                 #endif
diff --git a/uchime_src/addtargets2.cpp b/uchime_src/addtargets2.cpp

new file mode 100644 (file)

index 0000000..f3f6377
--- /dev/null
+++ b/uchime_src/addtargets2.cpp
@@ -0,0 +1,38 @@
+#if    UCHIMES\r
+\r
+#include "myutils.h"\r
+#include "chime.h"\r
+#include "ultra.h"\r
+#include <set>\r
+\r
+const float MAX_WORD_COUNT_DROP = 1;\r
+\r
+void SortDescending(const vector<float> &Values, vector<unsigned> &Order);\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path);\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path);\r
+void USort(const SeqData &Query, const SeqDB &DB, vector<float> &WordCounts,\r
+  vector<unsigned> &Order);\r
+\r
+void AddTargets(SeqDB &DB, const SeqData &Query, set<unsigned> &TargetIndexes)\r
+       {\r
+       const unsigned SeqCount = DB.GetSeqCount();\r
+       if (SeqCount == 0)\r
+               return;\r
+\r
+       vector<float> WordCounts;\r
+       vector<unsigned> Order;\r
+       USort(Query, DB, WordCounts, Order);\r
+       asserta(SIZE(Order) == SeqCount);\r
+       unsigned TopSeqIndex = Order[0];\r
+       float TopWordCount = WordCounts[TopSeqIndex];\r
+       for (unsigned i = 0; i < SeqCount; ++i)\r
+               {\r
+               unsigned SeqIndex = Order[i];\r
+               float WordCount = WordCounts[SeqIndex];\r
+               if (TopWordCount - WordCount > MAX_WORD_COUNT_DROP)\r
+                       return;\r
+               TargetIndexes.insert(SeqIndex);\r
+               }\r
+       }\r
+\r
+#endif\r
diff --git a/uchime_src/alignchime.cpp b/uchime_src/alignchime.cpp

new file mode 100644 (file)

index 0000000..d7b05a8
--- /dev/null
+++ b/uchime_src/alignchime.cpp
@@ -0,0 +1,649 @@
+#include "myutils.h"\r
+#include "seq.h"\r
+#include "chime.h"\r
+#include "dp.h"\r
+\r
+#define TRACE          0\r
+#define TRACE_BS       0\r
+\r
+void Make3Way(const SeqData &SDQ, const SeqData &SDA, const SeqData &SDB,\r
+  const string &PathQA, const string &PathQB,\r
+  string &Q3, string &A3, string &B3);\r
+\r
+void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3,\r
+  const string &QLabel, const string &ALabel, const string &BLabel,\r
+  ChimeHit2 &Hit);\r
+\r
+// Returns Y / (opt_xn*(N + opt_dn) + opt_xa*A).\r
+// Callers in this file pass the "for", "against" and "abstain" tallies of one\r
+// side of a candidate crossover as Y, N and A respectively.\r
+double GetScore2(double Y, double N, double A)\r
+       {\r
+       const double Denom = opt_xn*(N + opt_dn) + opt_xa*A;\r
+       return Y/Denom;\r
+       }\r
+\r
+void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3,\r
+  const string &QLabel, const string &ALabel, const string &BLabel,\r
+  ChimeHit2 &Hit)\r
+       {\r
+       Hit.Clear();\r
+       Hit.QLabel = QLabel;\r
+\r
+       const byte *Q3Seq = (const byte *) Q3.c_str();\r
+       const byte *A3Seq = (const byte *) A3.c_str();\r
+       const byte *B3Seq = (const byte *) B3.c_str();\r
+\r
+       const unsigned ColCount = SIZE(Q3);\r
+       asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);\r
+\r
+#if    TRACE\r
+       Log("Q %5u %*.*s\n", ColCount, ColCount, ColCount, Q3Seq);\r
+       Log("A %5u %*.*s\n", ColCount, ColCount, ColCount, A3Seq);\r
+       Log("B %5u %*.*s\n", ColCount, ColCount, ColCount, B3Seq);\r
+#endif\r
+\r
+// Discard terminal gaps\r
+       unsigned ColLo = UINT_MAX;\r
+       unsigned ColHi = UINT_MAX;\r
+       for (unsigned Col = 2; Col + 2 < ColCount; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+\r
+               if (isacgt(q) && isacgt(a) && isacgt(b))\r
+                       {\r
+                       if (ColLo == UINT_MAX)\r
+                               ColLo = Col;\r
+                       ColHi = Col;\r
+                       }\r
+               }\r
+\r
+       if (ColLo == UINT_MAX)\r
+               return;\r
+\r
+       unsigned QPos = 0;\r
+       unsigned APos = 0;\r
+       unsigned BPos = 0;\r
+       unsigned DiffCount = 0;\r
+\r
+       vector<unsigned> ColToQPos(ColLo, UINT_MAX);\r
+       vector<unsigned> AccumCount(ColLo, UINT_MAX);\r
+       vector<unsigned> AccumSameA(ColLo, UINT_MAX);\r
+       vector<unsigned> AccumSameB(ColLo, UINT_MAX);\r
+       vector<unsigned> AccumForA(ColLo, UINT_MAX);\r
+       vector<unsigned> AccumForB(ColLo, UINT_MAX);\r
+       vector<unsigned> AccumAbstain(ColLo, UINT_MAX);\r
+       vector<unsigned> AccumAgainst(ColLo, UINT_MAX);\r
+\r
+       unsigned SumSameA = 0;\r
+       unsigned SumSameB = 0;\r
+       unsigned SumSameAB = 0;\r
+       unsigned Sum = 0;\r
+       unsigned SumForA = 0;\r
+       unsigned SumForB = 0;\r
+       unsigned SumAbstain = 0;\r
+       unsigned SumAgainst = 0;\r
+       for (unsigned Col = ColLo; Col <= ColHi; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+\r
+               if (isacgt(q) && isacgt(a) && isacgt(b))\r
+                       {\r
+                       if (q == a)\r
+                               ++SumSameA;\r
+                       if (q == b)\r
+                               ++SumSameB;\r
+                       if (a == b)\r
+                               ++SumSameAB;\r
+                       if (q == a && q != b)\r
+                               ++SumForA;\r
+                       if (q == b && q != a)\r
+                               ++SumForB;\r
+                       if (a == b && q != a)\r
+                               ++SumAgainst;\r
+                       if (q != a && q != b)\r
+                               ++SumAbstain;\r
+                       ++Sum;\r
+                       }\r
+\r
+               ColToQPos.push_back(QPos);\r
+               AccumSameA.push_back(SumSameA);\r
+               AccumSameB.push_back(SumSameB);\r
+               AccumCount.push_back(Sum);\r
+               AccumForA.push_back(SumForA);\r
+               AccumForB.push_back(SumForB);\r
+               AccumAbstain.push_back(SumAbstain);\r
+               AccumAgainst.push_back(SumAgainst);\r
+\r
+               if (q != '-')\r
+                       ++QPos;\r
+               if (a != '-')\r
+                       ++APos;\r
+               if (b != '-')\r
+                       ++BPos;\r
+               }\r
+\r
+       asserta(SIZE(ColToQPos) == ColHi+1);\r
+       asserta(SIZE(AccumSameA) == ColHi+1);\r
+       asserta(SIZE(AccumSameB) == ColHi+1);\r
+       asserta(SIZE(AccumAbstain) == ColHi+1);\r
+       asserta(SIZE(AccumAgainst) == ColHi+1);\r
+\r
+       double IdQA = double(SumSameA)/Sum;\r
+       double IdQB = double(SumSameB)/Sum;\r
+       double IdAB = double(SumSameAB)/Sum;\r
+       double MaxId = max(IdQA, IdQB);\r
+\r
+#if    TRACE\r
+       Log("IdQA=%.1f%% IdQB=%.1f%% IdAB=%.1f\n", IdQA*100.0, IdQB*100.0, IdAB*100.0);\r
+       Log("\n");\r
+       Log("    x  AQB   IdAL   IdBL   IdAR   IdBR   DivAB   DivBA    YAL    YBL    YAR    YBR    AbL    AbR  ScoreAB  ScoreAB    XLo    Xhi\n");\r
+       Log("-----  ---  -----  -----  -----  -----  ------  ------  -----  -----  -----  -----  -----  -----  -------  -------  -----  -----\n");\r
+#endif\r
+       unsigned BestXLo = UINT_MAX;\r
+       unsigned BestXHi = UINT_MAX;\r
+       double BestDiv = 0.0;\r
+       double BestIdQM = 0.0;\r
+       double BestScore = 0.0;\r
+\r
+// Find range of cols BestXLo..BestXHi that maximizes score\r
+       bool FirstA = false;\r
+\r
+// NOTE: Must be < ColHi not <= because use Col+1 below\r
+       for (unsigned Col = ColLo; Col < ColHi; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+\r
+               unsigned SameAL = AccumSameA[Col];\r
+               unsigned SameBL = AccumSameB[Col];\r
+               unsigned SameAR = SumSameA - AccumSameA[Col];\r
+               unsigned SameBR = SumSameB - AccumSameB[Col];\r
+\r
+               double IdAB = double(SameAL + SameBR)/Sum;\r
+               double IdBA = double(SameBL + SameAR)/Sum;\r
+\r
+               unsigned ForAL = AccumForA[Col];\r
+               unsigned ForBL = AccumForB[Col];\r
+               unsigned ForAR = SumForA - AccumForA[Col+1];\r
+               unsigned ForBR = SumForB - AccumForB[Col+1];\r
+               unsigned AbL = AccumAbstain[Col];\r
+               unsigned AbR = SumAbstain - AccumAbstain[Col+1];\r
+\r
+               double ScoreAB = GetScore2(ForAL, ForBL, AbL)*GetScore2(ForBR, ForAR, AbR);\r
+               double ScoreBA = GetScore2(ForBL, ForAL, AbL)*GetScore2(ForAR, ForBR, AbR);\r
+       \r
+               double DivAB = IdAB/MaxId;\r
+               double DivBA = IdBA/MaxId;\r
+               double MaxDiv = max(DivAB, DivBA);\r
+\r
+               //if (MaxDiv > BestDiv)\r
+               //      {\r
+               //      BestDiv = MaxDiv;\r
+               //      BestXLo = Col;\r
+               //      BestXHi = Col;\r
+               //      FirstA = (DivAB > DivBA);\r
+               //      if (FirstA)\r
+               //              BestIdQM = IdAB;\r
+               //      else\r
+               //              BestIdQM = IdBA;\r
+               //      }\r
+               //else if (MaxDiv == BestDiv)\r
+               //      BestXHi = Col;\r
+\r
+               double MaxScore = max(ScoreAB, ScoreBA);\r
+               if (MaxScore > BestScore)\r
+                       {\r
+                       BestScore = MaxScore;\r
+                       BestXLo = Col;\r
+                       BestXHi = Col;\r
+                       FirstA = (ScoreAB > ScoreBA);\r
+                       if (FirstA)\r
+                               BestIdQM = IdAB;\r
+                       else\r
+                               BestIdQM = IdBA;\r
+                       if (MaxDiv > BestDiv)\r
+                               BestDiv = MaxDiv;\r
+                       }\r
+               else if (MaxScore == BestScore)\r
+                       {\r
+                       BestXHi = Col;\r
+                       if (MaxDiv > BestDiv)\r
+                               BestDiv = MaxDiv;\r
+                       }\r
+\r
+#if    TRACE\r
+               {\r
+               Log("%5u", Col);\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+               Log("  %c%c%c", a, q, b);\r
+               Log("  %5u", SameAL);\r
+               Log("  %5u", SameBL);\r
+               Log("  %5u", SameAR);\r
+               Log("  %5u", SameBR);\r
+               Log("  %5.4f", DivAB);\r
+               Log("  %5.4f", DivBA);\r
+               Log("  %5u", ForAL);\r
+               Log("  %5u", ForBL);\r
+               Log("  %5u", ForAR);\r
+               Log("  %5u", ForBR);\r
+               Log("  %5u", AbL);\r
+               Log("  %5u", AbR);\r
+               Log("  %7.4f", ScoreAB);\r
+               Log("  %7.4f", ScoreBA);\r
+               if (BestXLo != UINT_MAX)\r
+                       Log("  %5u", BestXLo);\r
+               if (BestXHi != UINT_MAX)\r
+                       Log("  %5u", BestXHi);\r
+               Log("\n");\r
+               }\r
+#endif\r
+               }\r
+\r
+       if (BestXLo == UINT_MAX)\r
+               {\r
+#if    TRACE\r
+               Log("\n");\r
+               Log("No crossover found.\n");\r
+#endif\r
+               return;\r
+               }\r
+#if    TRACE\r
+       Log("BestX col %u - %u\n", BestXLo, BestXHi);\r
+#endif\r
+\r
+// Find maximum region of identity within BestXLo..BestXHi\r
+       unsigned ColXLo = (BestXLo + BestXHi)/2;\r
+       unsigned ColXHi = ColXLo;\r
+       unsigned SegLo = UINT_MAX;\r
+       unsigned SegHi = UINT_MAX;\r
+       for (unsigned Col = BestXLo; Col <= BestXHi; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+\r
+               if (q == a && q == b)\r
+                       {\r
+                       if (SegLo == UINT_MAX)\r
+                               SegLo = Col;\r
+                       SegHi = Col;\r
+                       }\r
+               else\r
+                       {\r
+                       unsigned SegLength = SegHi - SegLo + 1;\r
+                       unsigned BestSegLength = ColXHi - ColXLo + 1;\r
+                       if (SegLength > BestSegLength)\r
+                               {\r
+                               ColXLo = SegLo;\r
+                               ColXHi = SegHi;\r
+                               }\r
+                       SegLo = UINT_MAX;\r
+                       SegHi = UINT_MAX;\r
+                       }\r
+               }\r
+       unsigned SegLength = SegHi - SegLo + 1;\r
+       unsigned BestSegLength = ColXHi - ColXLo + 1;\r
+       if (SegLength > BestSegLength)\r
+               {\r
+               ColXLo = SegLo;\r
+               ColXHi = SegHi;\r
+               }\r
+\r
+       QPos = 0;\r
+       for (unsigned x = 0; x < ColCount; ++x)\r
+               {\r
+               if (x == ColXLo)\r
+                       Hit.QXLo = QPos;\r
+               else if (x == ColXHi)\r
+                       {\r
+                       Hit.QXHi = QPos;\r
+                       break;\r
+                       }\r
+               char q = Q3Seq[x];\r
+               if (q != '-')\r
+                       ++QPos;\r
+               }\r
+\r
+       Hit.ColXLo = ColXLo;\r
+       Hit.ColXHi = ColXHi;\r
+\r
+       //if (FirstA)\r
+       //      {\r
+       //      Hit.LY = AccumForA[ColXLo];\r
+       //      Hit.LN = AccumForB[ColXLo];\r
+\r
+       //      Hit.RY = SumForB - AccumForB[ColXHi];\r
+       //      Hit.RN = SumForA - AccumForA[ColXHi];\r
+       //      }\r
+       //else\r
+       //      {\r
+       //      Hit.LY = AccumForB[ColXLo];\r
+       //      Hit.LN = AccumForA[ColXLo];\r
+       //      Hit.RY = SumForA - AccumForA[ColXHi];\r
+       //      Hit.RN = SumForB - AccumForB[ColXHi];\r
+       //      }\r
+\r
+       //Hit.LA = AccumAgainst[ColXLo];\r
+       //Hit.LD = AccumAbstain[ColXLo];\r
+\r
+       //Hit.RA = SumAgainst - AccumAgainst[ColXHi];\r
+       //Hit.RD = SumAbstain - AccumAbstain[ColXHi];\r
+\r
+       Hit.PctIdAB = IdAB*100.0;\r
+       Hit.PctIdQM = BestIdQM*100.0;\r
+\r
+       Hit.Div = (BestDiv - 1.0)*100.0;\r
+\r
+       //Hit.QSD = QSD;\r
+       Hit.Q3 = Q3;\r
+       Hit.QLabel = QLabel;\r
+       if (FirstA)\r
+               {\r
+               //Hit.ASD = ASD;\r
+               //Hit.BSD = BSD;\r
+               //Hit.PathQA = PathQA;\r
+               //Hit.PathQB = PathQB;\r
+               Hit.A3 = A3;\r
+               Hit.B3 = B3;\r
+               Hit.ALabel = ALabel;\r
+               Hit.BLabel = BLabel;\r
+               Hit.PctIdQA = IdQA*100.0;\r
+               Hit.PctIdQB = IdQB*100.0;\r
+               }\r
+       else\r
+               {\r
+               Hit.A3 = B3;\r
+               Hit.B3 = A3;\r
+               Hit.ALabel = BLabel;\r
+               Hit.BLabel = ALabel;\r
+               Hit.PctIdQA = IdQB*100.0;\r
+               Hit.PctIdQB = IdQA*100.0;\r
+               }\r
+\r
+// CS SNPs\r
+       Hit.CS_LY = 0;\r
+       Hit.CS_LN = 0;\r
+       Hit.CS_RY = 0;\r
+       Hit.CS_RN = 0;\r
+       Hit.CS_LA = 0;\r
+       Hit.CS_RA = 0;\r
+\r
+       //vector<float> Cons;\r
+       //for (unsigned Col = 0; Col < ColCount; ++Col)\r
+       //      {\r
+       //      char q = Q3Seq[Col];\r
+       //      char a = A3Seq[Col];\r
+       //      char b = B3Seq[Col];\r
+       //      if (q == a && q == b && a == b)\r
+       //              {\r
+       //              Cons.push_back(1.0f);\r
+       //              continue;\r
+       //              }\r
+\r
+       //      bool gapq = isgap(q);\r
+       //      bool gapa = isgap(a);\r
+       //      bool gapb = isgap(b);\r
+\r
+       //      if (!gapq && !gapa && !gapb)\r
+       //              {\r
+       //              if (q == a || q == b || a == b)\r
+       //                      Cons.push_back(0.75);\r
+       //              else\r
+       //                      Cons.push_back(0.5);\r
+       //              }\r
+       //      else\r
+       //              {\r
+       //              if (!gapa && (a == b || a == q))\r
+       //                      Cons.push_back(0.5f);\r
+       //              else if (!gapb && b == q)\r
+       //                      Cons.push_back(0.5f);\r
+       //              else\r
+       //                      Cons.push_back(0.0f);\r
+       //              }\r
+       //      }\r
+\r
+       //float fLY = 0.0f;\r
+       //float fLN = 0.0f;\r
+       //float fLA = 0.0f;\r
+       //float fRY = 0.0f;\r
+       //float fRN = 0.0f;\r
+       //float fRA = 0.0f;\r
+       for (unsigned Col = ColLo; Col <= ColHi; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+               if (q == a && q == b && a == b)\r
+                       continue;\r
+\r
+               unsigned ngaps = 0;\r
+               if (isgap(q))\r
+                       ++ngaps;\r
+               if (isgap(a))\r
+                       ++ngaps;\r
+               if (isgap(b))\r
+                       ++ngaps;\r
+\r
+               if (opt_skipgaps)\r
+                       {\r
+                       if (ngaps == 3)\r
+                               continue;\r
+                       }\r
+               else\r
+                       {\r
+                       if (ngaps == 2)\r
+                               continue;\r
+                       }\r
+\r
+               if (!FirstA)\r
+                       swap(a, b);\r
+\r
+               //float AvgCons = (Cons[Col-2] + Cons[Col-1] + Cons[Col+1] + Cons[Col+2])/4;\r
+               //if (Col < ColXLo)\r
+               //      {\r
+               //      if (q == a && q != b)\r
+               //              fLY += AvgCons;\r
+               //      else if (q == b && q != a)\r
+               //              fLN += AvgCons;\r
+               //      else\r
+               //              fLA += AvgCons;\r
+               //      }\r
+               //else if (Col > ColXHi)\r
+               //      {\r
+               //      if (q == b && q != a)\r
+               //              fRY += AvgCons;\r
+               //      else if (q == a && q != b)\r
+               //              fRN += AvgCons;\r
+               //      else\r
+               //              fRA += AvgCons;\r
+               //      }\r
+\r
+               if (opt_skipgaps2)\r
+                       {\r
+                       if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))\r
+                               continue;\r
+                       if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))\r
+                               continue;\r
+                       }\r
+\r
+               //if (Col > 0 && isgap(Q3Seq[Col-1]))\r
+                       //continue;\r
+               //if (Col + 1 < ColCount && isgap(Q3Seq[Col+1]))\r
+               //      continue;\r
+\r
+               if (Col < ColXLo)\r
+                       {\r
+                       if (q == a && q != b)\r
+                               ++Hit.CS_LY;\r
+                       else if (q == b && q != a)\r
+                               ++Hit.CS_LN;\r
+                       else\r
+                               ++Hit.CS_LA;\r
+                       }\r
+               else if (Col > ColXHi)\r
+                       {\r
+                       if (q == b && q != a)\r
+                               ++Hit.CS_RY;\r
+                       else if (q == a && q != b)\r
+                               ++Hit.CS_RN;\r
+                       else\r
+                               ++Hit.CS_RA;\r
+                       }\r
+               }\r
+\r
+       double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);\r
+       double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);\r
+       Hit.Score = ScoreL*ScoreR;\r
+\r
+       extern bool g_UchimeDeNovo;\r
+\r
+       //if (0)//g_UchimeDeNovo)\r
+       //      {\r
+       //      double AbQ = GetAbFromLabel(QLabel.c_str());\r
+       //      double AbA = GetAbFromLabel(ALabel.c_str());\r
+       //      double AbB = GetAbFromLabel(BLabel.c_str());\r
+       //      if (AbQ > 0.0 && AbA > 0.0 && AbB > 0.0)\r
+       //              {\r
+       //              double MinAb = min(AbA, AbB);\r
+       //              double Ratio = MinAb/AbQ;\r
+       //              double t = Ratio - opt_abx;\r
+       //      //      double Factor = 2.0/(1.0 + exp(-t));\r
+       //              double Factor = min(Ratio, opt_abx)/opt_abx;\r
+       //              if (opt_verbose)\r
+       //                      Log("Score %.4f Ab factor %.4f >%s\n", Hit.Score, Factor, QLabel.c_str());\r
+       //              Hit.Score *= Factor;\r
+       //              }\r
+       //      }\r
+\r
+       extern FILE *g_fUChimeAlns;\r
+       if (g_fUChimeAlns != 0 && Hit.Div > 0.0)\r
+               {\r
+               void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit);\r
+               WriteChimeHitX(g_fUChimeAlns, Hit);\r
+               }\r
+       }\r
+\r
+void AlignChime3(const string &Q3, const string &A3, const string &B3,\r
+  const string &QLabel, const string &ALabel, const string &BLabel,\r
+  ChimeHit2 &Hit)\r
+       {\r
+       if (opt_ucl)\r
+               AlignChimeLocal3(Q3, A3, B3, QLabel, ALabel, BLabel, Hit);\r
+       else\r
+               AlignChimeGlobal3(Q3, A3, B3, QLabel, ALabel, BLabel, Hit);\r
+       }\r
+\r
+static void StripGaps(const byte *Seq, unsigned L, string &s)\r
+       {\r
+       s.clear();\r
+       for (unsigned i = 0; i < L; ++i)\r
+               {\r
+               char c = Seq[i];\r
+               if (!isgap(c))\r
+                       s.push_back(c);\r
+               }\r
+       }\r
+\r
+static void StripGapsAlloc(const SeqData &SDIn, SeqData &SDOut)\r
+       {\r
+       SDOut = SDIn;\r
+       byte *s = myalloc(byte, SDIn.L);\r
+       unsigned k = 0;\r
+       for (unsigned i = 0; i < SDIn.L; ++i)\r
+               {\r
+               char c = SDIn.Seq[i];\r
+               if (!isgap(c))\r
+                       s[k++] = toupper(c);\r
+               }\r
+       SDOut.Seq = s;\r
+       SDOut.L = k;\r
+       }\r
+\r
+void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
+  const string &PathQA, const string &PathQB, ChimeHit2 &Hit)\r
+       {\r
+       //if (opt_ucl)\r
+       //      {\r
+       //      AlignChimeLocal(QSD, ASD, BSD, PathQA, PathQB, Hit);\r
+       //      return;\r
+       //      }\r
+\r
+       string Q3;\r
+       string A3;\r
+       string B3;\r
+       Make3Way(QSD, ASD, BSD, PathQA, PathQB, Q3, A3, B3);\r
+\r
+       AlignChime3(Q3, A3, B3, QSD.Label, ASD.Label, BSD.Label, Hit);\r
+       }\r
+\r
+void AlignChime3SDRealign(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,\r
+  ChimeHit2 &Hit)\r
+       {\r
+       SeqData QSD;\r
+       SeqData ASD;\r
+       SeqData BSD;\r
+       StripGapsAlloc(QSD3, QSD);\r
+       StripGapsAlloc(ASD3, ASD);\r
+       StripGapsAlloc(BSD3, BSD);\r
+\r
+       string PathQA;\r
+       string PathQB;\r
+       bool FoundQA = GlobalAlign(QSD, ASD, PathQA);\r
+       bool FoundQB = GlobalAlign(QSD, BSD, PathQB);\r
+       if (!FoundQA || !FoundQB)\r
+               {\r
+               Hit.Clear();\r
+               Hit.QLabel = QSD3.Label;\r
+               return;\r
+               }\r
+\r
+       AlignChime(QSD, ASD, BSD, PathQA, PathQB, Hit);\r
+\r
+       myfree((void *) QSD.Seq);\r
+       myfree((void *) ASD.Seq);\r
+       myfree((void *) BSD.Seq);\r
+       }\r
+\r
+void AlignChime3SD(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,\r
+  ChimeHit2 &Hit)\r
+       {\r
+       if (opt_realign)\r
+               {\r
+               AlignChime3SDRealign(QSD3, ASD3, BSD3, Hit);\r
+               return;\r
+               }\r
+\r
+       string Q3;\r
+       string A3;\r
+       string B3;\r
+\r
+       const unsigned ColCount = QSD3.L;\r
+       asserta(ASD3.L == ColCount && BSD3.L == ColCount);\r
+\r
+       Q3.reserve(ColCount);\r
+       A3.reserve(ColCount);\r
+       B3.reserve(ColCount);\r
+\r
+       const byte *QS = QSD3.Seq;\r
+       const byte *AS = ASD3.Seq;\r
+       const byte *BS = BSD3.Seq;\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               byte q = toupper(QS[Col]);\r
+               byte a = toupper(AS[Col]);\r
+               byte b = toupper(BS[Col]);\r
+\r
+               if (isgap(q) && isgap(a) && isgap(b))\r
+                       continue;\r
+\r
+               Q3.push_back(q);\r
+               A3.push_back(a);\r
+               B3.push_back(b);\r
+               }\r
+\r
+       AlignChime3(Q3, A3, B3, QSD3.Label, ASD3.Label, BSD3.Label, Hit);\r
+       }\r
diff --git a/uchime_src/alignchimel.cpp b/uchime_src/alignchimel.cpp

new file mode 100644 (file)

index 0000000..ae152af
--- /dev/null
+++ b/uchime_src/alignchimel.cpp
@@ -0,0 +1,417 @@
+#include "myutils.h"\r
+#include "seq.h"\r
+#include "chime.h"\r
+\r
+#define        TRACE   0\r
+\r
+/***\r
+Let:\r
+       S[i] =  Score of col i: 0=no SNP, +1 = Y, -3 = N or A.\r
+\r
+       V[k] =  Best segment score from j, j+1 .. k for all possible j\r
+                       max(j) Sum i=j..k S[i]\r
+\r
+Recursion relation:\r
+       V[k] =  S[k] + max (V[k-1], 0)\r
+***/\r
+\r
+void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3,\r
+  const string &QLabel, const string &ALabel, const string &BLabel,\r
+  ChimeHit2 &Hit);\r
+\r
+void Make3Way(const SeqData &SDQ, const SeqData &SDA, const SeqData &SDB,\r
+  const string &PathQA, const string &PathQB,\r
+  string &Q3, string &A3, string &B3);\r
+\r
+double GetScore2(double Y, double N, double A);\r
+\r
+void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3,\r
+  const string &QLabel, const string &ALabel, const string &BLabel,\r
+  ChimeHit2 &Hit)\r
+       {\r
+       Hit.Clear();\r
+\r
+       const byte *Q3Seq = (const byte *) Q3.c_str();\r
+       const byte *A3Seq = (const byte *) A3.c_str();\r
+       const byte *B3Seq = (const byte *) B3.c_str();\r
+\r
+       const unsigned ColCount = SIZE(Q3);\r
+       asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);\r
+\r
+       vector<float> ColScoresA(ColCount, 0.0f);\r
+       vector<float> ColScoresB(ColCount, 0.0f);\r
+\r
+       float ScoreN = -(float) opt_xn;\r
+       unsigned QL = 0;\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+\r
+               if (!isgap(q))\r
+                       ++QL;\r
+\r
+               if (q == a && q == b && a == b)\r
+                       continue;\r
+\r
+               if (isgap(q) || isgap(a) || isgap(b))\r
+                       continue;\r
+\r
+               if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))\r
+                       continue;\r
+\r
+               if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))\r
+                       continue;\r
+\r
+               if (q == a && q != b)\r
+                       ColScoresA[Col] = 1;\r
+               else\r
+                       ColScoresA[Col] = ScoreN;\r
+\r
+               if (q == b && q != a)\r
+                       ColScoresB[Col] = 1;\r
+               else\r
+                       ColScoresB[Col] = ScoreN;\r
+               }\r
+\r
+       vector<float> LVA(ColCount, 0.0f);\r
+       vector<float> LVB(ColCount, 0.0f);\r
+\r
+       LVA[0] = ColScoresA[0];\r
+       LVB[0] = ColScoresB[0];\r
+       for (unsigned Col = 1; Col < ColCount; ++Col)\r
+               {\r
+               LVA[Col] = max(LVA[Col-1], 0.0f) + ColScoresA[Col];\r
+               LVB[Col] = max(LVB[Col-1], 0.0f) + ColScoresB[Col];\r
+               }\r
+\r
+       vector<float> RVA(ColCount, 0.0f);\r
+       vector<float> RVB(ColCount, 0.0f);\r
+\r
+       RVA[ColCount-1] = ColScoresA[ColCount-1];\r
+       RVB[ColCount-1] = ColScoresB[ColCount-1];\r
+       for (int Col = ColCount-2; Col >= 0; --Col)\r
+               {\r
+               RVA[Col] = max(RVA[Col+1], 0.0f) + ColScoresA[Col];\r
+               RVB[Col] = max(RVB[Col+1], 0.0f) + ColScoresB[Col];\r
+               }\r
+\r
+       bool FirstA = true;\r
+       float MaxSum = 0.0;\r
+       unsigned ColX = UINT_MAX;\r
+       for (unsigned Col = 1; Col < ColCount-1; ++Col)\r
+               {\r
+               float Sum = LVA[Col] + RVB[Col+1];\r
+               if (Sum > MaxSum)\r
+                       {\r
+                       FirstA = true;\r
+                       MaxSum = Sum;\r
+                       ColX = Col;\r
+                       }\r
+               }\r
+\r
+       for (unsigned Col = 1; Col < ColCount-1; ++Col)\r
+               {\r
+               float Sum = LVB[Col] + RVA[Col+1];\r
+               if (Sum > MaxSum)\r
+                       {\r
+                       FirstA = false;\r
+                       MaxSum = Sum;\r
+                       ColX = Col;\r
+                       }\r
+               }\r
+       if (ColX == UINT_MAX)\r
+               return;\r
+\r
+       unsigned ColLo = UINT_MAX;\r
+       unsigned ColHi = UINT_MAX;\r
+       if (FirstA)\r
+               {\r
+               float Sum = 0.0f;\r
+               for (int Col = ColX; Col >= 0; --Col)\r
+                       {\r
+                       Sum += ColScoresA[Col];\r
+                       if (Sum >= LVA[ColX])\r
+                               {\r
+                               ColLo = Col;\r
+                               break;\r
+                               }\r
+                       }\r
+               asserta(Sum >= LVA[ColX]);\r
+               Sum = 0.0f;\r
+               for (unsigned Col = ColX+1; Col < ColCount; ++Col)\r
+                       {\r
+                       Sum += ColScoresB[Col];\r
+                       if (Sum >= RVB[ColX])\r
+                               {\r
+                               ColHi = Col;\r
+                               break;\r
+                               }\r
+                       }\r
+               asserta(Sum >= RVB[ColX]);\r
+               }\r
+       else\r
+               {\r
+               float Sum = 0.0f;\r
+               for (int Col = ColX; Col >= 0; --Col)\r
+                       {\r
+                       Sum += ColScoresB[Col];\r
+                       if (Sum >= LVB[ColX])\r
+                               {\r
+                               ColLo = Col;\r
+                               break;\r
+                               }\r
+                       }\r
+               asserta(Sum >= LVB[ColX]);\r
+               Sum = 0.0f;\r
+               for (unsigned Col = ColX+1; Col < ColCount; ++Col)\r
+                       {\r
+                       Sum += ColScoresA[Col];\r
+                       if (Sum >= RVA[ColX])\r
+                               {\r
+                               ColHi = Col;\r
+                               break;\r
+                               }\r
+                       }\r
+               asserta(Sum >= RVA[ColX]);\r
+               }\r
+\r
+       unsigned ColXHi = ColX;\r
+       for (unsigned Col = ColX + 1; Col < ColCount; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+               \r
+               if (q == a && q == b && !isgap(q))\r
+                       ColXHi = Col;\r
+               else\r
+                       break;\r
+               }\r
+\r
+       unsigned ColXLo = ColX;\r
+       for (int Col = (int) ColX - 1; Col >= 0; --Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+               \r
+               if (q == a && q == b && !isgap(q))\r
+                       ColXLo = Col;\r
+               else\r
+                       break;\r
+               }\r
+\r
+       unsigned IdQA = 0;\r
+       unsigned IdQB = 0;\r
+       unsigned IdAB = 0;\r
+       unsigned NQA = 0;\r
+       unsigned NQB = 0;\r
+       unsigned NAB = 0;\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+\r
+               if (!isgap(q) && !isgap(a))\r
+                       {\r
+                       ++NQA;\r
+                       if (q == a)\r
+                               ++IdQA;\r
+                       }\r
+\r
+               if (!isgap(q) && !isgap(b))\r
+                       {\r
+                       ++NQB;\r
+                       if (q == b)\r
+                               ++IdQB;\r
+                       }\r
+\r
+               if (!isgap(a) && !isgap(b))\r
+                       {\r
+                       ++NAB;\r
+                       if (a == b)\r
+                               ++IdAB;\r
+                       }\r
+               }\r
+\r
+       Hit.PctIdQA = Pct(IdQA, NQA);\r
+       Hit.PctIdQB = Pct(IdQB, NQB);\r
+       Hit.PctIdAB = Pct(IdAB, NAB);\r
+\r
+       unsigned LIdQA = 0;\r
+       unsigned LIdQB = 0;\r
+       for (unsigned Col = ColLo; Col < ColXLo; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+\r
+               if (!isgap(q) && !isgap(a))\r
+                       {\r
+                       if (q == a)\r
+                               ++LIdQA;\r
+                       }\r
+\r
+               if (!isgap(q) && !isgap(b))\r
+                       {\r
+                       if (q == b)\r
+                               ++LIdQB;\r
+                       }\r
+               }\r
+\r
+       unsigned RIdQA = 0;\r
+       unsigned RIdQB = 0;\r
+       for (unsigned Col = ColXHi+1; Col <= ColHi; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+\r
+               if (!isgap(q) && !isgap(a))\r
+                       {\r
+                       if (q == a)\r
+                               ++RIdQA;\r
+                       }\r
+\r
+               if (!isgap(q) && !isgap(b))\r
+                       {\r
+                       if (q == b)\r
+                               ++RIdQB;\r
+                       }\r
+               }\r
+\r
+       unsigned IdDiffL = max(LIdQA, LIdQB) - min(LIdQA, LIdQB);\r
+       unsigned IdDiffR = max(RIdQA, RIdQB) - min(RIdQA, RIdQB);\r
+       unsigned MinIdDiff = min(IdDiffL, IdDiffR);\r
+       unsigned ColRange = ColHi - ColLo + 1;\r
+       if (opt_queryfract > 0.0f && float(ColRange)/float(QL) < opt_queryfract)\r
+               return;\r
+\r
+//     double Div = Pct(MinIdDiff, QSD.L);\r
+\r
+#if    TRACE\r
+       {\r
+       Log("  Col  A Q B   ScoreA   ScoreB      LVA      LVB      RVA      RVB\n");\r
+       Log("-----  - - -  -------  -------  -------  -------  -------  -------\n");\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               if (ColScoresA[Col] == 0.0 && ColScoresB[Col] == 0.0)\r
+                       continue;\r
+\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+               Log("%5u  %c %c %c", Col, a, q, b);\r
+\r
+               if (ColScoresA[Col] == 0.0)\r
+                       Log("  %7.7s", "");\r
+               else\r
+                       Log("  %7.1f", ColScoresA[Col]);\r
+\r
+               if (ColScoresB[Col] == 0.0)\r
+                       Log("  %7.7s", "");\r
+               else\r
+                       Log("  %7.1f", ColScoresB[Col]);\r
+\r
+               Log("  %7.1f  %7.1f  %7.1f  %7.1f", LVA[Col], LVB[Col], RVA[Col], RVB[Col]);\r
+\r
+               Log("\n");\r
+               }\r
+       Log("\n");\r
+       Log("MaxSum %.1f, ColLo %u, ColXLo %u, ColX %u, ColXHi %u, ColHi %u, AF %c\n",\r
+         MaxSum, ColLo, ColXLo, ColX, ColXHi, ColHi, tof(FirstA));\r
+       Log("  LIdQA %u, LIdQB %u, RIdQA %u, RIdQB %u\n", LIdQA, LIdQB, RIdQA, RIdQB);\r
+       }\r
+#endif\r
+\r
+       string Q3L;\r
+       string A3L;\r
+       string B3L;\r
+       for (unsigned Col = ColLo; Col <= ColHi; ++Col)\r
+               {\r
+               char q = Q3[Col];\r
+               char a = A3[Col];\r
+               char b = B3[Col];\r
+\r
+               Q3L += q;\r
+               A3L += a;\r
+               B3L += b;\r
+               }\r
+\r
+       AlignChimeGlobal3(Q3L, A3L, B3L, QLabel, ALabel, BLabel, Hit);\r
+\r
+#if    0\r
+// CS SNPs\r
+       Hit.CS_LY = 0;\r
+       Hit.CS_LN = 0;\r
+       Hit.CS_RY = 0;\r
+       Hit.CS_RN = 0;\r
+       Hit.CS_LA = 0;\r
+       Hit.CS_RA = 0;\r
+       for (unsigned Col = ColLo; Col <= ColHi; ++Col)\r
+               {\r
+               char q = Q3Seq[Col];\r
+               char a = A3Seq[Col];\r
+               char b = B3Seq[Col];\r
+               if (q == a && q == b && a == b)\r
+                       continue;\r
+               if (isgap(q) || isgap(a) || isgap(b))\r
+                       continue;\r
+               if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))\r
+                       continue;\r
+               if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))\r
+                       continue;\r
+\r
+               if (!FirstA)\r
+                       swap(a, b);\r
+\r
+               if (Col < ColXLo)\r
+                       {\r
+                       if (q == a && q != b)\r
+                               ++Hit.CS_LY;\r
+                       else if (q == b && q != a)\r
+                               ++Hit.CS_LN;\r
+                       else\r
+                               ++Hit.CS_LA;\r
+                       }\r
+               else if (Col > ColXHi)\r
+                       {\r
+                       if (q == b && q != a)\r
+                               ++Hit.CS_RY;\r
+                       else if (q == a && q != b)\r
+                               ++Hit.CS_RN;\r
+                       else\r
+                               ++Hit.CS_RA;\r
+                       }\r
+               }\r
+\r
+       double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);\r
+       double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);\r
+       Hit.Score = ScoreL*ScoreR;\r
+\r
+       //Hit.QSD = QSD;\r
+       //if (FirstA)\r
+       //      {\r
+       //      Hit.ASD = ASD;\r
+       //      Hit.BSD = BSD;\r
+       //      Hit.PathQA = PathQA;\r
+       //      Hit.PathQB = PathQB;\r
+       //      }\r
+       //else\r
+       //      {\r
+       //      Hit.ASD = BSD;\r
+       //      Hit.BSD = ASD;\r
+       //      }\r
+\r
+       //Hit.ColLo = ColLo;\r
+       //Hit.ColXLo = ColXLo;\r
+       //Hit.ColXHi = ColXHi;\r
+       //Hit.ColHi = ColHi;\r
+       //Hit.Div = Div;\r
+\r
+//     Hit.LogMe();\r
+#endif\r
+       }\r
diff --git a/uchime_src/allocs.h b/uchime_src/allocs.h

new file mode 100644 (file)

index 0000000..157d03e
--- /dev/null
+++ b/uchime_src/allocs.h
@@ -0,0 +1,24 @@
+A(Alpha)\r
+A(Mx)\r
+A(ChainBrute)\r
+A(Chainer)\r
+A(Test)\r
+A(CompressPath)\r
+A(HSPFinder)\r
+A(Main)\r
+A(Clumps)\r
+A(Path)\r
+A(SeqDB)\r
+A(SFasta)\r
+A(SWUngapped)\r
+A(AllocBit)\r
+A(Ultra)\r
+A(UPGMA)\r
+A(Windex)\r
+A(XDropBwd)\r
+A(Xlat)\r
+A(MPath)\r
+A(ScoreCache)\r
+A(TargetHits)\r
+A(Out)\r
+A(Hashdex)\r
diff --git a/uchime_src/alnheuristics.h b/uchime_src/alnheuristics.h

new file mode 100644 (file)

index 0000000..9a8d283
--- /dev/null
+++ b/uchime_src/alnheuristics.h
@@ -0,0 +1,29 @@
+#ifndef alnheuristics_h\r
+#define alnheuristics_h\r
+\r
+struct AlnParams;\r
+\r
+struct AlnHeuristics /* Plain struct of alignment heuristic settings; presumably filled in by the constructor or the Init* members declared below (their definitions are not in this header). */\r
+       {\r
+       unsigned BandRadius;\r
+       unsigned HSPFinderWordLength;\r
+       float SeedT;\r
+\r
+       float XDropG;                   //  GappedBlast default\r
+       float XDropU;                   //  UngappedBlast default\r
+       float XDropUG;                  //  UngappedBlast called by GappedBlast\r
+\r
+       unsigned MinGlobalHSPLength;\r
+\r
+       AlnHeuristics();\r
+       void InitFromCmdLine(const AlnParams &AP);\r
+       void InitGlobalFull();\r
+\r
+       bool IsGlobalFull() const /* True exactly when MinGlobalHSPLength and BandRadius are both zero. */\r
+               {\r
+               return MinGlobalHSPLength == 0 && BandRadius == 0;\r
+               }\r
+\r
+       };\r
+\r
+#endif // alnheuristics_h\r
diff --git a/uchime_src/alnparams.cpp b/uchime_src/alnparams.cpp

new file mode 100644 (file)

index 0000000..d1b9036
--- /dev/null
+++ b/uchime_src/alnparams.cpp
@@ -0,0 +1,414 @@
+#include "myutils.h"\r
+#include <float.h>     // for FLT_MAX\r
+#include "mx.h"\r
+#include "alnparams.h"\r
+#include "hsp.h"\r
+\r
+#define TEST   0\r
+\r
+void SetBLOSUM62();
+void SetNucSubstMx(double Match, double Mismatch);\r
+void ReadSubstMx(const string &FileName, Mx<float> &Mxf);\r
+
+extern Mx<float> g_SubstMxf;
+extern float **g_SubstMx;
+\r
+void AlnParams::Clear()\r
+       {\r
+       SubstMxName = 0;\r
+       LocalOpen = OBVIOUSLY_WRONG_PENALTY;\r
+       LocalExt = OBVIOUSLY_WRONG_PENALTY;\r
+       OpenA = OBVIOUSLY_WRONG_PENALTY;\r
+       OpenB = OBVIOUSLY_WRONG_PENALTY;\r
+       ExtA = OBVIOUSLY_WRONG_PENALTY;\r
+       ExtB = OBVIOUSLY_WRONG_PENALTY;\r
+       LOpenA = OBVIOUSLY_WRONG_PENALTY;\r
+       LOpenB = OBVIOUSLY_WRONG_PENALTY;\r
+       ROpenA = OBVIOUSLY_WRONG_PENALTY;\r
+       ROpenB = OBVIOUSLY_WRONG_PENALTY;\r
+       LExtA = OBVIOUSLY_WRONG_PENALTY;\r
+       LExtB = OBVIOUSLY_WRONG_PENALTY;\r
+       RExtA = OBVIOUSLY_WRONG_PENALTY;\r
+       RExtB = OBVIOUSLY_WRONG_PENALTY;\r
+       Nucleo = false;\r
+       NucleoSet = false;\r
+       }\r
+\r
+bool AlnParams::Is2() const\r
+       {\r
+       float g = OpenA;\r
+       float e = ExtA;\r
+       if (OpenB != g || LOpenA != g || LOpenB != g || ROpenA != g || ROpenB != g)\r
+               return false;\r
+       if (ExtB != e || LExtA != e || LExtB != e || RExtA != e || RExtB != e)\r
+               return false;\r
+       return true;\r
+       }\r
+\r
+bool AlnParams::Is4() const\r
+       {\r
+       float g = OpenA;\r
+       float tg = LOpenA;\r
+       float e = ExtA;\r
+       float te = LExtA;\r
+       if (OpenB != g || LOpenA != tg || LOpenB != tg || ROpenA != tg || ROpenB != tg)\r
+               return false;\r
+       if (ExtB != e || LExtA != te || LExtB != te || RExtA != te || RExtB != te)\r
+               return false;\r
+       return true;\r
+       }\r
+\r
+const char *AlnParams::GetType() const\r
+       {\r
+       // Name the scheme by how many distinct gap scores it uses:\r
+       //   "2"  when Is2() holds,\r
+       //   "4"  when Is2() fails but Is4() holds,\r
+       //   "12" otherwise (every gap score field may differ).\r
+       return Is2() ? "2" : (Is4() ? "4" : "12");\r
+       }\r
+\r
+void AlnParams::Init2(const float * const *Mx, float Open, float Ext)\r
+       {\r
+       SubstMx = Mx;\r
+       OpenA = OpenB = LOpenA = LOpenB = ROpenA = ROpenB = Open;\r
+       ExtA = ExtB = LExtA = LExtB = RExtA = RExtB = Ext;\r
+       }\r
+\r
+void AlnParams::SetLocal(float Open, float Ext) /* Stores the local gap open and extension scores in LocalOpen and LocalExt. */\r
+       {\r
+       LocalOpen = Open;\r
+       LocalExt = Ext;\r
+       }\r
+\r
+void AlnParams::Init4(const float * const *Mx, float Open, float Ext,\r
+  float TermOpen, float TermExt) /* Sets the matrix pointer plus one internal and one terminal open/extend pair, applied to both A and B. Local scores are not touched. */\r
+       {\r
+       SubstMx = Mx;\r
+       OpenA = OpenB = Open; /* internal gap open */\r
+       LOpenA = LOpenB = ROpenA = ROpenB = TermOpen; /* left and right terminal gap open */\r
+       ExtA = ExtB = Ext; /* internal gap extension */\r
+       LExtA = LExtB = RExtA = RExtB = TermExt; /* left and right terminal gap extension */\r
+       }\r
+\r
+void AlnParams::Init(const AlnParams &AP, const HSPData &HSP,\r
+  unsigned LA, unsigned LB) /* Copies the matrix and internal scores from AP, then picks each terminal score from AP's terminal or internal value depending on the HSP queries below. */\r
+       {\r
+       SubstMx = AP.SubstMx; /* NOTE(review): SubstMxName, Nucleo, NucleoSet, LocalOpen and LocalExt are not copied by this function. */\r
+       OpenA = AP.OpenA;\r
+       OpenB = AP.OpenB;\r
+       ExtA = AP.ExtA;\r
+       ExtB = AP.ExtB;\r
+\r
+       if (HSP.LeftA()) /* presumably: the HSP reaches the left end of A -- confirm in hsp.h */\r
+               {\r
+               LOpenA = AP.LOpenA;\r
+               LExtA = AP.LExtA;\r
+               }\r
+       else /* otherwise the left end of A is scored with the internal values */\r
+               {\r
+               LOpenA = AP.OpenA;\r
+               LExtA = AP.ExtA;\r
+               }\r
+\r
+       if (HSP.LeftB()) /* same choice for the left end of B */\r
+               {\r
+               LOpenB = AP.LOpenB;\r
+               LExtB = AP.LExtB;\r
+               }\r
+       else\r
+               {\r
+               LOpenB = AP.OpenB;\r
+               LExtB = AP.ExtB;\r
+               }\r
+\r
+       if (HSP.RightA(LA)) /* same choice for the right end of A; LA is presumably the length of A -- confirm */\r
+               {\r
+               ROpenA = AP.ROpenA;\r
+               RExtA = AP.RExtA;\r
+               }\r
+       else\r
+               {\r
+               ROpenA = AP.OpenA;\r
+               RExtA = AP.ExtA;\r
+               }\r
+\r
+       if (HSP.RightB(LB)) /* same choice for the right end of B; LB is presumably the length of B -- confirm */\r
+               {\r
+               ROpenB = AP.ROpenB;\r
+               RExtB = AP.RExtB;\r
+               }\r
+       else\r
+               {\r
+               ROpenB = AP.OpenB;\r
+               RExtB = AP.ExtB;\r
+               }\r
+       }\r
+\r
+void AlnParams::LogMe() const\r
+       {\r
+       Log("AlnParams(%s)", GetType());\r
+       if (Is2())\r
+               Log(" g=%.1f e=%.1f", -OpenA, -ExtA);\r
+       else if (Is4())\r
+               Log(" g=%.1f tg=%.1f e=%.1f te=%.1f", -OpenA, -ExtA, -LOpenA, -LExtA);\r
+       else\r
+               Log(\r
+" gA=%.1f gB=%.1f gAL=%.1f gBL=%.1f gAR=%.1f gBR=%.1f eA=%.1f eB=%.1f eAL=%.1f eBL=%.1f eAR=%.1f eBR=%.1f",\r
+                 OpenA, OpenB, LOpenA, LOpenB, ROpenA, ROpenB, ExtA, ExtB, LExtA, LExtB, RExtA, RExtB);\r
+       Log("\n");\r
+       }\r
+\r
+/***\r
+Open/Ext format string is one or more:\r
+       [<flag><flag>...]<value>\r
+\r
+Value is (positive) penalty or * (disabled).\r
+Flag is:\r
+       Q               Query.\r
+       T               Target sequence.\r
+       I               Internal gaps (default internal and terminal).\r
+       E               End gaps (default internal and terminal).\r
+       L               Left end.\r
+       R               Right end.\r
+***/\r
+\r
+static void ParseGapStr(const string &s,\r
+  float &QI, float &QL, float &QR,\r
+  float &TI, float &TL, float &TR)\r
+       {\r
+       if (s.empty())\r
+               return;\r
+\r
+       bool Q = false;\r
+       bool T = false;\r
+       bool I = false;\r
+       bool E = false;\r
+       bool L = false;\r
+       bool R = false;\r
+\r
+       const unsigned K = SIZE(s);\r
+       unsigned Dec = 0;\r
+       float Value = FLT_MAX;\r
+       for (unsigned i = 0; i <= K; ++i)\r
+               {\r
+               char c = s.c_str()[i];\r
+               if (c == 0 || c == '/')\r
+                       {\r
+                       if (Value == FLT_MAX)\r
+                               Die("Invalid gap penalty string, missing penalty '%s'", s.c_str());\r
+                       if (!Q && !T && !I && !E && !L && !R)\r
+                               {\r
+                               Q = true;\r
+                               T = true;\r
+                               L = true;\r
+                               R = true;\r
+                               I = true;\r
+                               }\r
+\r
+                       if (!E && !I && !L && !R)\r
+                               {\r
+                               E = false;\r
+                               I = true;\r
+                               L = true;\r
+                               R = true;\r
+                               }\r
+\r
+                       if (E)\r
+                               {\r
+                               if (L || R)\r
+                                       Die("Invalid gap penalty string (E and L or R) '%s'", s.c_str());\r
+                               L = true;\r
+                               R = true;\r
+                               }\r
+\r
+                       if (!Q && !T)\r
+                               {\r
+                               Q = true;\r
+                               T = true;\r
+                               }\r
+\r
+                       if (Q && L)\r
+                               QL = -Value;\r
+                       if (Q && R)\r
+                               QR = -Value;\r
+                       if (Q && I)\r
+                               QI = -Value;\r
+                       if (T && L)\r
+                               TL = -Value;\r
+                       if (T && R)\r
+                               TR = -Value;\r
+                       if (T && I)\r
+                               TI = -Value;\r
+                       \r
+                       Value = FLT_MAX;\r
+                       Dec = 0;\r
+                       Q = false;\r
+                       T = false;\r
+                       I = false;\r
+                       E = false;\r
+                       L = false;\r
+                       R = false;\r
+                       }\r
+               else if (c == '*')\r
+                       {\r
+                       if (Value != FLT_MAX)\r
+                               Die("Invalid gap penalty (* in floating point number) '%s'", s.c_str());\r
+                       Value = -MINUS_INFINITY;\r
+                       }\r
+               else if (isdigit(c))\r
+                       {\r
+                       if (Value == -MINUS_INFINITY)\r
+                               Die("Invalid gap penalty (* in floating point number) '%s'", s.c_str());\r
+                       if (Value == FLT_MAX)\r
+                               Value = 0.0;\r
+                       if (Dec > 0)\r
+                               {\r
+                               Dec *= 10;\r
+                               Value += float(c - '0')/Dec;\r
+                               }\r
+                       else\r
+                               Value = Value*10 + (c - '0');\r
+                       }\r
+               else if (c == '.')\r
+                       {\r
+                       if (Dec > 0)\r
+                               Die("Invalid gap penalty (two decimal points) '%s'", s.c_str());\r
+                       Dec = 1;\r
+                       }\r
+               else\r
+                       {\r
+                       switch (c)\r
+                               {\r
+                       case 'Q':\r
+                               Q = true;\r
+                               break;\r
+                       case 'T':\r
+                               T = true;\r
+                               break;\r
+                       case 'I':\r
+                               I = true;\r
+                               break;\r
+                       case 'L':\r
+                               L = true;\r
+                               break;\r
+                       case 'R':\r
+                               R = true;\r
+                               break;\r
+                       case 'E':\r
+                               E = true;\r
+                               break;\r
+                       default:\r
+                               Die("Invalid char '%c' in gap penalty string '%s'", c, s.c_str());\r
+                               }\r
+                       }\r
+               }\r
+       }\r
+\r
+void AlnParams::SetPenalties(const string &OpenStr, const string &ExtStr) /* Applies gap-open and gap-extend format strings via ParseGapStr; the A fields receive the Q (query) values and the B fields the T (target) values. */\r
+       {\r
+       ParseGapStr(OpenStr, OpenA, LOpenA, ROpenA, OpenB, LOpenB, ROpenB);\r
+       ParseGapStr(ExtStr, ExtA, LExtA, RExtA, ExtB, LExtB, RExtB);\r
+       }\r
+\r
+void AlnParams::SetMxFromCmdLine(bool IsNucleo) /* Selects the substitution matrix from the opt_* globals and points SubstMx at g_SubstMx. SubstMxName is only assigned on the non-nucleotide path. */\r
+       {\r
+       if (IsNucleo)\r
+               SetNucSubstMx(opt_match, opt_mismatch);
+       else\r
+               {\r
+               if (opt_matrix == "") /* no matrix option given: use BLOSUM62 */\r
+                       {\r
+                       SubstMxName = "BLOSUM62";\r
+                       SetBLOSUM62();
+                       }
+               else\r
+                       {\r
+                       ReadSubstMx(opt_matrix, g_SubstMxf);\r
+                       g_SubstMx = g_SubstMxf.GetData();\r
+                       g_SubstMxf.LogMe();\r
+                       SubstMxName = opt_matrix.c_str(); /* pointer into opt_matrix, not a copy */\r
+                       }\r
+               }\r
+       SubstMx = g_SubstMx;\r
+       asserta(SubstMx != 0);\r
+       }\r
+\r
+void AlnParams::InitFromCmdLine(bool IsNucleo) /* Full initialisation from the opt_* globals: clears all fields, records the alphabet, selects the matrix, then sets local and global gap scores. */\r
+       {\r
+       Clear();\r
+       Nucleo = IsNucleo;\r
+       NucleoSet = true;\r
+\r
+       SetMxFromCmdLine(IsNucleo);\r
+\r
+// Local\r
+       if (optset_lopen || optset_lext)\r
+               {\r
+               if (!optset_lopen || !optset_lext)\r
+                       Die("Must set both --lopen and --lext");\r
+               if (opt_lopen < 0.0 || opt_lext < 0.0)\r
+                       Die("Invalid --lopen/--lext, gap penalties must be >= 0");\r
+               SetLocal(float(-opt_lopen), float(-opt_lext)); /* options are non-negative penalties; they are stored negated */\r
+               }\r
+       else\r
+               {\r
+       // Same penalties, if-statement to note could differ.\r
+               if (IsNucleo)\r
+                       SetLocal(-10.0f, -1.0f);\r
+               else\r
+                       SetLocal(-10.0f, -1.0f);\r
+               }\r
+\r
+// Global\r
+       if (IsNucleo)\r
+               Init4(g_SubstMx, -10.0, -1.0, -0.5, -0.5);
+       else\r
+               Init4(g_SubstMx, -17.0, -1.0, -0.5, -0.5);
+       SetPenalties(opt_gapopen, opt_gapext); /* overrides the defaults above for whichever fields the option strings name; empty strings change nothing */\r
+       }\r
+\r
+float AlnParams::GetLocalOpen() const /* Returns the stored local gap-open score (LocalOpen). */\r
+       {\r
+       return LocalOpen;\r
+       }\r
+\r
+float AlnParams::GetLocalExt() const /* Returns the stored local gap-extension score (LocalExt). */\r
+       {\r
+       return LocalExt;\r
+       }\r
+\r
+bool AlnParams::GetIsNucleo() const /* Returns Nucleo after asserting that NucleoSet is true. */\r
+       {\r
+       asserta(NucleoSet);\r
+       return Nucleo;\r
+       }\r
+\r
+unsigned GetWindexWordLength(bool Nucleo)\r
+       {\r
+       if (optset_w)\r
+               return opt_w;\r
+\r
+       if (Nucleo)\r
+               return 8;\r
+       else\r
+               return 5;\r
+       }\r
+\r
+#if    TEST\r
+static void Test1(const string &os, const string &es) /* Logs both format strings, applies them with SetPenalties and logs the resulting AlnParams. */\r
+       {\r
+       AlnParams AP; /* not Clear()ed: any field the strings do not assign is logged as-is */\r
+       Log("\n");\r
+       Log("OpenStr %s\n", os.c_str());\r
+       Log(" ExtStr %s\n", es.c_str());\r
+       AP.SetPenalties(os, es);\r
+       AP.LogMe();\r
+       }\r
+\r
+void TestGapStr() /* Runs Test1 on three sample gap-open/gap-extend string pairs; compiled only when TEST is non-zero. */\r
+       {\r
+       Test1("17I/0.5E", "1I/0.5E");\r
+       Test1("17I/0.5L/0.4R", "1Q/2T");\r
+       Test1("1QL/2QR/3QI/4TL/5TR/6TI", ".1QL/.2QR/.3QI/.4TL/.5TR/.6TI");\r
+       }\r
+#endif // TEST\r
diff --git a/uchime_src/alnparams.h b/uchime_src/alnparams.h

new file mode 100644 (file)

index 0000000..4037912
--- /dev/null
+++ b/uchime_src/alnparams.h
@@ -0,0 +1,59 @@
+#ifndef alnparams_h\r
+#define alnparams_h\r
+\r
+struct HSPData;\r
+\r
+// Gap penalty scores are negative\r
+// (i.e., are scores, not penalties).\r
+struct AlnParams /* Substitution matrix pointer plus local and global gap scores; the A/B suffixes name the two sequences being aligned. */\r
+       {\r
+       const char *SubstMxName; /* not owned */\r
+       const float * const *SubstMx; /* not owned */\r
+\r
+       bool Nucleo;\r
+       bool NucleoSet; /* GetIsNucleo() asserts this before returning Nucleo */\r
+\r
+// Local gaps\r
+       float LocalOpen;\r
+       float LocalExt;\r
+\r
+// Global internal gaps\r
+       float OpenA;\r
+       float OpenB;\r
+\r
+       float ExtA;\r
+       float ExtB;\r
+\r
+// Global terminal gaps\r
+       float LOpenA;\r
+       float LOpenB;\r
+       float ROpenA;\r
+       float ROpenB;\r
+\r
+       float LExtA;\r
+       float LExtB;\r
+       float RExtA;\r
+       float RExtB;\r
+\r
+       void Clear();\r
+       void SetLocal(float Open, float Ext);\r
+       void Init2(const float * const *Mx, float Open, float Ext);\r
+       void Init4(const float * const *Mx, float Open, float Ext, float TermOpen, float TermExt);\r
+       void Init(const AlnParams &AP, const HSPData &HSP, unsigned LA, unsigned LB);\r
+       void InitFromCmdLine(bool Nucleo);\r
+       void SetMxFromCmdLine(bool Nucleo);\r
+       void SetPenalties(const string &OpenStr, const string &ExtStr);\r
+       float GetLocalOpen() const;\r
+       float GetLocalExt() const;\r
+       bool GetIsNucleo() const;\r
+\r
+       bool Is2() const;\r
+       bool Is4() const;\r
+       const char *GetType() const; /* returns "2", "4" or "12" */\r
+\r
+       void LogMe() const;\r
+       };\r
+\r
+const float OBVIOUSLY_WRONG_PENALTY = 1000.0; /* Positive sentinel for an unset gap score; real gap scores are negative. */\r
+\r
+#endif // alnparams_h\r
diff --git a/uchime_src/alpha.cpp b/uchime_src/alpha.cpp

new file mode 100644 (file)

index 0000000..0efca3b
--- /dev/null
+++ b/uchime_src/alpha.cpp
@@ -0,0 +1,2761 @@
+// Generated by /p/py/alphac.py
+#include "alpha.h"
+
+unsigned g_CharToLetterAminoStop[256] =
+       {
+       INVALID_LETTER, // [  0] 0x00
+       INVALID_LETTER, // [  1] 0x01
+       INVALID_LETTER, // [  2] 0x02
+       INVALID_LETTER, // [  3] 0x03
+       INVALID_LETTER, // [  4] 0x04
+       INVALID_LETTER, // [  5] 0x05
+       INVALID_LETTER, // [  6] 0x06
+       INVALID_LETTER, // [  7] 0x07
+       INVALID_LETTER, // [  8] 0x08
+       INVALID_LETTER, // [  9] 0x09
+       INVALID_LETTER, // [ 10] 0x0a
+       INVALID_LETTER, // [ 11] 0x0b
+       INVALID_LETTER, // [ 12] 0x0c
+       INVALID_LETTER, // [ 13] 0x0d
+       INVALID_LETTER, // [ 14] 0x0e
+       INVALID_LETTER, // [ 15] 0x0f
+       INVALID_LETTER, // [ 16] 0x10
+       INVALID_LETTER, // [ 17] 0x11
+       INVALID_LETTER, // [ 18] 0x12
+       INVALID_LETTER, // [ 19] 0x13
+       INVALID_LETTER, // [ 20] 0x14
+       INVALID_LETTER, // [ 21] 0x15
+       INVALID_LETTER, // [ 22] 0x16
+       INVALID_LETTER, // [ 23] 0x17
+       INVALID_LETTER, // [ 24] 0x18
+       INVALID_LETTER, // [ 25] 0x19
+       INVALID_LETTER, // [ 26] 0x1a
+       INVALID_LETTER, // [ 27] 0x1b
+       INVALID_LETTER, // [ 28] 0x1c
+       INVALID_LETTER, // [ 29] 0x1d
+       INVALID_LETTER, // [ 30] 0x1e
+       INVALID_LETTER, // [ 31] 0x1f
+       INVALID_LETTER, // [ 32] ' '
+       INVALID_LETTER, // [ 33] '!'
+       INVALID_LETTER, // [ 34] '"'
+       INVALID_LETTER, // [ 35] '#'
+       INVALID_LETTER, // [ 36] '$'
+       INVALID_LETTER, // [ 37] '%'
+       INVALID_LETTER, // [ 38] '&'
+       INVALID_LETTER, // [ 39] '''
+       INVALID_LETTER, // [ 40] '('
+       INVALID_LETTER, // [ 41] ')'
+       20 ,            // [ 42] '*' = STP
+       INVALID_LETTER, // [ 43] '+'
+       INVALID_LETTER, // [ 44] ','
+       INVALID_LETTER, // [ 45] '-'
+       INVALID_LETTER, // [ 46] '.'
+       INVALID_LETTER, // [ 47] '/'
+       INVALID_LETTER, // [ 48] '0'
+       INVALID_LETTER, // [ 49] '1'
+       INVALID_LETTER, // [ 50] '2'
+       INVALID_LETTER, // [ 51] '3'
+       INVALID_LETTER, // [ 52] '4'
+       INVALID_LETTER, // [ 53] '5'
+       INVALID_LETTER, // [ 54] '6'
+       INVALID_LETTER, // [ 55] '7'
+       INVALID_LETTER, // [ 56] '8'
+       INVALID_LETTER, // [ 57] '9'
+       INVALID_LETTER, // [ 58] ':'
+       INVALID_LETTER, // [ 59] ';'
+       INVALID_LETTER, // [ 60] '<'
+       INVALID_LETTER, // [ 61] '='
+       INVALID_LETTER, // [ 62] '>'
+       INVALID_LETTER, // [ 63] '?'
+       INVALID_LETTER, // [ 64] '@'
+       0  ,            // [ 65] 'A' = Ala
+       INVALID_LETTER, // [ 66] 'B'
+       1  ,            // [ 67] 'C' = Cys
+       2  ,            // [ 68] 'D' = Asp
+       3  ,            // [ 69] 'E' = Glu
+       4  ,            // [ 70] 'F' = Phe
+       5  ,            // [ 71] 'G' = Gly
+       6  ,            // [ 72] 'H' = His
+       7  ,            // [ 73] 'I' = Ile
+       INVALID_LETTER, // [ 74] 'J'
+       8  ,            // [ 75] 'K' = Lys
+       9  ,            // [ 76] 'L' = Leu
+       10 ,            // [ 77] 'M' = Met
+       11 ,            // [ 78] 'N' = Asn
+       INVALID_LETTER, // [ 79] 'O'
+       12 ,            // [ 80] 'P' = Pro
+       13 ,            // [ 81] 'Q' = Gln
+       14 ,            // [ 82] 'R' = Arg
+       15 ,            // [ 83] 'S' = Ser
+       16 ,            // [ 84] 'T' = Thr
+       INVALID_LETTER, // [ 85] 'U'
+       17 ,            // [ 86] 'V' = Val
+       18 ,            // [ 87] 'W' = Trp
+       INVALID_LETTER, // [ 88] 'X'
+       19 ,            // [ 89] 'Y' = Tyr
+       INVALID_LETTER, // [ 90] 'Z'
+       INVALID_LETTER, // [ 91] '['
+       INVALID_LETTER, // [ 92] '\'
+       INVALID_LETTER, // [ 93] ']'
+       INVALID_LETTER, // [ 94] '^'
+       INVALID_LETTER, // [ 95] '_'
+       INVALID_LETTER, // [ 96] '`'
+       0  ,            // [ 97] 'a' = Ala
+       INVALID_LETTER, // [ 98] 'b'
+       1  ,            // [ 99] 'c' = Cys
+       2  ,            // [100] 'd' = Asp
+       3  ,            // [101] 'e' = Glu
+       4  ,            // [102] 'f' = Phe
+       5  ,            // [103] 'g' = Gly
+       6  ,            // [104] 'h' = His
+       7  ,            // [105] 'i' = Ile
+       INVALID_LETTER, // [106] 'j'
+       8  ,            // [107] 'k' = Lys
+       9  ,            // [108] 'l' = Leu
+       10 ,            // [109] 'm' = Met
+       11 ,            // [110] 'n' = Asn
+       INVALID_LETTER, // [111] 'o'
+       12 ,            // [112] 'p' = Pro
+       13 ,            // [113] 'q' = Gln
+       14 ,            // [114] 'r' = Arg
+       15 ,            // [115] 's' = Ser
+       16 ,            // [116] 't' = Thr
+       INVALID_LETTER, // [117] 'u'
+       17 ,            // [118] 'v' = Val
+       18 ,            // [119] 'w' = Trp
+       INVALID_LETTER, // [120] 'x'
+       19 ,            // [121] 'y' = Tyr
+       INVALID_LETTER, // [122] 'z'
+       INVALID_LETTER, // [123] '{'
+       INVALID_LETTER, // [124] '|'
+       INVALID_LETTER, // [125] '}'
+       INVALID_LETTER, // [126] '~'
+       INVALID_LETTER, // [127] 0x7f
+       INVALID_LETTER, // [128] 0x80
+       INVALID_LETTER, // [129] 0x81
+       INVALID_LETTER, // [130] 0x82
+       INVALID_LETTER, // [131] 0x83
+       INVALID_LETTER, // [132] 0x84
+       INVALID_LETTER, // [133] 0x85
+       INVALID_LETTER, // [134] 0x86
+       INVALID_LETTER, // [135] 0x87
+       INVALID_LETTER, // [136] 0x88
+       INVALID_LETTER, // [137] 0x89
+       INVALID_LETTER, // [138] 0x8a
+       INVALID_LETTER, // [139] 0x8b
+       INVALID_LETTER, // [140] 0x8c
+       INVALID_LETTER, // [141] 0x8d
+       INVALID_LETTER, // [142] 0x8e
+       INVALID_LETTER, // [143] 0x8f
+       INVALID_LETTER, // [144] 0x90
+       INVALID_LETTER, // [145] 0x91
+       INVALID_LETTER, // [146] 0x92
+       INVALID_LETTER, // [147] 0x93
+       INVALID_LETTER, // [148] 0x94
+       INVALID_LETTER, // [149] 0x95
+       INVALID_LETTER, // [150] 0x96
+       INVALID_LETTER, // [151] 0x97
+       INVALID_LETTER, // [152] 0x98
+       INVALID_LETTER, // [153] 0x99
+       INVALID_LETTER, // [154] 0x9a
+       INVALID_LETTER, // [155] 0x9b
+       INVALID_LETTER, // [156] 0x9c
+       INVALID_LETTER, // [157] 0x9d
+       INVALID_LETTER, // [158] 0x9e
+       INVALID_LETTER, // [159] 0x9f
+       INVALID_LETTER, // [160] 0xa0
+       INVALID_LETTER, // [161] 0xa1
+       INVALID_LETTER, // [162] 0xa2
+       INVALID_LETTER, // [163] 0xa3
+       INVALID_LETTER, // [164] 0xa4
+       INVALID_LETTER, // [165] 0xa5
+       INVALID_LETTER, // [166] 0xa6
+       INVALID_LETTER, // [167] 0xa7
+       INVALID_LETTER, // [168] 0xa8
+       INVALID_LETTER, // [169] 0xa9
+       INVALID_LETTER, // [170] 0xaa
+       INVALID_LETTER, // [171] 0xab
+       INVALID_LETTER, // [172] 0xac
+       INVALID_LETTER, // [173] 0xad
+       INVALID_LETTER, // [174] 0xae
+       INVALID_LETTER, // [175] 0xaf
+       INVALID_LETTER, // [176] 0xb0
+       INVALID_LETTER, // [177] 0xb1
+       INVALID_LETTER, // [178] 0xb2
+       INVALID_LETTER, // [179] 0xb3
+       INVALID_LETTER, // [180] 0xb4
+       INVALID_LETTER, // [181] 0xb5
+       INVALID_LETTER, // [182] 0xb6
+       INVALID_LETTER, // [183] 0xb7
+       INVALID_LETTER, // [184] 0xb8
+       INVALID_LETTER, // [185] 0xb9
+       INVALID_LETTER, // [186] 0xba
+       INVALID_LETTER, // [187] 0xbb
+       INVALID_LETTER, // [188] 0xbc
+       INVALID_LETTER, // [189] 0xbd
+       INVALID_LETTER, // [190] 0xbe
+       INVALID_LETTER, // [191] 0xbf
+       INVALID_LETTER, // [192] 0xc0
+       INVALID_LETTER, // [193] 0xc1
+       INVALID_LETTER, // [194] 0xc2
+       INVALID_LETTER, // [195] 0xc3
+       INVALID_LETTER, // [196] 0xc4
+       INVALID_LETTER, // [197] 0xc5
+       INVALID_LETTER, // [198] 0xc6
+       INVALID_LETTER, // [199] 0xc7
+       INVALID_LETTER, // [200] 0xc8
+       INVALID_LETTER, // [201] 0xc9
+       INVALID_LETTER, // [202] 0xca
+       INVALID_LETTER, // [203] 0xcb
+       INVALID_LETTER, // [204] 0xcc
+       INVALID_LETTER, // [205] 0xcd
+       INVALID_LETTER, // [206] 0xce
+       INVALID_LETTER, // [207] 0xcf
+       INVALID_LETTER, // [208] 0xd0
+       INVALID_LETTER, // [209] 0xd1
+       INVALID_LETTER, // [210] 0xd2
+       INVALID_LETTER, // [211] 0xd3
+       INVALID_LETTER, // [212] 0xd4
+       INVALID_LETTER, // [213] 0xd5
+       INVALID_LETTER, // [214] 0xd6
+       INVALID_LETTER, // [215] 0xd7
+       INVALID_LETTER, // [216] 0xd8
+       INVALID_LETTER, // [217] 0xd9
+       INVALID_LETTER, // [218] 0xda
+       INVALID_LETTER, // [219] 0xdb
+       INVALID_LETTER, // [220] 0xdc
+       INVALID_LETTER, // [221] 0xdd
+       INVALID_LETTER, // [222] 0xde
+       INVALID_LETTER, // [223] 0xdf
+       INVALID_LETTER, // [224] 0xe0
+       INVALID_LETTER, // [225] 0xe1
+       INVALID_LETTER, // [226] 0xe2
+       INVALID_LETTER, // [227] 0xe3
+       INVALID_LETTER, // [228] 0xe4
+       INVALID_LETTER, // [229] 0xe5
+       INVALID_LETTER, // [230] 0xe6
+       INVALID_LETTER, // [231] 0xe7
+       INVALID_LETTER, // [232] 0xe8
+       INVALID_LETTER, // [233] 0xe9
+       INVALID_LETTER, // [234] 0xea
+       INVALID_LETTER, // [235] 0xeb
+       INVALID_LETTER, // [236] 0xec
+       INVALID_LETTER, // [237] 0xed
+       INVALID_LETTER, // [238] 0xee
+       INVALID_LETTER, // [239] 0xef
+       INVALID_LETTER, // [240] 0xf0
+       INVALID_LETTER, // [241] 0xf1
+       INVALID_LETTER, // [242] 0xf2
+       INVALID_LETTER, // [243] 0xf3
+       INVALID_LETTER, // [244] 0xf4
+       INVALID_LETTER, // [245] 0xf5
+       INVALID_LETTER, // [246] 0xf6
+       INVALID_LETTER, // [247] 0xf7
+       INVALID_LETTER, // [248] 0xf8
+       INVALID_LETTER, // [249] 0xf9
+       INVALID_LETTER, // [250] 0xfa
+       INVALID_LETTER, // [251] 0xfb
+       INVALID_LETTER, // [252] 0xfc
+       INVALID_LETTER, // [253] 0xfd
+       INVALID_LETTER, // [254] 0xfe
+       INVALID_LETTER, // [255] 0xff
+       };
+unsigned g_CharToLetterAmino[256] = // Lookup table indexed by byte value (0-255): amino-acid character -> letter index.
+       { // Upper- and lower-case one-letter codes share an index in 0..19; every other byte is INVALID_LETTER.
+       INVALID_LETTER, // [  0] 0x00
+       INVALID_LETTER, // [  1] 0x01
+       INVALID_LETTER, // [  2] 0x02
+       INVALID_LETTER, // [  3] 0x03
+       INVALID_LETTER, // [  4] 0x04
+       INVALID_LETTER, // [  5] 0x05
+       INVALID_LETTER, // [  6] 0x06
+       INVALID_LETTER, // [  7] 0x07
+       INVALID_LETTER, // [  8] 0x08
+       INVALID_LETTER, // [  9] 0x09
+       INVALID_LETTER, // [ 10] 0x0a
+       INVALID_LETTER, // [ 11] 0x0b
+       INVALID_LETTER, // [ 12] 0x0c
+       INVALID_LETTER, // [ 13] 0x0d
+       INVALID_LETTER, // [ 14] 0x0e
+       INVALID_LETTER, // [ 15] 0x0f
+       INVALID_LETTER, // [ 16] 0x10
+       INVALID_LETTER, // [ 17] 0x11
+       INVALID_LETTER, // [ 18] 0x12
+       INVALID_LETTER, // [ 19] 0x13
+       INVALID_LETTER, // [ 20] 0x14
+       INVALID_LETTER, // [ 21] 0x15
+       INVALID_LETTER, // [ 22] 0x16
+       INVALID_LETTER, // [ 23] 0x17
+       INVALID_LETTER, // [ 24] 0x18
+       INVALID_LETTER, // [ 25] 0x19
+       INVALID_LETTER, // [ 26] 0x1a
+       INVALID_LETTER, // [ 27] 0x1b
+       INVALID_LETTER, // [ 28] 0x1c
+       INVALID_LETTER, // [ 29] 0x1d
+       INVALID_LETTER, // [ 30] 0x1e
+       INVALID_LETTER, // [ 31] 0x1f
+       INVALID_LETTER, // [ 32] ' '
+       INVALID_LETTER, // [ 33] '!'
+       INVALID_LETTER, // [ 34] '"'
+       INVALID_LETTER, // [ 35] '#'
+       INVALID_LETTER, // [ 36] '$'
+       INVALID_LETTER, // [ 37] '%'
+       INVALID_LETTER, // [ 38] '&'
+       INVALID_LETTER, // [ 39] '''
+       INVALID_LETTER, // [ 40] '('
+       INVALID_LETTER, // [ 41] ')'
+       INVALID_LETTER, // [ 42] '*' -- NOTE(review): g_LetterToCharAmino[20] is '*', but '*' is not mapped to 20 here; confirm this is intended
+       INVALID_LETTER, // [ 43] '+'
+       INVALID_LETTER, // [ 44] ','
+       INVALID_LETTER, // [ 45] '-'
+       INVALID_LETTER, // [ 46] '.'
+       INVALID_LETTER, // [ 47] '/'
+       INVALID_LETTER, // [ 48] '0'
+       INVALID_LETTER, // [ 49] '1'
+       INVALID_LETTER, // [ 50] '2'
+       INVALID_LETTER, // [ 51] '3'
+       INVALID_LETTER, // [ 52] '4'
+       INVALID_LETTER, // [ 53] '5'
+       INVALID_LETTER, // [ 54] '6'
+       INVALID_LETTER, // [ 55] '7'
+       INVALID_LETTER, // [ 56] '8'
+       INVALID_LETTER, // [ 57] '9'
+       INVALID_LETTER, // [ 58] ':'
+       INVALID_LETTER, // [ 59] ';'
+       INVALID_LETTER, // [ 60] '<'
+       INVALID_LETTER, // [ 61] '='
+       INVALID_LETTER, // [ 62] '>'
+       INVALID_LETTER, // [ 63] '?'
+       INVALID_LETTER, // [ 64] '@'
+       0  ,            // [ 65] 'A' = Ala
+       INVALID_LETTER, // [ 66] 'B'
+       1  ,            // [ 67] 'C' = Cys
+       2  ,            // [ 68] 'D' = Asp
+       3  ,            // [ 69] 'E' = Glu
+       4  ,            // [ 70] 'F' = Phe
+       5  ,            // [ 71] 'G' = Gly
+       6  ,            // [ 72] 'H' = His
+       7  ,            // [ 73] 'I' = Ile
+       INVALID_LETTER, // [ 74] 'J'
+       8  ,            // [ 75] 'K' = Lys
+       9  ,            // [ 76] 'L' = Leu
+       10 ,            // [ 77] 'M' = Met
+       11 ,            // [ 78] 'N' = Asn
+       INVALID_LETTER, // [ 79] 'O'
+       12 ,            // [ 80] 'P' = Pro
+       13 ,            // [ 81] 'Q' = Gln
+       14 ,            // [ 82] 'R' = Arg
+       15 ,            // [ 83] 'S' = Ser
+       16 ,            // [ 84] 'T' = Thr
+       INVALID_LETTER, // [ 85] 'U'
+       17 ,            // [ 86] 'V' = Val
+       18 ,            // [ 87] 'W' = Trp
+       INVALID_LETTER, // [ 88] 'X'
+       19 ,            // [ 89] 'Y' = Tyr
+       INVALID_LETTER, // [ 90] 'Z'
+       INVALID_LETTER, // [ 91] '['
+       INVALID_LETTER, // [ 92] '\'
+       INVALID_LETTER, // [ 93] ']'
+       INVALID_LETTER, // [ 94] '^'
+       INVALID_LETTER, // [ 95] '_'
+       INVALID_LETTER, // [ 96] '`'
+       0  ,            // [ 97] 'a' = Ala
+       INVALID_LETTER, // [ 98] 'b'
+       1  ,            // [ 99] 'c' = Cys
+       2  ,            // [100] 'd' = Asp
+       3  ,            // [101] 'e' = Glu
+       4  ,            // [102] 'f' = Phe
+       5  ,            // [103] 'g' = Gly
+       6  ,            // [104] 'h' = His
+       7  ,            // [105] 'i' = Ile
+       INVALID_LETTER, // [106] 'j'
+       8  ,            // [107] 'k' = Lys
+       9  ,            // [108] 'l' = Leu
+       10 ,            // [109] 'm' = Met
+       11 ,            // [110] 'n' = Asn
+       INVALID_LETTER, // [111] 'o'
+       12 ,            // [112] 'p' = Pro
+       13 ,            // [113] 'q' = Gln
+       14 ,            // [114] 'r' = Arg
+       15 ,            // [115] 's' = Ser
+       16 ,            // [116] 't' = Thr
+       INVALID_LETTER, // [117] 'u'
+       17 ,            // [118] 'v' = Val
+       18 ,            // [119] 'w' = Trp
+       INVALID_LETTER, // [120] 'x'
+       19 ,            // [121] 'y' = Tyr
+       INVALID_LETTER, // [122] 'z'
+       INVALID_LETTER, // [123] '{'
+       INVALID_LETTER, // [124] '|'
+       INVALID_LETTER, // [125] '}'
+       INVALID_LETTER, // [126] '~'
+       INVALID_LETTER, // [127] 0x7f
+       INVALID_LETTER, // [128] 0x80
+       INVALID_LETTER, // [129] 0x81
+       INVALID_LETTER, // [130] 0x82
+       INVALID_LETTER, // [131] 0x83
+       INVALID_LETTER, // [132] 0x84
+       INVALID_LETTER, // [133] 0x85
+       INVALID_LETTER, // [134] 0x86
+       INVALID_LETTER, // [135] 0x87
+       INVALID_LETTER, // [136] 0x88
+       INVALID_LETTER, // [137] 0x89
+       INVALID_LETTER, // [138] 0x8a
+       INVALID_LETTER, // [139] 0x8b
+       INVALID_LETTER, // [140] 0x8c
+       INVALID_LETTER, // [141] 0x8d
+       INVALID_LETTER, // [142] 0x8e
+       INVALID_LETTER, // [143] 0x8f
+       INVALID_LETTER, // [144] 0x90
+       INVALID_LETTER, // [145] 0x91
+       INVALID_LETTER, // [146] 0x92
+       INVALID_LETTER, // [147] 0x93
+       INVALID_LETTER, // [148] 0x94
+       INVALID_LETTER, // [149] 0x95
+       INVALID_LETTER, // [150] 0x96
+       INVALID_LETTER, // [151] 0x97
+       INVALID_LETTER, // [152] 0x98
+       INVALID_LETTER, // [153] 0x99
+       INVALID_LETTER, // [154] 0x9a
+       INVALID_LETTER, // [155] 0x9b
+       INVALID_LETTER, // [156] 0x9c
+       INVALID_LETTER, // [157] 0x9d
+       INVALID_LETTER, // [158] 0x9e
+       INVALID_LETTER, // [159] 0x9f
+       INVALID_LETTER, // [160] 0xa0
+       INVALID_LETTER, // [161] 0xa1
+       INVALID_LETTER, // [162] 0xa2
+       INVALID_LETTER, // [163] 0xa3
+       INVALID_LETTER, // [164] 0xa4
+       INVALID_LETTER, // [165] 0xa5
+       INVALID_LETTER, // [166] 0xa6
+       INVALID_LETTER, // [167] 0xa7
+       INVALID_LETTER, // [168] 0xa8
+       INVALID_LETTER, // [169] 0xa9
+       INVALID_LETTER, // [170] 0xaa
+       INVALID_LETTER, // [171] 0xab
+       INVALID_LETTER, // [172] 0xac
+       INVALID_LETTER, // [173] 0xad
+       INVALID_LETTER, // [174] 0xae
+       INVALID_LETTER, // [175] 0xaf
+       INVALID_LETTER, // [176] 0xb0
+       INVALID_LETTER, // [177] 0xb1
+       INVALID_LETTER, // [178] 0xb2
+       INVALID_LETTER, // [179] 0xb3
+       INVALID_LETTER, // [180] 0xb4
+       INVALID_LETTER, // [181] 0xb5
+       INVALID_LETTER, // [182] 0xb6
+       INVALID_LETTER, // [183] 0xb7
+       INVALID_LETTER, // [184] 0xb8
+       INVALID_LETTER, // [185] 0xb9
+       INVALID_LETTER, // [186] 0xba
+       INVALID_LETTER, // [187] 0xbb
+       INVALID_LETTER, // [188] 0xbc
+       INVALID_LETTER, // [189] 0xbd
+       INVALID_LETTER, // [190] 0xbe
+       INVALID_LETTER, // [191] 0xbf
+       INVALID_LETTER, // [192] 0xc0
+       INVALID_LETTER, // [193] 0xc1
+       INVALID_LETTER, // [194] 0xc2
+       INVALID_LETTER, // [195] 0xc3
+       INVALID_LETTER, // [196] 0xc4
+       INVALID_LETTER, // [197] 0xc5
+       INVALID_LETTER, // [198] 0xc6
+       INVALID_LETTER, // [199] 0xc7
+       INVALID_LETTER, // [200] 0xc8
+       INVALID_LETTER, // [201] 0xc9
+       INVALID_LETTER, // [202] 0xca
+       INVALID_LETTER, // [203] 0xcb
+       INVALID_LETTER, // [204] 0xcc
+       INVALID_LETTER, // [205] 0xcd
+       INVALID_LETTER, // [206] 0xce
+       INVALID_LETTER, // [207] 0xcf
+       INVALID_LETTER, // [208] 0xd0
+       INVALID_LETTER, // [209] 0xd1
+       INVALID_LETTER, // [210] 0xd2
+       INVALID_LETTER, // [211] 0xd3
+       INVALID_LETTER, // [212] 0xd4
+       INVALID_LETTER, // [213] 0xd5
+       INVALID_LETTER, // [214] 0xd6
+       INVALID_LETTER, // [215] 0xd7
+       INVALID_LETTER, // [216] 0xd8
+       INVALID_LETTER, // [217] 0xd9
+       INVALID_LETTER, // [218] 0xda
+       INVALID_LETTER, // [219] 0xdb
+       INVALID_LETTER, // [220] 0xdc
+       INVALID_LETTER, // [221] 0xdd
+       INVALID_LETTER, // [222] 0xde
+       INVALID_LETTER, // [223] 0xdf
+       INVALID_LETTER, // [224] 0xe0
+       INVALID_LETTER, // [225] 0xe1
+       INVALID_LETTER, // [226] 0xe2
+       INVALID_LETTER, // [227] 0xe3
+       INVALID_LETTER, // [228] 0xe4
+       INVALID_LETTER, // [229] 0xe5
+       INVALID_LETTER, // [230] 0xe6
+       INVALID_LETTER, // [231] 0xe7
+       INVALID_LETTER, // [232] 0xe8
+       INVALID_LETTER, // [233] 0xe9
+       INVALID_LETTER, // [234] 0xea
+       INVALID_LETTER, // [235] 0xeb
+       INVALID_LETTER, // [236] 0xec
+       INVALID_LETTER, // [237] 0xed
+       INVALID_LETTER, // [238] 0xee
+       INVALID_LETTER, // [239] 0xef
+       INVALID_LETTER, // [240] 0xf0
+       INVALID_LETTER, // [241] 0xf1
+       INVALID_LETTER, // [242] 0xf2
+       INVALID_LETTER, // [243] 0xf3
+       INVALID_LETTER, // [244] 0xf4
+       INVALID_LETTER, // [245] 0xf5
+       INVALID_LETTER, // [246] 0xf6
+       INVALID_LETTER, // [247] 0xf7
+       INVALID_LETTER, // [248] 0xf8
+       INVALID_LETTER, // [249] 0xf9
+       INVALID_LETTER, // [250] 0xfa
+       INVALID_LETTER, // [251] 0xfb
+       INVALID_LETTER, // [252] 0xfc
+       INVALID_LETTER, // [253] 0xfd
+       INVALID_LETTER, // [254] 0xfe
+       INVALID_LETTER, // [255] 0xff
+       };
+
+unsigned char g_LetterToCharAmino[256] = // Lookup table indexed by letter index: index -> upper-case amino-acid character.
+       { // Indices 0..19 hold the one-letter residue codes, index 20 holds '*', and 21..255 are INVALID_CHAR.
+       'A', // [0] Ala
+       'C', // [1] Cys
+       'D', // [2] Asp
+       'E', // [3] Glu
+       'F', // [4] Phe
+       'G', // [5] Gly
+       'H', // [6] His
+       'I', // [7] Ile
+       'K', // [8] Lys
+       'L', // [9] Leu
+       'M', // [10] Met
+       'N', // [11] Asn
+       'P', // [12] Pro
+       'Q', // [13] Gln
+       'R', // [14] Arg
+       'S', // [15] Ser
+       'T', // [16] Thr
+       'V', // [17] Val
+       'W', // [18] Trp
+       'Y', // [19] Tyr
+       '*', // [20] NOTE(review): g_CharToLetterAmino maps byte 42 ('*') to INVALID_LETTER, so this entry has no forward mapping; confirm intended
+       INVALID_CHAR, // [21]
+       INVALID_CHAR, // [22]
+       INVALID_CHAR, // [23]
+       INVALID_CHAR, // [24]
+       INVALID_CHAR, // [25]
+       INVALID_CHAR, // [26]
+       INVALID_CHAR, // [27]
+       INVALID_CHAR, // [28]
+       INVALID_CHAR, // [29]
+       INVALID_CHAR, // [30]
+       INVALID_CHAR, // [31]
+       INVALID_CHAR, // [32]
+       INVALID_CHAR, // [33]
+       INVALID_CHAR, // [34]
+       INVALID_CHAR, // [35]
+       INVALID_CHAR, // [36]
+       INVALID_CHAR, // [37]
+       INVALID_CHAR, // [38]
+       INVALID_CHAR, // [39]
+       INVALID_CHAR, // [40]
+       INVALID_CHAR, // [41]
+       INVALID_CHAR, // [42]
+       INVALID_CHAR, // [43]
+       INVALID_CHAR, // [44]
+       INVALID_CHAR, // [45]
+       INVALID_CHAR, // [46]
+       INVALID_CHAR, // [47]
+       INVALID_CHAR, // [48]
+       INVALID_CHAR, // [49]
+       INVALID_CHAR, // [50]
+       INVALID_CHAR, // [51]
+       INVALID_CHAR, // [52]
+       INVALID_CHAR, // [53]
+       INVALID_CHAR, // [54]
+       INVALID_CHAR, // [55]
+       INVALID_CHAR, // [56]
+       INVALID_CHAR, // [57]
+       INVALID_CHAR, // [58]
+       INVALID_CHAR, // [59]
+       INVALID_CHAR, // [60]
+       INVALID_CHAR, // [61]
+       INVALID_CHAR, // [62]
+       INVALID_CHAR, // [63]
+       INVALID_CHAR, // [64]
+       INVALID_CHAR, // [65]
+       INVALID_CHAR, // [66]
+       INVALID_CHAR, // [67]
+       INVALID_CHAR, // [68]
+       INVALID_CHAR, // [69]
+       INVALID_CHAR, // [70]
+       INVALID_CHAR, // [71]
+       INVALID_CHAR, // [72]
+       INVALID_CHAR, // [73]
+       INVALID_CHAR, // [74]
+       INVALID_CHAR, // [75]
+       INVALID_CHAR, // [76]
+       INVALID_CHAR, // [77]
+       INVALID_CHAR, // [78]
+       INVALID_CHAR, // [79]
+       INVALID_CHAR, // [80]
+       INVALID_CHAR, // [81]
+       INVALID_CHAR, // [82]
+       INVALID_CHAR, // [83]
+       INVALID_CHAR, // [84]
+       INVALID_CHAR, // [85]
+       INVALID_CHAR, // [86]
+       INVALID_CHAR, // [87]
+       INVALID_CHAR, // [88]
+       INVALID_CHAR, // [89]
+       INVALID_CHAR, // [90]
+       INVALID_CHAR, // [91]
+       INVALID_CHAR, // [92]
+       INVALID_CHAR, // [93]
+       INVALID_CHAR, // [94]
+       INVALID_CHAR, // [95]
+       INVALID_CHAR, // [96]
+       INVALID_CHAR, // [97]
+       INVALID_CHAR, // [98]
+       INVALID_CHAR, // [99]
+       INVALID_CHAR, // [100]
+       INVALID_CHAR, // [101]
+       INVALID_CHAR, // [102]
+       INVALID_CHAR, // [103]
+       INVALID_CHAR, // [104]
+       INVALID_CHAR, // [105]
+       INVALID_CHAR, // [106]
+       INVALID_CHAR, // [107]
+       INVALID_CHAR, // [108]
+       INVALID_CHAR, // [109]
+       INVALID_CHAR, // [110]
+       INVALID_CHAR, // [111]
+       INVALID_CHAR, // [112]
+       INVALID_CHAR, // [113]
+       INVALID_CHAR, // [114]
+       INVALID_CHAR, // [115]
+       INVALID_CHAR, // [116]
+       INVALID_CHAR, // [117]
+       INVALID_CHAR, // [118]
+       INVALID_CHAR, // [119]
+       INVALID_CHAR, // [120]
+       INVALID_CHAR, // [121]
+       INVALID_CHAR, // [122]
+       INVALID_CHAR, // [123]
+       INVALID_CHAR, // [124]
+       INVALID_CHAR, // [125]
+       INVALID_CHAR, // [126]
+       INVALID_CHAR, // [127]
+       INVALID_CHAR, // [128]
+       INVALID_CHAR, // [129]
+       INVALID_CHAR, // [130]
+       INVALID_CHAR, // [131]
+       INVALID_CHAR, // [132]
+       INVALID_CHAR, // [133]
+       INVALID_CHAR, // [134]
+       INVALID_CHAR, // [135]
+       INVALID_CHAR, // [136]
+       INVALID_CHAR, // [137]
+       INVALID_CHAR, // [138]
+       INVALID_CHAR, // [139]
+       INVALID_CHAR, // [140]
+       INVALID_CHAR, // [141]
+       INVALID_CHAR, // [142]
+       INVALID_CHAR, // [143]
+       INVALID_CHAR, // [144]
+       INVALID_CHAR, // [145]
+       INVALID_CHAR, // [146]
+       INVALID_CHAR, // [147]
+       INVALID_CHAR, // [148]
+       INVALID_CHAR, // [149]
+       INVALID_CHAR, // [150]
+       INVALID_CHAR, // [151]
+       INVALID_CHAR, // [152]
+       INVALID_CHAR, // [153]
+       INVALID_CHAR, // [154]
+       INVALID_CHAR, // [155]
+       INVALID_CHAR, // [156]
+       INVALID_CHAR, // [157]
+       INVALID_CHAR, // [158]
+       INVALID_CHAR, // [159]
+       INVALID_CHAR, // [160]
+       INVALID_CHAR, // [161]
+       INVALID_CHAR, // [162]
+       INVALID_CHAR, // [163]
+       INVALID_CHAR, // [164]
+       INVALID_CHAR, // [165]
+       INVALID_CHAR, // [166]
+       INVALID_CHAR, // [167]
+       INVALID_CHAR, // [168]
+       INVALID_CHAR, // [169]
+       INVALID_CHAR, // [170]
+       INVALID_CHAR, // [171]
+       INVALID_CHAR, // [172]
+       INVALID_CHAR, // [173]
+       INVALID_CHAR, // [174]
+       INVALID_CHAR, // [175]
+       INVALID_CHAR, // [176]
+       INVALID_CHAR, // [177]
+       INVALID_CHAR, // [178]
+       INVALID_CHAR, // [179]
+       INVALID_CHAR, // [180]
+       INVALID_CHAR, // [181]
+       INVALID_CHAR, // [182]
+       INVALID_CHAR, // [183]
+       INVALID_CHAR, // [184]
+       INVALID_CHAR, // [185]
+       INVALID_CHAR, // [186]
+       INVALID_CHAR, // [187]
+       INVALID_CHAR, // [188]
+       INVALID_CHAR, // [189]
+       INVALID_CHAR, // [190]
+       INVALID_CHAR, // [191]
+       INVALID_CHAR, // [192]
+       INVALID_CHAR, // [193]
+       INVALID_CHAR, // [194]
+       INVALID_CHAR, // [195]
+       INVALID_CHAR, // [196]
+       INVALID_CHAR, // [197]
+       INVALID_CHAR, // [198]
+       INVALID_CHAR, // [199]
+       INVALID_CHAR, // [200]
+       INVALID_CHAR, // [201]
+       INVALID_CHAR, // [202]
+       INVALID_CHAR, // [203]
+       INVALID_CHAR, // [204]
+       INVALID_CHAR, // [205]
+       INVALID_CHAR, // [206]
+       INVALID_CHAR, // [207]
+       INVALID_CHAR, // [208]
+       INVALID_CHAR, // [209]
+       INVALID_CHAR, // [210]
+       INVALID_CHAR, // [211]
+       INVALID_CHAR, // [212]
+       INVALID_CHAR, // [213]
+       INVALID_CHAR, // [214]
+       INVALID_CHAR, // [215]
+       INVALID_CHAR, // [216]
+       INVALID_CHAR, // [217]
+       INVALID_CHAR, // [218]
+       INVALID_CHAR, // [219]
+       INVALID_CHAR, // [220]
+       INVALID_CHAR, // [221]
+       INVALID_CHAR, // [222]
+       INVALID_CHAR, // [223]
+       INVALID_CHAR, // [224]
+       INVALID_CHAR, // [225]
+       INVALID_CHAR, // [226]
+       INVALID_CHAR, // [227]
+       INVALID_CHAR, // [228]
+       INVALID_CHAR, // [229]
+       INVALID_CHAR, // [230]
+       INVALID_CHAR, // [231]
+       INVALID_CHAR, // [232]
+       INVALID_CHAR, // [233]
+       INVALID_CHAR, // [234]
+       INVALID_CHAR, // [235]
+       INVALID_CHAR, // [236]
+       INVALID_CHAR, // [237]
+       INVALID_CHAR, // [238]
+       INVALID_CHAR, // [239]
+       INVALID_CHAR, // [240]
+       INVALID_CHAR, // [241]
+       INVALID_CHAR, // [242]
+       INVALID_CHAR, // [243]
+       INVALID_CHAR, // [244]
+       INVALID_CHAR, // [245]
+       INVALID_CHAR, // [246]
+       INVALID_CHAR, // [247]
+       INVALID_CHAR, // [248]
+       INVALID_CHAR, // [249]
+       INVALID_CHAR, // [250]
+       INVALID_CHAR, // [251]
+       INVALID_CHAR, // [252]
+       INVALID_CHAR, // [253]
+       INVALID_CHAR, // [254]
+       INVALID_CHAR, // [255]
+       };
+
+unsigned g_CharToLetterNucleo[256] = // Lookup table indexed by byte value (0-255): nucleotide character -> letter index.
+       { // A/a = 0, C/c = 1, G/g = 2, T/t and U/u = 3; every other byte is INVALID_LETTER.
+       INVALID_LETTER, // [  0] = 0x00
+       INVALID_LETTER, // [  1] = 0x01
+       INVALID_LETTER, // [  2] = 0x02
+       INVALID_LETTER, // [  3] = 0x03
+       INVALID_LETTER, // [  4] = 0x04
+       INVALID_LETTER, // [  5] = 0x05
+       INVALID_LETTER, // [  6] = 0x06
+       INVALID_LETTER, // [  7] = 0x07
+       INVALID_LETTER, // [  8] = 0x08
+       INVALID_LETTER, // [  9] = 0x09
+       INVALID_LETTER, // [ 10] = 0x0a
+       INVALID_LETTER, // [ 11] = 0x0b
+       INVALID_LETTER, // [ 12] = 0x0c
+       INVALID_LETTER, // [ 13] = 0x0d
+       INVALID_LETTER, // [ 14] = 0x0e
+       INVALID_LETTER, // [ 15] = 0x0f
+       INVALID_LETTER, // [ 16] = 0x10
+       INVALID_LETTER, // [ 17] = 0x11
+       INVALID_LETTER, // [ 18] = 0x12
+       INVALID_LETTER, // [ 19] = 0x13
+       INVALID_LETTER, // [ 20] = 0x14
+       INVALID_LETTER, // [ 21] = 0x15
+       INVALID_LETTER, // [ 22] = 0x16
+       INVALID_LETTER, // [ 23] = 0x17
+       INVALID_LETTER, // [ 24] = 0x18
+       INVALID_LETTER, // [ 25] = 0x19
+       INVALID_LETTER, // [ 26] = 0x1a
+       INVALID_LETTER, // [ 27] = 0x1b
+       INVALID_LETTER, // [ 28] = 0x1c
+       INVALID_LETTER, // [ 29] = 0x1d
+       INVALID_LETTER, // [ 30] = 0x1e
+       INVALID_LETTER, // [ 31] = 0x1f
+       INVALID_LETTER, // [ 32] = 32
+       INVALID_LETTER, // [ 33] = 33
+       INVALID_LETTER, // [ 34] = 34
+       INVALID_LETTER, // [ 35] = 35
+       INVALID_LETTER, // [ 36] = 36
+       INVALID_LETTER, // [ 37] = 37
+       INVALID_LETTER, // [ 38] = 38
+       INVALID_LETTER, // [ 39] = 39
+       INVALID_LETTER, // [ 40] = 40
+       INVALID_LETTER, // [ 41] = 41
+       INVALID_LETTER, // [ 42] = 42
+       INVALID_LETTER, // [ 43] = 43
+       INVALID_LETTER, // [ 44] = 44
+       INVALID_LETTER, // [ 45] = 45
+       INVALID_LETTER, // [ 46] = 46
+       INVALID_LETTER, // [ 47] = 47
+       INVALID_LETTER, // [ 48] = 48
+       INVALID_LETTER, // [ 49] = 49
+       INVALID_LETTER, // [ 50] = 50
+       INVALID_LETTER, // [ 51] = 51
+       INVALID_LETTER, // [ 52] = 52
+       INVALID_LETTER, // [ 53] = 53
+       INVALID_LETTER, // [ 54] = 54
+       INVALID_LETTER, // [ 55] = 55
+       INVALID_LETTER, // [ 56] = 56
+       INVALID_LETTER, // [ 57] = 57
+       INVALID_LETTER, // [ 58] = 58
+       INVALID_LETTER, // [ 59] = 59
+       INVALID_LETTER, // [ 60] = 60
+       INVALID_LETTER, // [ 61] = 61
+       INVALID_LETTER, // [ 62] = 62
+       INVALID_LETTER, // [ 63] = 63
+       INVALID_LETTER, // [ 64] = 64
+       0  ,            // [ 65] = A (Nucleotide)
+       INVALID_LETTER, // [ 66] = 66
+       1  ,            // [ 67] = C (Nucleotide)
+       INVALID_LETTER, // [ 68] = 68
+       INVALID_LETTER, // [ 69] = 69
+       INVALID_LETTER, // [ 70] = 70
+       2  ,            // [ 71] = G (Nucleotide)
+       INVALID_LETTER, // [ 72] = 72
+       INVALID_LETTER, // [ 73] = 73
+       INVALID_LETTER, // [ 74] = 74
+       INVALID_LETTER, // [ 75] = 75
+       INVALID_LETTER, // [ 76] = 76
+       INVALID_LETTER, // [ 77] = 77
+       INVALID_LETTER, // [ 78] = 78
+       INVALID_LETTER, // [ 79] = 79
+       INVALID_LETTER, // [ 80] = 80
+       INVALID_LETTER, // [ 81] = 81
+       INVALID_LETTER, // [ 82] = 82
+       INVALID_LETTER, // [ 83] = 83
+       3  ,            // [ 84] = T (Nucleotide)
+       3  ,            // [ 85] = U (Nucleotide), shares index 3 with T
+       INVALID_LETTER, // [ 86] = 86
+       INVALID_LETTER, // [ 87] = 87
+       INVALID_LETTER, // [ 88] = 88
+       INVALID_LETTER, // [ 89] = 89
+       INVALID_LETTER, // [ 90] = 90
+       INVALID_LETTER, // [ 91] = 91
+       INVALID_LETTER, // [ 92] = 92
+       INVALID_LETTER, // [ 93] = 93
+       INVALID_LETTER, // [ 94] = 94
+       INVALID_LETTER, // [ 95] = 95
+       INVALID_LETTER, // [ 96] = 96
+       0  ,            // [ 97] = a (Nucleotide)
+       INVALID_LETTER, // [ 98] = 98
+       1  ,            // [ 99] = c (Nucleotide)
+       INVALID_LETTER, // [100] = 100
+       INVALID_LETTER, // [101] = 101
+       INVALID_LETTER, // [102] = 102
+       2  ,            // [103] = g (Nucleotide)
+       INVALID_LETTER, // [104] = 104
+       INVALID_LETTER, // [105] = 105
+       INVALID_LETTER, // [106] = 106
+       INVALID_LETTER, // [107] = 107
+       INVALID_LETTER, // [108] = 108
+       INVALID_LETTER, // [109] = 109
+       INVALID_LETTER, // [110] = 110
+       INVALID_LETTER, // [111] = 111
+       INVALID_LETTER, // [112] = 112
+       INVALID_LETTER, // [113] = 113
+       INVALID_LETTER, // [114] = 114
+       INVALID_LETTER, // [115] = 115
+       3  ,            // [116] = t (Nucleotide)
+       3  ,            // [117] = u (Nucleotide), shares index 3 with t
+       INVALID_LETTER, // [118] = 118
+       INVALID_LETTER, // [119] = 119
+       INVALID_LETTER, // [120] = 120
+       INVALID_LETTER, // [121] = 121
+       INVALID_LETTER, // [122] = 122
+       INVALID_LETTER, // [123] = 123
+       INVALID_LETTER, // [124] = 124
+       INVALID_LETTER, // [125] = 125
+       INVALID_LETTER, // [126] = 126
+       INVALID_LETTER, // [127] = 0x7f
+       INVALID_LETTER, // [128] = 0x80
+       INVALID_LETTER, // [129] = 0x81
+       INVALID_LETTER, // [130] = 0x82
+       INVALID_LETTER, // [131] = 0x83
+       INVALID_LETTER, // [132] = 0x84
+       INVALID_LETTER, // [133] = 0x85
+       INVALID_LETTER, // [134] = 0x86
+       INVALID_LETTER, // [135] = 0x87
+       INVALID_LETTER, // [136] = 0x88
+       INVALID_LETTER, // [137] = 0x89
+       INVALID_LETTER, // [138] = 0x8a
+       INVALID_LETTER, // [139] = 0x8b
+       INVALID_LETTER, // [140] = 0x8c
+       INVALID_LETTER, // [141] = 0x8d
+       INVALID_LETTER, // [142] = 0x8e
+       INVALID_LETTER, // [143] = 0x8f
+       INVALID_LETTER, // [144] = 0x90
+       INVALID_LETTER, // [145] = 0x91
+       INVALID_LETTER, // [146] = 0x92
+       INVALID_LETTER, // [147] = 0x93
+       INVALID_LETTER, // [148] = 0x94
+       INVALID_LETTER, // [149] = 0x95
+       INVALID_LETTER, // [150] = 0x96
+       INVALID_LETTER, // [151] = 0x97
+       INVALID_LETTER, // [152] = 0x98
+       INVALID_LETTER, // [153] = 0x99
+       INVALID_LETTER, // [154] = 0x9a
+       INVALID_LETTER, // [155] = 0x9b
+       INVALID_LETTER, // [156] = 0x9c
+       INVALID_LETTER, // [157] = 0x9d
+       INVALID_LETTER, // [158] = 0x9e
+       INVALID_LETTER, // [159] = 0x9f
+       INVALID_LETTER, // [160] = 0xa0
+       INVALID_LETTER, // [161] = 0xa1
+       INVALID_LETTER, // [162] = 0xa2
+       INVALID_LETTER, // [163] = 0xa3
+       INVALID_LETTER, // [164] = 0xa4
+       INVALID_LETTER, // [165] = 0xa5
+       INVALID_LETTER, // [166] = 0xa6
+       INVALID_LETTER, // [167] = 0xa7
+       INVALID_LETTER, // [168] = 0xa8
+       INVALID_LETTER, // [169] = 0xa9
+       INVALID_LETTER, // [170] = 0xaa
+       INVALID_LETTER, // [171] = 0xab
+       INVALID_LETTER, // [172] = 0xac
+       INVALID_LETTER, // [173] = 0xad
+       INVALID_LETTER, // [174] = 0xae
+       INVALID_LETTER, // [175] = 0xaf
+       INVALID_LETTER, // [176] = 0xb0
+       INVALID_LETTER, // [177] = 0xb1
+       INVALID_LETTER, // [178] = 0xb2
+       INVALID_LETTER, // [179] = 0xb3
+       INVALID_LETTER, // [180] = 0xb4
+       INVALID_LETTER, // [181] = 0xb5
+       INVALID_LETTER, // [182] = 0xb6
+       INVALID_LETTER, // [183] = 0xb7
+       INVALID_LETTER, // [184] = 0xb8
+       INVALID_LETTER, // [185] = 0xb9
+       INVALID_LETTER, // [186] = 0xba
+       INVALID_LETTER, // [187] = 0xbb
+       INVALID_LETTER, // [188] = 0xbc
+       INVALID_LETTER, // [189] = 0xbd
+       INVALID_LETTER, // [190] = 0xbe
+       INVALID_LETTER, // [191] = 0xbf
+       INVALID_LETTER, // [192] = 0xc0
+       INVALID_LETTER, // [193] = 0xc1
+       INVALID_LETTER, // [194] = 0xc2
+       INVALID_LETTER, // [195] = 0xc3
+       INVALID_LETTER, // [196] = 0xc4
+       INVALID_LETTER, // [197] = 0xc5
+       INVALID_LETTER, // [198] = 0xc6
+       INVALID_LETTER, // [199] = 0xc7
+       INVALID_LETTER, // [200] = 0xc8
+       INVALID_LETTER, // [201] = 0xc9
+       INVALID_LETTER, // [202] = 0xca
+       INVALID_LETTER, // [203] = 0xcb
+       INVALID_LETTER, // [204] = 0xcc
+       INVALID_LETTER, // [205] = 0xcd
+       INVALID_LETTER, // [206] = 0xce
+       INVALID_LETTER, // [207] = 0xcf
+       INVALID_LETTER, // [208] = 0xd0
+       INVALID_LETTER, // [209] = 0xd1
+       INVALID_LETTER, // [210] = 0xd2
+       INVALID_LETTER, // [211] = 0xd3
+       INVALID_LETTER, // [212] = 0xd4
+       INVALID_LETTER, // [213] = 0xd5
+       INVALID_LETTER, // [214] = 0xd6
+       INVALID_LETTER, // [215] = 0xd7
+       INVALID_LETTER, // [216] = 0xd8
+       INVALID_LETTER, // [217] = 0xd9
+       INVALID_LETTER, // [218] = 0xda
+       INVALID_LETTER, // [219] = 0xdb
+       INVALID_LETTER, // [220] = 0xdc
+       INVALID_LETTER, // [221] = 0xdd
+       INVALID_LETTER, // [222] = 0xde
+       INVALID_LETTER, // [223] = 0xdf
+       INVALID_LETTER, // [224] = 0xe0
+       INVALID_LETTER, // [225] = 0xe1
+       INVALID_LETTER, // [226] = 0xe2
+       INVALID_LETTER, // [227] = 0xe3
+       INVALID_LETTER, // [228] = 0xe4
+       INVALID_LETTER, // [229] = 0xe5
+       INVALID_LETTER, // [230] = 0xe6
+       INVALID_LETTER, // [231] = 0xe7
+       INVALID_LETTER, // [232] = 0xe8
+       INVALID_LETTER, // [233] = 0xe9
+       INVALID_LETTER, // [234] = 0xea
+       INVALID_LETTER, // [235] = 0xeb
+       INVALID_LETTER, // [236] = 0xec
+       INVALID_LETTER, // [237] = 0xed
+       INVALID_LETTER, // [238] = 0xee
+       INVALID_LETTER, // [239] = 0xef
+       INVALID_LETTER, // [240] = 0xf0
+       INVALID_LETTER, // [241] = 0xf1
+       INVALID_LETTER, // [242] = 0xf2
+       INVALID_LETTER, // [243] = 0xf3
+       INVALID_LETTER, // [244] = 0xf4
+       INVALID_LETTER, // [245] = 0xf5
+       INVALID_LETTER, // [246] = 0xf6
+       INVALID_LETTER, // [247] = 0xf7
+       INVALID_LETTER, // [248] = 0xf8
+       INVALID_LETTER, // [249] = 0xf9
+       INVALID_LETTER, // [250] = 0xfa
+       INVALID_LETTER, // [251] = 0xfb
+       INVALID_LETTER, // [252] = 0xfc
+       INVALID_LETTER, // [253] = 0xfd
+       INVALID_LETTER, // [254] = 0xfe
+       INVALID_LETTER, // [255] = 0xff
+       };
+
+unsigned char g_LetterToCharNucleo[256] =
+       {
+       'A', // [0]
+       'C', // [1]
+       'G', // [2]
+       'T', // [3]
+       INVALID_CHAR, // [4]
+       INVALID_CHAR, // [5]
+       INVALID_CHAR, // [6]
+       INVALID_CHAR, // [7]
+       INVALID_CHAR, // [8]
+       INVALID_CHAR, // [9]
+       INVALID_CHAR, // [10]
+       INVALID_CHAR, // [11]
+       INVALID_CHAR, // [12]
+       INVALID_CHAR, // [13]
+       INVALID_CHAR, // [14]
+       INVALID_CHAR, // [15]
+       INVALID_CHAR, // [16]
+       INVALID_CHAR, // [17]
+       INVALID_CHAR, // [18]
+       INVALID_CHAR, // [19]
+       INVALID_CHAR, // [20]
+       INVALID_CHAR, // [21]
+       INVALID_CHAR, // [22]
+       INVALID_CHAR, // [23]
+       INVALID_CHAR, // [24]
+       INVALID_CHAR, // [25]
+       INVALID_CHAR, // [26]
+       INVALID_CHAR, // [27]
+       INVALID_CHAR, // [28]
+       INVALID_CHAR, // [29]
+       INVALID_CHAR, // [30]
+       INVALID_CHAR, // [31]
+       INVALID_CHAR, // [32]
+       INVALID_CHAR, // [33]
+       INVALID_CHAR, // [34]
+       INVALID_CHAR, // [35]
+       INVALID_CHAR, // [36]
+       INVALID_CHAR, // [37]
+       INVALID_CHAR, // [38]
+       INVALID_CHAR, // [39]
+       INVALID_CHAR, // [40]
+       INVALID_CHAR, // [41]
+       INVALID_CHAR, // [42]
+       INVALID_CHAR, // [43]
+       INVALID_CHAR, // [44]
+       INVALID_CHAR, // [45]
+       INVALID_CHAR, // [46]
+       INVALID_CHAR, // [47]
+       INVALID_CHAR, // [48]
+       INVALID_CHAR, // [49]
+       INVALID_CHAR, // [50]
+       INVALID_CHAR, // [51]
+       INVALID_CHAR, // [52]
+       INVALID_CHAR, // [53]
+       INVALID_CHAR, // [54]
+       INVALID_CHAR, // [55]
+       INVALID_CHAR, // [56]
+       INVALID_CHAR, // [57]
+       INVALID_CHAR, // [58]
+       INVALID_CHAR, // [59]
+       INVALID_CHAR, // [60]
+       INVALID_CHAR, // [61]
+       INVALID_CHAR, // [62]
+       INVALID_CHAR, // [63]
+       INVALID_CHAR, // [64]
+       INVALID_CHAR, // [65]
+       INVALID_CHAR, // [66]
+       INVALID_CHAR, // [67]
+       INVALID_CHAR, // [68]
+       INVALID_CHAR, // [69]
+       INVALID_CHAR, // [70]
+       INVALID_CHAR, // [71]
+       INVALID_CHAR, // [72]
+       INVALID_CHAR, // [73]
+       INVALID_CHAR, // [74]
+       INVALID_CHAR, // [75]
+       INVALID_CHAR, // [76]
+       INVALID_CHAR, // [77]
+       INVALID_CHAR, // [78]
+       INVALID_CHAR, // [79]
+       INVALID_CHAR, // [80]
+       INVALID_CHAR, // [81]
+       INVALID_CHAR, // [82]
+       INVALID_CHAR, // [83]
+       INVALID_CHAR, // [84]
+       INVALID_CHAR, // [85]
+       INVALID_CHAR, // [86]
+       INVALID_CHAR, // [87]
+       INVALID_CHAR, // [88]
+       INVALID_CHAR, // [89]
+       INVALID_CHAR, // [90]
+       INVALID_CHAR, // [91]
+       INVALID_CHAR, // [92]
+       INVALID_CHAR, // [93]
+       INVALID_CHAR, // [94]
+       INVALID_CHAR, // [95]
+       INVALID_CHAR, // [96]
+       INVALID_CHAR, // [97]
+       INVALID_CHAR, // [98]
+       INVALID_CHAR, // [99]
+       INVALID_CHAR, // [100]
+       INVALID_CHAR, // [101]
+       INVALID_CHAR, // [102]
+       INVALID_CHAR, // [103]
+       INVALID_CHAR, // [104]
+       INVALID_CHAR, // [105]
+       INVALID_CHAR, // [106]
+       INVALID_CHAR, // [107]
+       INVALID_CHAR, // [108]
+       INVALID_CHAR, // [109]
+       INVALID_CHAR, // [110]
+       INVALID_CHAR, // [111]
+       INVALID_CHAR, // [112]
+       INVALID_CHAR, // [113]
+       INVALID_CHAR, // [114]
+       INVALID_CHAR, // [115]
+       INVALID_CHAR, // [116]
+       INVALID_CHAR, // [117]
+       INVALID_CHAR, // [118]
+       INVALID_CHAR, // [119]
+       INVALID_CHAR, // [120]
+       INVALID_CHAR, // [121]
+       INVALID_CHAR, // [122]
+       INVALID_CHAR, // [123]
+       INVALID_CHAR, // [124]
+       INVALID_CHAR, // [125]
+       INVALID_CHAR, // [126]
+       INVALID_CHAR, // [127]
+       INVALID_CHAR, // [128]
+       INVALID_CHAR, // [129]
+       INVALID_CHAR, // [130]
+       INVALID_CHAR, // [131]
+       INVALID_CHAR, // [132]
+       INVALID_CHAR, // [133]
+       INVALID_CHAR, // [134]
+       INVALID_CHAR, // [135]
+       INVALID_CHAR, // [136]
+       INVALID_CHAR, // [137]
+       INVALID_CHAR, // [138]
+       INVALID_CHAR, // [139]
+       INVALID_CHAR, // [140]
+       INVALID_CHAR, // [141]
+       INVALID_CHAR, // [142]
+       INVALID_CHAR, // [143]
+       INVALID_CHAR, // [144]
+       INVALID_CHAR, // [145]
+       INVALID_CHAR, // [146]
+       INVALID_CHAR, // [147]
+       INVALID_CHAR, // [148]
+       INVALID_CHAR, // [149]
+       INVALID_CHAR, // [150]
+       INVALID_CHAR, // [151]
+       INVALID_CHAR, // [152]
+       INVALID_CHAR, // [153]
+       INVALID_CHAR, // [154]
+       INVALID_CHAR, // [155]
+       INVALID_CHAR, // [156]
+       INVALID_CHAR, // [157]
+       INVALID_CHAR, // [158]
+       INVALID_CHAR, // [159]
+       INVALID_CHAR, // [160]
+       INVALID_CHAR, // [161]
+       INVALID_CHAR, // [162]
+       INVALID_CHAR, // [163]
+       INVALID_CHAR, // [164]
+       INVALID_CHAR, // [165]
+       INVALID_CHAR, // [166]
+       INVALID_CHAR, // [167]
+       INVALID_CHAR, // [168]
+       INVALID_CHAR, // [169]
+       INVALID_CHAR, // [170]
+       INVALID_CHAR, // [171]
+       INVALID_CHAR, // [172]
+       INVALID_CHAR, // [173]
+       INVALID_CHAR, // [174]
+       INVALID_CHAR, // [175]
+       INVALID_CHAR, // [176]
+       INVALID_CHAR, // [177]
+       INVALID_CHAR, // [178]
+       INVALID_CHAR, // [179]
+       INVALID_CHAR, // [180]
+       INVALID_CHAR, // [181]
+       INVALID_CHAR, // [182]
+       INVALID_CHAR, // [183]
+       INVALID_CHAR, // [184]
+       INVALID_CHAR, // [185]
+       INVALID_CHAR, // [186]
+       INVALID_CHAR, // [187]
+       INVALID_CHAR, // [188]
+       INVALID_CHAR, // [189]
+       INVALID_CHAR, // [190]
+       INVALID_CHAR, // [191]
+       INVALID_CHAR, // [192]
+       INVALID_CHAR, // [193]
+       INVALID_CHAR, // [194]
+       INVALID_CHAR, // [195]
+       INVALID_CHAR, // [196]
+       INVALID_CHAR, // [197]
+       INVALID_CHAR, // [198]
+       INVALID_CHAR, // [199]
+       INVALID_CHAR, // [200]
+       INVALID_CHAR, // [201]
+       INVALID_CHAR, // [202]
+       INVALID_CHAR, // [203]
+       INVALID_CHAR, // [204]
+       INVALID_CHAR, // [205]
+       INVALID_CHAR, // [206]
+       INVALID_CHAR, // [207]
+       INVALID_CHAR, // [208]
+       INVALID_CHAR, // [209]
+       INVALID_CHAR, // [210]
+       INVALID_CHAR, // [211]
+       INVALID_CHAR, // [212]
+       INVALID_CHAR, // [213]
+       INVALID_CHAR, // [214]
+       INVALID_CHAR, // [215]
+       INVALID_CHAR, // [216]
+       INVALID_CHAR, // [217]
+       INVALID_CHAR, // [218]
+       INVALID_CHAR, // [219]
+       INVALID_CHAR, // [220]
+       INVALID_CHAR, // [221]
+       INVALID_CHAR, // [222]
+       INVALID_CHAR, // [223]
+       INVALID_CHAR, // [224]
+       INVALID_CHAR, // [225]
+       INVALID_CHAR, // [226]
+       INVALID_CHAR, // [227]
+       INVALID_CHAR, // [228]
+       INVALID_CHAR, // [229]
+       INVALID_CHAR, // [230]
+       INVALID_CHAR, // [231]
+       INVALID_CHAR, // [232]
+       INVALID_CHAR, // [233]
+       INVALID_CHAR, // [234]
+       INVALID_CHAR, // [235]
+       INVALID_CHAR, // [236]
+       INVALID_CHAR, // [237]
+       INVALID_CHAR, // [238]
+       INVALID_CHAR, // [239]
+       INVALID_CHAR, // [240]
+       INVALID_CHAR, // [241]
+       INVALID_CHAR, // [242]
+       INVALID_CHAR, // [243]
+       INVALID_CHAR, // [244]
+       INVALID_CHAR, // [245]
+       INVALID_CHAR, // [246]
+       INVALID_CHAR, // [247]
+       INVALID_CHAR, // [248]
+       INVALID_CHAR, // [249]
+       INVALID_CHAR, // [250]
+       INVALID_CHAR, // [251]
+       INVALID_CHAR, // [252]
+       INVALID_CHAR, // [253]
+       INVALID_CHAR, // [254]
+       INVALID_CHAR, // [255]
+       };
+
+unsigned g_CodonWordToAminoLetter[4*4*4] = // Codon word (0..63) -> numeric amino-acid letter code; one entry per codon.
+       { // Index = 16*b1 + 4*b2 + b3 with bases coded A=0, C=1, G=2, T=3 (see the codon shown next to each entry).
+       8 , // [ 0] = AAA K (Lys)
+       11, // [ 1] = AAC N (Asn)
+       8 , // [ 2] = AAG K (Lys)
+       11, // [ 3] = AAT N (Asn)
+       16, // [ 4] = ACA T (Thr)
+       16, // [ 5] = ACC T (Thr)
+       16, // [ 6] = ACG T (Thr)
+       16, // [ 7] = ACT T (Thr)
+       14, // [ 8] = AGA R (Arg)
+       15, // [ 9] = AGC S (Ser)
+       14, // [10] = AGG R (Arg)
+       15, // [11] = AGT S (Ser)
+       7 , // [12] = ATA I (Ile)
+       7 , // [13] = ATC I (Ile)
+       10, // [14] = ATG M (Met)
+       7 , // [15] = ATT I (Ile)
+       13, // [16] = CAA Q (Gln)
+       6 , // [17] = CAC H (His)
+       13, // [18] = CAG Q (Gln)
+       6 , // [19] = CAT H (His)
+       12, // [20] = CCA P (Pro)
+       12, // [21] = CCC P (Pro)
+       12, // [22] = CCG P (Pro)
+       12, // [23] = CCT P (Pro)
+       14, // [24] = CGA R (Arg)
+       14, // [25] = CGC R (Arg)
+       14, // [26] = CGG R (Arg)
+       14, // [27] = CGT R (Arg)
+       9 , // [28] = CTA L (Leu)
+       9 , // [29] = CTC L (Leu)
+       9 , // [30] = CTG L (Leu)
+       9 , // [31] = CTT L (Leu)
+       3 , // [32] = GAA E (Glu)
+       2 , // [33] = GAC D (Asp)
+       3 , // [34] = GAG E (Glu)
+       2 , // [35] = GAT D (Asp)
+       0 , // [36] = GCA A (Ala)
+       0 , // [37] = GCC A (Ala)
+       0 , // [38] = GCG A (Ala)
+       0 , // [39] = GCT A (Ala)
+       5 , // [40] = GGA G (Gly)
+       5 , // [41] = GGC G (Gly)
+       5 , // [42] = GGG G (Gly)
+       5 , // [43] = GGT G (Gly)
+       17, // [44] = GTA V (Val)
+       17, // [45] = GTC V (Val)
+       17, // [46] = GTG V (Val)
+       17, // [47] = GTT V (Val)
+       20, // [48] = TAA * (STP)
+       19, // [49] = TAC Y (Tyr)
+       20, // [50] = TAG * (STP)
+       19, // [51] = TAT Y (Tyr)
+       15, // [52] = TCA S (Ser)
+       15, // [53] = TCC S (Ser)
+       15, // [54] = TCG S (Ser)
+       15, // [55] = TCT S (Ser)
+       20, // [56] = TGA * (STP)
+       1 , // [57] = TGC C (Cys)
+       18, // [58] = TGG W (Trp)
+       1 , // [59] = TGT C (Cys)
+       9 , // [60] = TTA L (Leu)
+       4 , // [61] = TTC F (Phe)
+       9 , // [62] = TTG L (Leu)
+       4 , // [63] = TTT F (Phe)
+       }; // Letter codes used above: A=0 C=1 D=2 E=3 F=4 G=5 H=6 I=7 K=8 L=9 M=10 N=11 P=12 Q=13 R=14 S=15 T=16 V=17 W=18 Y=19; 20 marks a stop codon (TAA, TAG, TGA).
+
+char g_CodonWordToAminoChar[4*4*4] = // Codon word (0..63) -> one-letter amino-acid character; '*' marks a stop codon.
+       { // Index = 16*b1 + 4*b2 + b3 with bases coded A=0, C=1, G=2, T=3 (see the codon shown next to each entry).
+       'K', // [ 0] = AAA (Lys)
+       'N', // [ 1] = AAC (Asn)
+       'K', // [ 2] = AAG (Lys)
+       'N', // [ 3] = AAT (Asn)
+       'T', // [ 4] = ACA (Thr)
+       'T', // [ 5] = ACC (Thr)
+       'T', // [ 6] = ACG (Thr)
+       'T', // [ 7] = ACT (Thr)
+       'R', // [ 8] = AGA (Arg)
+       'S', // [ 9] = AGC (Ser)
+       'R', // [10] = AGG (Arg)
+       'S', // [11] = AGT (Ser)
+       'I', // [12] = ATA (Ile)
+       'I', // [13] = ATC (Ile)
+       'M', // [14] = ATG (Met)
+       'I', // [15] = ATT (Ile)
+       'Q', // [16] = CAA (Gln)
+       'H', // [17] = CAC (His)
+       'Q', // [18] = CAG (Gln)
+       'H', // [19] = CAT (His)
+       'P', // [20] = CCA (Pro)
+       'P', // [21] = CCC (Pro)
+       'P', // [22] = CCG (Pro)
+       'P', // [23] = CCT (Pro)
+       'R', // [24] = CGA (Arg)
+       'R', // [25] = CGC (Arg)
+       'R', // [26] = CGG (Arg)
+       'R', // [27] = CGT (Arg)
+       'L', // [28] = CTA (Leu)
+       'L', // [29] = CTC (Leu)
+       'L', // [30] = CTG (Leu)
+       'L', // [31] = CTT (Leu)
+       'E', // [32] = GAA (Glu)
+       'D', // [33] = GAC (Asp)
+       'E', // [34] = GAG (Glu)
+       'D', // [35] = GAT (Asp)
+       'A', // [36] = GCA (Ala)
+       'A', // [37] = GCC (Ala)
+       'A', // [38] = GCG (Ala)
+       'A', // [39] = GCT (Ala)
+       'G', // [40] = GGA (Gly)
+       'G', // [41] = GGC (Gly)
+       'G', // [42] = GGG (Gly)
+       'G', // [43] = GGT (Gly)
+       'V', // [44] = GTA (Val)
+       'V', // [45] = GTC (Val)
+       'V', // [46] = GTG (Val)
+       'V', // [47] = GTT (Val)
+       '*', // [48] = TAA (STP)
+       'Y', // [49] = TAC (Tyr)
+       '*', // [50] = TAG (STP)
+       'Y', // [51] = TAT (Tyr)
+       'S', // [52] = TCA (Ser)
+       'S', // [53] = TCC (Ser)
+       'S', // [54] = TCG (Ser)
+       'S', // [55] = TCT (Ser)
+       '*', // [56] = TGA (STP)
+       'C', // [57] = TGC (Cys)
+       'W', // [58] = TGG (Trp)
+       'C', // [59] = TGT (Cys)
+       'L', // [60] = TTA (Leu)
+       'F', // [61] = TTC (Phe)
+       'L', // [62] = TTG (Leu)
+       'F', // [63] = TTT (Phe)
+       }; // Character counterpart of the numeric codon table: every entry is an upper-case letter or '*'.
+
+unsigned char g_CharToCompChar[256] = // Byte value -> complementary nucleotide character.
+       { // A<->T and C<->G; U/u complement to 'A' like T/t; lower-case input yields an upper-case result. Every other byte maps to INVALID_CHAR (defined outside this table).
+       INVALID_CHAR, // [  0]
+       INVALID_CHAR, // [  1]
+       INVALID_CHAR, // [  2]
+       INVALID_CHAR, // [  3]
+       INVALID_CHAR, // [  4]
+       INVALID_CHAR, // [  5]
+       INVALID_CHAR, // [  6]
+       INVALID_CHAR, // [  7]
+       INVALID_CHAR, // [  8]
+       INVALID_CHAR, // [  9]
+       INVALID_CHAR, // [ 10]
+       INVALID_CHAR, // [ 11]
+       INVALID_CHAR, // [ 12]
+       INVALID_CHAR, // [ 13]
+       INVALID_CHAR, // [ 14]
+       INVALID_CHAR, // [ 15]
+       INVALID_CHAR, // [ 16]
+       INVALID_CHAR, // [ 17]
+       INVALID_CHAR, // [ 18]
+       INVALID_CHAR, // [ 19]
+       INVALID_CHAR, // [ 20]
+       INVALID_CHAR, // [ 21]
+       INVALID_CHAR, // [ 22]
+       INVALID_CHAR, // [ 23]
+       INVALID_CHAR, // [ 24]
+       INVALID_CHAR, // [ 25]
+       INVALID_CHAR, // [ 26]
+       INVALID_CHAR, // [ 27]
+       INVALID_CHAR, // [ 28]
+       INVALID_CHAR, // [ 29]
+       INVALID_CHAR, // [ 30]
+       INVALID_CHAR, // [ 31]
+       INVALID_CHAR, // [ 32]
+       INVALID_CHAR, // [ 33]
+       INVALID_CHAR, // [ 34]
+       INVALID_CHAR, // [ 35]
+       INVALID_CHAR, // [ 36]
+       INVALID_CHAR, // [ 37]
+       INVALID_CHAR, // [ 38]
+       INVALID_CHAR, // [ 39]
+       INVALID_CHAR, // [ 40]
+       INVALID_CHAR, // [ 41]
+       INVALID_CHAR, // [ 42]
+       INVALID_CHAR, // [ 43]
+       INVALID_CHAR, // [ 44]
+       INVALID_CHAR, // [ 45]
+       INVALID_CHAR, // [ 46]
+       INVALID_CHAR, // [ 47]
+       INVALID_CHAR, // [ 48]
+       INVALID_CHAR, // [ 49]
+       INVALID_CHAR, // [ 50]
+       INVALID_CHAR, // [ 51]
+       INVALID_CHAR, // [ 52]
+       INVALID_CHAR, // [ 53]
+       INVALID_CHAR, // [ 54]
+       INVALID_CHAR, // [ 55]
+       INVALID_CHAR, // [ 56]
+       INVALID_CHAR, // [ 57]
+       INVALID_CHAR, // [ 58]
+       INVALID_CHAR, // [ 59]
+       INVALID_CHAR, // [ 60]
+       INVALID_CHAR, // [ 61]
+       INVALID_CHAR, // [ 62]
+       INVALID_CHAR, // [ 63]
+       INVALID_CHAR, // [ 64]
+       'T',          // [ 65] A -> T
+       INVALID_CHAR, // [ 66]
+       'G',          // [ 67] C -> G
+       INVALID_CHAR, // [ 68]
+       INVALID_CHAR, // [ 69]
+       INVALID_CHAR, // [ 70]
+       'C',          // [ 71] G -> C
+       INVALID_CHAR, // [ 72]
+       INVALID_CHAR, // [ 73]
+       INVALID_CHAR, // [ 74]
+       INVALID_CHAR, // [ 75]
+       INVALID_CHAR, // [ 76]
+       INVALID_CHAR, // [ 77]
+       INVALID_CHAR, // [ 78]
+       INVALID_CHAR, // [ 79]
+       INVALID_CHAR, // [ 80]
+       INVALID_CHAR, // [ 81]
+       INVALID_CHAR, // [ 82]
+       INVALID_CHAR, // [ 83]
+       'A',          // [ 84] T -> A
+       'A',          // [ 85] U -> A
+       INVALID_CHAR, // [ 86]
+       INVALID_CHAR, // [ 87]
+       INVALID_CHAR, // [ 88]
+       INVALID_CHAR, // [ 89]
+       INVALID_CHAR, // [ 90]
+       INVALID_CHAR, // [ 91]
+       INVALID_CHAR, // [ 92]
+       INVALID_CHAR, // [ 93]
+       INVALID_CHAR, // [ 94]
+       INVALID_CHAR, // [ 95]
+       INVALID_CHAR, // [ 96]
+       'T',          // [ 97] a -> T
+       INVALID_CHAR, // [ 98]
+       'G',          // [ 99] c -> G
+       INVALID_CHAR, // [100]
+       INVALID_CHAR, // [101]
+       INVALID_CHAR, // [102]
+       'C',          // [103] g -> C
+       INVALID_CHAR, // [104]
+       INVALID_CHAR, // [105]
+       INVALID_CHAR, // [106]
+       INVALID_CHAR, // [107]
+       INVALID_CHAR, // [108]
+       INVALID_CHAR, // [109]
+       INVALID_CHAR, // [110]
+       INVALID_CHAR, // [111]
+       INVALID_CHAR, // [112]
+       INVALID_CHAR, // [113]
+       INVALID_CHAR, // [114]
+       INVALID_CHAR, // [115]
+       'A',          // [116] t -> A
+       'A',          // [117] u -> A
+       INVALID_CHAR, // [118]
+       INVALID_CHAR, // [119]
+       INVALID_CHAR, // [120]
+       INVALID_CHAR, // [121]
+       INVALID_CHAR, // [122]
+       INVALID_CHAR, // [123]
+       INVALID_CHAR, // [124]
+       INVALID_CHAR, // [125]
+       INVALID_CHAR, // [126]
+       INVALID_CHAR, // [127]
+       INVALID_CHAR, // [128]
+       INVALID_CHAR, // [129]
+       INVALID_CHAR, // [130]
+       INVALID_CHAR, // [131]
+       INVALID_CHAR, // [132]
+       INVALID_CHAR, // [133]
+       INVALID_CHAR, // [134]
+       INVALID_CHAR, // [135]
+       INVALID_CHAR, // [136]
+       INVALID_CHAR, // [137]
+       INVALID_CHAR, // [138]
+       INVALID_CHAR, // [139]
+       INVALID_CHAR, // [140]
+       INVALID_CHAR, // [141]
+       INVALID_CHAR, // [142]
+       INVALID_CHAR, // [143]
+       INVALID_CHAR, // [144]
+       INVALID_CHAR, // [145]
+       INVALID_CHAR, // [146]
+       INVALID_CHAR, // [147]
+       INVALID_CHAR, // [148]
+       INVALID_CHAR, // [149]
+       INVALID_CHAR, // [150]
+       INVALID_CHAR, // [151]
+       INVALID_CHAR, // [152]
+       INVALID_CHAR, // [153]
+       INVALID_CHAR, // [154]
+       INVALID_CHAR, // [155]
+       INVALID_CHAR, // [156]
+       INVALID_CHAR, // [157]
+       INVALID_CHAR, // [158]
+       INVALID_CHAR, // [159]
+       INVALID_CHAR, // [160]
+       INVALID_CHAR, // [161]
+       INVALID_CHAR, // [162]
+       INVALID_CHAR, // [163]
+       INVALID_CHAR, // [164]
+       INVALID_CHAR, // [165]
+       INVALID_CHAR, // [166]
+       INVALID_CHAR, // [167]
+       INVALID_CHAR, // [168]
+       INVALID_CHAR, // [169]
+       INVALID_CHAR, // [170]
+       INVALID_CHAR, // [171]
+       INVALID_CHAR, // [172]
+       INVALID_CHAR, // [173]
+       INVALID_CHAR, // [174]
+       INVALID_CHAR, // [175]
+       INVALID_CHAR, // [176]
+       INVALID_CHAR, // [177]
+       INVALID_CHAR, // [178]
+       INVALID_CHAR, // [179]
+       INVALID_CHAR, // [180]
+       INVALID_CHAR, // [181]
+       INVALID_CHAR, // [182]
+       INVALID_CHAR, // [183]
+       INVALID_CHAR, // [184]
+       INVALID_CHAR, // [185]
+       INVALID_CHAR, // [186]
+       INVALID_CHAR, // [187]
+       INVALID_CHAR, // [188]
+       INVALID_CHAR, // [189]
+       INVALID_CHAR, // [190]
+       INVALID_CHAR, // [191]
+       INVALID_CHAR, // [192]
+       INVALID_CHAR, // [193]
+       INVALID_CHAR, // [194]
+       INVALID_CHAR, // [195]
+       INVALID_CHAR, // [196]
+       INVALID_CHAR, // [197]
+       INVALID_CHAR, // [198]
+       INVALID_CHAR, // [199]
+       INVALID_CHAR, // [200]
+       INVALID_CHAR, // [201]
+       INVALID_CHAR, // [202]
+       INVALID_CHAR, // [203]
+       INVALID_CHAR, // [204]
+       INVALID_CHAR, // [205]
+       INVALID_CHAR, // [206]
+       INVALID_CHAR, // [207]
+       INVALID_CHAR, // [208]
+       INVALID_CHAR, // [209]
+       INVALID_CHAR, // [210]
+       INVALID_CHAR, // [211]
+       INVALID_CHAR, // [212]
+       INVALID_CHAR, // [213]
+       INVALID_CHAR, // [214]
+       INVALID_CHAR, // [215]
+       INVALID_CHAR, // [216]
+       INVALID_CHAR, // [217]
+       INVALID_CHAR, // [218]
+       INVALID_CHAR, // [219]
+       INVALID_CHAR, // [220]
+       INVALID_CHAR, // [221]
+       INVALID_CHAR, // [222]
+       INVALID_CHAR, // [223]
+       INVALID_CHAR, // [224]
+       INVALID_CHAR, // [225]
+       INVALID_CHAR, // [226]
+       INVALID_CHAR, // [227]
+       INVALID_CHAR, // [228]
+       INVALID_CHAR, // [229]
+       INVALID_CHAR, // [230]
+       INVALID_CHAR, // [231]
+       INVALID_CHAR, // [232]
+       INVALID_CHAR, // [233]
+       INVALID_CHAR, // [234]
+       INVALID_CHAR, // [235]
+       INVALID_CHAR, // [236]
+       INVALID_CHAR, // [237]
+       INVALID_CHAR, // [238]
+       INVALID_CHAR, // [239]
+       INVALID_CHAR, // [240]
+       INVALID_CHAR, // [241]
+       INVALID_CHAR, // [242]
+       INVALID_CHAR, // [243]
+       INVALID_CHAR, // [244]
+       INVALID_CHAR, // [245]
+       INVALID_CHAR, // [246]
+       INVALID_CHAR, // [247]
+       INVALID_CHAR, // [248]
+       INVALID_CHAR, // [249]
+       INVALID_CHAR, // [250]
+       INVALID_CHAR, // [251]
+       INVALID_CHAR, // [252]
+       INVALID_CHAR, // [253]
+       INVALID_CHAR, // [254]
+       INVALID_CHAR, // [255]
+}; // Only ten bytes are valid inputs: A, C, G, T, U and their lower-case forms.
+
+unsigned g_CharToCompLetter[256] = // Byte value -> numeric letter code of the complementary nucleotide.
+       { // Per the entry labels the codes are A=0, C=1, G=2, T=3, so A/a -> 3, C/c -> 2, G/g -> 1, T/t/U/u -> 0. Every other byte maps to INVALID_LETTER (defined outside this table).
+       INVALID_LETTER, // [  0]
+       INVALID_LETTER, // [  1]
+       INVALID_LETTER, // [  2]
+       INVALID_LETTER, // [  3]
+       INVALID_LETTER, // [  4]
+       INVALID_LETTER, // [  5]
+       INVALID_LETTER, // [  6]
+       INVALID_LETTER, // [  7]
+       INVALID_LETTER, // [  8]
+       INVALID_LETTER, // [  9]
+       INVALID_LETTER, // [ 10]
+       INVALID_LETTER, // [ 11]
+       INVALID_LETTER, // [ 12]
+       INVALID_LETTER, // [ 13]
+       INVALID_LETTER, // [ 14]
+       INVALID_LETTER, // [ 15]
+       INVALID_LETTER, // [ 16]
+       INVALID_LETTER, // [ 17]
+       INVALID_LETTER, // [ 18]
+       INVALID_LETTER, // [ 19]
+       INVALID_LETTER, // [ 20]
+       INVALID_LETTER, // [ 21]
+       INVALID_LETTER, // [ 22]
+       INVALID_LETTER, // [ 23]
+       INVALID_LETTER, // [ 24]
+       INVALID_LETTER, // [ 25]
+       INVALID_LETTER, // [ 26]
+       INVALID_LETTER, // [ 27]
+       INVALID_LETTER, // [ 28]
+       INVALID_LETTER, // [ 29]
+       INVALID_LETTER, // [ 30]
+       INVALID_LETTER, // [ 31]
+       INVALID_LETTER, // [ 32]
+       INVALID_LETTER, // [ 33]
+       INVALID_LETTER, // [ 34]
+       INVALID_LETTER, // [ 35]
+       INVALID_LETTER, // [ 36]
+       INVALID_LETTER, // [ 37]
+       INVALID_LETTER, // [ 38]
+       INVALID_LETTER, // [ 39]
+       INVALID_LETTER, // [ 40]
+       INVALID_LETTER, // [ 41]
+       INVALID_LETTER, // [ 42]
+       INVALID_LETTER, // [ 43]
+       INVALID_LETTER, // [ 44]
+       INVALID_LETTER, // [ 45]
+       INVALID_LETTER, // [ 46]
+       INVALID_LETTER, // [ 47]
+       INVALID_LETTER, // [ 48]
+       INVALID_LETTER, // [ 49]
+       INVALID_LETTER, // [ 50]
+       INVALID_LETTER, // [ 51]
+       INVALID_LETTER, // [ 52]
+       INVALID_LETTER, // [ 53]
+       INVALID_LETTER, // [ 54]
+       INVALID_LETTER, // [ 55]
+       INVALID_LETTER, // [ 56]
+       INVALID_LETTER, // [ 57]
+       INVALID_LETTER, // [ 58]
+       INVALID_LETTER, // [ 59]
+       INVALID_LETTER, // [ 60]
+       INVALID_LETTER, // [ 61]
+       INVALID_LETTER, // [ 62]
+       INVALID_LETTER, // [ 63]
+       INVALID_LETTER, // [ 64]
+       3,              // [ 65] A -> T
+       INVALID_LETTER, // [ 66]
+       2,              // [ 67] C -> G
+       INVALID_LETTER, // [ 68]
+       INVALID_LETTER, // [ 69]
+       INVALID_LETTER, // [ 70]
+       1,              // [ 71] G -> C
+       INVALID_LETTER, // [ 72]
+       INVALID_LETTER, // [ 73]
+       INVALID_LETTER, // [ 74]
+       INVALID_LETTER, // [ 75]
+       INVALID_LETTER, // [ 76]
+       INVALID_LETTER, // [ 77]
+       INVALID_LETTER, // [ 78]
+       INVALID_LETTER, // [ 79]
+       INVALID_LETTER, // [ 80]
+       INVALID_LETTER, // [ 81]
+       INVALID_LETTER, // [ 82]
+       INVALID_LETTER, // [ 83]
+       0,              // [ 84] T -> A
+       0,              // [ 85] U -> A
+       INVALID_LETTER, // [ 86]
+       INVALID_LETTER, // [ 87]
+       INVALID_LETTER, // [ 88]
+       INVALID_LETTER, // [ 89]
+       INVALID_LETTER, // [ 90]
+       INVALID_LETTER, // [ 91]
+       INVALID_LETTER, // [ 92]
+       INVALID_LETTER, // [ 93]
+       INVALID_LETTER, // [ 94]
+       INVALID_LETTER, // [ 95]
+       INVALID_LETTER, // [ 96]
+       3,              // [ 97] a -> T
+       INVALID_LETTER, // [ 98]
+       2,              // [ 99] c -> G
+       INVALID_LETTER, // [100]
+       INVALID_LETTER, // [101]
+       INVALID_LETTER, // [102]
+       1,              // [103] g -> C
+       INVALID_LETTER, // [104]
+       INVALID_LETTER, // [105]
+       INVALID_LETTER, // [106]
+       INVALID_LETTER, // [107]
+       INVALID_LETTER, // [108]
+       INVALID_LETTER, // [109]
+       INVALID_LETTER, // [110]
+       INVALID_LETTER, // [111]
+       INVALID_LETTER, // [112]
+       INVALID_LETTER, // [113]
+       INVALID_LETTER, // [114]
+       INVALID_LETTER, // [115]
+       0,              // [116] t -> A
+       0,              // [117] u -> A
+       INVALID_LETTER, // [118]
+       INVALID_LETTER, // [119]
+       INVALID_LETTER, // [120]
+       INVALID_LETTER, // [121]
+       INVALID_LETTER, // [122]
+       INVALID_LETTER, // [123]
+       INVALID_LETTER, // [124]
+       INVALID_LETTER, // [125]
+       INVALID_LETTER, // [126]
+       INVALID_LETTER, // [127]
+       INVALID_LETTER, // [128]
+       INVALID_LETTER, // [129]
+       INVALID_LETTER, // [130]
+       INVALID_LETTER, // [131]
+       INVALID_LETTER, // [132]
+       INVALID_LETTER, // [133]
+       INVALID_LETTER, // [134]
+       INVALID_LETTER, // [135]
+       INVALID_LETTER, // [136]
+       INVALID_LETTER, // [137]
+       INVALID_LETTER, // [138]
+       INVALID_LETTER, // [139]
+       INVALID_LETTER, // [140]
+       INVALID_LETTER, // [141]
+       INVALID_LETTER, // [142]
+       INVALID_LETTER, // [143]
+       INVALID_LETTER, // [144]
+       INVALID_LETTER, // [145]
+       INVALID_LETTER, // [146]
+       INVALID_LETTER, // [147]
+       INVALID_LETTER, // [148]
+       INVALID_LETTER, // [149]
+       INVALID_LETTER, // [150]
+       INVALID_LETTER, // [151]
+       INVALID_LETTER, // [152]
+       INVALID_LETTER, // [153]
+       INVALID_LETTER, // [154]
+       INVALID_LETTER, // [155]
+       INVALID_LETTER, // [156]
+       INVALID_LETTER, // [157]
+       INVALID_LETTER, // [158]
+       INVALID_LETTER, // [159]
+       INVALID_LETTER, // [160]
+       INVALID_LETTER, // [161]
+       INVALID_LETTER, // [162]
+       INVALID_LETTER, // [163]
+       INVALID_LETTER, // [164]
+       INVALID_LETTER, // [165]
+       INVALID_LETTER, // [166]
+       INVALID_LETTER, // [167]
+       INVALID_LETTER, // [168]
+       INVALID_LETTER, // [169]
+       INVALID_LETTER, // [170]
+       INVALID_LETTER, // [171]
+       INVALID_LETTER, // [172]
+       INVALID_LETTER, // [173]
+       INVALID_LETTER, // [174]
+       INVALID_LETTER, // [175]
+       INVALID_LETTER, // [176]
+       INVALID_LETTER, // [177]
+       INVALID_LETTER, // [178]
+       INVALID_LETTER, // [179]
+       INVALID_LETTER, // [180]
+       INVALID_LETTER, // [181]
+       INVALID_LETTER, // [182]
+       INVALID_LETTER, // [183]
+       INVALID_LETTER, // [184]
+       INVALID_LETTER, // [185]
+       INVALID_LETTER, // [186]
+       INVALID_LETTER, // [187]
+       INVALID_LETTER, // [188]
+       INVALID_LETTER, // [189]
+       INVALID_LETTER, // [190]
+       INVALID_LETTER, // [191]
+       INVALID_LETTER, // [192]
+       INVALID_LETTER, // [193]
+       INVALID_LETTER, // [194]
+       INVALID_LETTER, // [195]
+       INVALID_LETTER, // [196]
+       INVALID_LETTER, // [197]
+       INVALID_LETTER, // [198]
+       INVALID_LETTER, // [199]
+       INVALID_LETTER, // [200]
+       INVALID_LETTER, // [201]
+       INVALID_LETTER, // [202]
+       INVALID_LETTER, // [203]
+       INVALID_LETTER, // [204]
+       INVALID_LETTER, // [205]
+       INVALID_LETTER, // [206]
+       INVALID_LETTER, // [207]
+       INVALID_LETTER, // [208]
+       INVALID_LETTER, // [209]
+       INVALID_LETTER, // [210]
+       INVALID_LETTER, // [211]
+       INVALID_LETTER, // [212]
+       INVALID_LETTER, // [213]
+       INVALID_LETTER, // [214]
+       INVALID_LETTER, // [215]
+       INVALID_LETTER, // [216]
+       INVALID_LETTER, // [217]
+       INVALID_LETTER, // [218]
+       INVALID_LETTER, // [219]
+       INVALID_LETTER, // [220]
+       INVALID_LETTER, // [221]
+       INVALID_LETTER, // [222]
+       INVALID_LETTER, // [223]
+       INVALID_LETTER, // [224]
+       INVALID_LETTER, // [225]
+       INVALID_LETTER, // [226]
+       INVALID_LETTER, // [227]
+       INVALID_LETTER, // [228]
+       INVALID_LETTER, // [229]
+       INVALID_LETTER, // [230]
+       INVALID_LETTER, // [231]
+       INVALID_LETTER, // [232]
+       INVALID_LETTER, // [233]
+       INVALID_LETTER, // [234]
+       INVALID_LETTER, // [235]
+       INVALID_LETTER, // [236]
+       INVALID_LETTER, // [237]
+       INVALID_LETTER, // [238]
+       INVALID_LETTER, // [239]
+       INVALID_LETTER, // [240]
+       INVALID_LETTER, // [241]
+       INVALID_LETTER, // [242]
+       INVALID_LETTER, // [243]
+       INVALID_LETTER, // [244]
+       INVALID_LETTER, // [245]
+       INVALID_LETTER, // [246]
+       INVALID_LETTER, // [247]
+       INVALID_LETTER, // [248]
+       INVALID_LETTER, // [249]
+       INVALID_LETTER, // [250]
+       INVALID_LETTER, // [251]
+       INVALID_LETTER, // [252]
+       INVALID_LETTER, // [253]
+       INVALID_LETTER, // [254]
+       INVALID_LETTER, // [255]
+}; // Only ten bytes are valid inputs: A, C, G, T, U and their lower-case forms.
+
+bool g_IsAminoChar[256] = // Per-byte flag: true when the character is accepted as an amino-acid symbol.
+       { // true for '*' (stop) and the 20 letters ACDEFGHIKLMNPQRSTVWY in either case; false for every other byte, including B, J, O, U, X and Z.
+       false, // [  0] 0x00
+       false, // [  1] 0x01
+       false, // [  2] 0x02
+       false, // [  3] 0x03
+       false, // [  4] 0x04
+       false, // [  5] 0x05
+       false, // [  6] 0x06
+       false, // [  7] 0x07
+       false, // [  8] 0x08
+       false, // [  9] 0x09
+       false, // [ 10] 0x0a
+       false, // [ 11] 0x0b
+       false, // [ 12] 0x0c
+       false, // [ 13] 0x0d
+       false, // [ 14] 0x0e
+       false, // [ 15] 0x0f
+       false, // [ 16] 0x10
+       false, // [ 17] 0x11
+       false, // [ 18] 0x12
+       false, // [ 19] 0x13
+       false, // [ 20] 0x14
+       false, // [ 21] 0x15
+       false, // [ 22] 0x16
+       false, // [ 23] 0x17
+       false, // [ 24] 0x18
+       false, // [ 25] 0x19
+       false, // [ 26] 0x1a
+       false, // [ 27] 0x1b
+       false, // [ 28] 0x1c
+       false, // [ 29] 0x1d
+       false, // [ 30] 0x1e
+       false, // [ 31] 0x1f
+       false, // [ 32] ' '
+       false, // [ 33] '!'
+       false, // [ 34] '"'
+       false, // [ 35] '#'
+       false, // [ 36] '$'
+       false, // [ 37] '%'
+       false, // [ 38] '&'
+       false, // [ 39] '''
+       false, // [ 40] '('
+       false, // [ 41] ')'
+       true,  // [ 42] '*' = STP
+       false, // [ 43] '+'
+       false, // [ 44] ','
+       false, // [ 45] '-'
+       false, // [ 46] '.'
+       false, // [ 47] '/'
+       false, // [ 48] '0'
+       false, // [ 49] '1'
+       false, // [ 50] '2'
+       false, // [ 51] '3'
+       false, // [ 52] '4'
+       false, // [ 53] '5'
+       false, // [ 54] '6'
+       false, // [ 55] '7'
+       false, // [ 56] '8'
+       false, // [ 57] '9'
+       false, // [ 58] ':'
+       false, // [ 59] ';'
+       false, // [ 60] '<'
+       false, // [ 61] '='
+       false, // [ 62] '>'
+       false, // [ 63] '?'
+       false, // [ 64] '@'
+       true,  // [ 65] 'A' = Ala
+       false, // [ 66] 'B'
+       true,  // [ 67] 'C' = Cys
+       true,  // [ 68] 'D' = Asp
+       true,  // [ 69] 'E' = Glu
+       true,  // [ 70] 'F' = Phe
+       true,  // [ 71] 'G' = Gly
+       true,  // [ 72] 'H' = His
+       true,  // [ 73] 'I' = Ile
+       false, // [ 74] 'J'
+       true,  // [ 75] 'K' = Lys
+       true,  // [ 76] 'L' = Leu
+       true,  // [ 77] 'M' = Met
+       true,  // [ 78] 'N' = Asn
+       false, // [ 79] 'O'
+       true,  // [ 80] 'P' = Pro
+       true,  // [ 81] 'Q' = Gln
+       true,  // [ 82] 'R' = Arg
+       true,  // [ 83] 'S' = Ser
+       true,  // [ 84] 'T' = Thr
+       false, // [ 85] 'U'
+       true,  // [ 86] 'V' = Val
+       true,  // [ 87] 'W' = Trp
+       false, // [ 88] 'X'
+       true,  // [ 89] 'Y' = Tyr
+       false, // [ 90] 'Z'
+       false, // [ 91] '['
+       false, // [ 92] '\'
+       false, // [ 93] ']'
+       false, // [ 94] '^'
+       false, // [ 95] '_'
+       false, // [ 96] '`'
+       true,  // [ 97] 'a' = Ala
+       false, // [ 98] 'b'
+       true,  // [ 99] 'c' = Cys
+       true,  // [100] 'd' = Asp
+       true,  // [101] 'e' = Glu
+       true,  // [102] 'f' = Phe
+       true,  // [103] 'g' = Gly
+       true,  // [104] 'h' = His
+       true,  // [105] 'i' = Ile
+       false, // [106] 'j'
+       true,  // [107] 'k' = Lys
+       true,  // [108] 'l' = Leu
+       true,  // [109] 'm' = Met
+       true,  // [110] 'n' = Asn
+       false, // [111] 'o'
+       true,  // [112] 'p' = Pro
+       true,  // [113] 'q' = Gln
+       true,  // [114] 'r' = Arg
+       true,  // [115] 's' = Ser
+       true,  // [116] 't' = Thr
+       false, // [117] 'u'
+       true,  // [118] 'v' = Val
+       true,  // [119] 'w' = Trp
+       false, // [120] 'x'
+       true,  // [121] 'y' = Tyr
+       false, // [122] 'z'
+       false, // [123] '{'
+       false, // [124] '|'
+       false, // [125] '}'
+       false, // [126] '~'
+       false, // [127] 0x7f
+       false, // [128] 0x80
+       false, // [129] 0x81
+       false, // [130] 0x82
+       false, // [131] 0x83
+       false, // [132] 0x84
+       false, // [133] 0x85
+       false, // [134] 0x86
+       false, // [135] 0x87
+       false, // [136] 0x88
+       false, // [137] 0x89
+       false, // [138] 0x8a
+       false, // [139] 0x8b
+       false, // [140] 0x8c
+       false, // [141] 0x8d
+       false, // [142] 0x8e
+       false, // [143] 0x8f
+       false, // [144] 0x90
+       false, // [145] 0x91
+       false, // [146] 0x92
+       false, // [147] 0x93
+       false, // [148] 0x94
+       false, // [149] 0x95
+       false, // [150] 0x96
+       false, // [151] 0x97
+       false, // [152] 0x98
+       false, // [153] 0x99
+       false, // [154] 0x9a
+       false, // [155] 0x9b
+       false, // [156] 0x9c
+       false, // [157] 0x9d
+       false, // [158] 0x9e
+       false, // [159] 0x9f
+       false, // [160] 0xa0
+       false, // [161] 0xa1
+       false, // [162] 0xa2
+       false, // [163] 0xa3
+       false, // [164] 0xa4
+       false, // [165] 0xa5
+       false, // [166] 0xa6
+       false, // [167] 0xa7
+       false, // [168] 0xa8
+       false, // [169] 0xa9
+       false, // [170] 0xaa
+       false, // [171] 0xab
+       false, // [172] 0xac
+       false, // [173] 0xad
+       false, // [174] 0xae
+       false, // [175] 0xaf
+       false, // [176] 0xb0
+       false, // [177] 0xb1
+       false, // [178] 0xb2
+       false, // [179] 0xb3
+       false, // [180] 0xb4
+       false, // [181] 0xb5
+       false, // [182] 0xb6
+       false, // [183] 0xb7
+       false, // [184] 0xb8
+       false, // [185] 0xb9
+       false, // [186] 0xba
+       false, // [187] 0xbb
+       false, // [188] 0xbc
+       false, // [189] 0xbd
+       false, // [190] 0xbe
+       false, // [191] 0xbf
+       false, // [192] 0xc0
+       false, // [193] 0xc1
+       false, // [194] 0xc2
+       false, // [195] 0xc3
+       false, // [196] 0xc4
+       false, // [197] 0xc5
+       false, // [198] 0xc6
+       false, // [199] 0xc7
+       false, // [200] 0xc8
+       false, // [201] 0xc9
+       false, // [202] 0xca
+       false, // [203] 0xcb
+       false, // [204] 0xcc
+       false, // [205] 0xcd
+       false, // [206] 0xce
+       false, // [207] 0xcf
+       false, // [208] 0xd0
+       false, // [209] 0xd1
+       false, // [210] 0xd2
+       false, // [211] 0xd3
+       false, // [212] 0xd4
+       false, // [213] 0xd5
+       false, // [214] 0xd6
+       false, // [215] 0xd7
+       false, // [216] 0xd8
+       false, // [217] 0xd9
+       false, // [218] 0xda
+       false, // [219] 0xdb
+       false, // [220] 0xdc
+       false, // [221] 0xdd
+       false, // [222] 0xde
+       false, // [223] 0xdf
+       false, // [224] 0xe0
+       false, // [225] 0xe1
+       false, // [226] 0xe2
+       false, // [227] 0xe3
+       false, // [228] 0xe4
+       false, // [229] 0xe5
+       false, // [230] 0xe6
+       false, // [231] 0xe7
+       false, // [232] 0xe8
+       false, // [233] 0xe9
+       false, // [234] 0xea
+       false, // [235] 0xeb
+       false, // [236] 0xec
+       false, // [237] 0xed
+       false, // [238] 0xee
+       false, // [239] 0xef
+       false, // [240] 0xf0
+       false, // [241] 0xf1
+       false, // [242] 0xf2
+       false, // [243] 0xf3
+       false, // [244] 0xf4
+       false, // [245] 0xf5
+       false, // [246] 0xf6
+       false, // [247] 0xf7
+       false, // [248] 0xf8
+       false, // [249] 0xf9
+       false, // [250] 0xfa
+       false, // [251] 0xfb
+       false, // [252] 0xfc
+       false, // [253] 0xfd
+       false, // [254] 0xfe
+       false, // [255] 0xff
+       };
+
+// Lookup table indexed by byte value (0..255): true for the characters
+// accepted as nucleotide codes (A, C, G, T, U and N in upper or lower
+// case), false for every other byte.
+bool g_IsNucleoChar[256] =
+       {
+       false, // [  0] 0x00
+       false, // [  1] 0x01
+       false, // [  2] 0x02
+       false, // [  3] 0x03
+       false, // [  4] 0x04
+       false, // [  5] 0x05
+       false, // [  6] 0x06
+       false, // [  7] 0x07
+       false, // [  8] 0x08
+       false, // [  9] 0x09
+       false, // [ 10] 0x0a
+       false, // [ 11] 0x0b
+       false, // [ 12] 0x0c
+       false, // [ 13] 0x0d
+       false, // [ 14] 0x0e
+       false, // [ 15] 0x0f
+       false, // [ 16] 0x10
+       false, // [ 17] 0x11
+       false, // [ 18] 0x12
+       false, // [ 19] 0x13
+       false, // [ 20] 0x14
+       false, // [ 21] 0x15
+       false, // [ 22] 0x16
+       false, // [ 23] 0x17
+       false, // [ 24] 0x18
+       false, // [ 25] 0x19
+       false, // [ 26] 0x1a
+       false, // [ 27] 0x1b
+       false, // [ 28] 0x1c
+       false, // [ 29] 0x1d
+       false, // [ 30] 0x1e
+       false, // [ 31] 0x1f
+       false, // [ 32] ' '
+       false, // [ 33] '!'
+       false, // [ 34] '"'
+       false, // [ 35] '#'
+       false, // [ 36] '$'
+       false, // [ 37] '%'
+       false, // [ 38] '&'
+       false, // [ 39] '''
+       false, // [ 40] '('
+       false, // [ 41] ')'
+       false, // [ 42] '*'
+       false, // [ 43] '+'
+       false, // [ 44] ','
+       false, // [ 45] '-'
+       false, // [ 46] '.'
+       false, // [ 47] '/'
+       false, // [ 48] '0'
+       false, // [ 49] '1'
+       false, // [ 50] '2'
+       false, // [ 51] '3'
+       false, // [ 52] '4'
+       false, // [ 53] '5'
+       false, // [ 54] '6'
+       false, // [ 55] '7'
+       false, // [ 56] '8'
+       false, // [ 57] '9'
+       false, // [ 58] ':'
+       false, // [ 59] ';'
+       false, // [ 60] '<'
+       false, // [ 61] '='
+       false, // [ 62] '>'
+       false, // [ 63] '?'
+       false, // [ 64] '@'
+       true,  // [ 65] 'A' (Nucleotide)
+       false, // [ 66] 'B'
+       true,  // [ 67] 'C' (Nucleotide)
+       false, // [ 68] 'D'
+       false, // [ 69] 'E'
+       false, // [ 70] 'F'
+       true,  // [ 71] 'G' (Nucleotide)
+       false, // [ 72] 'H'
+       false, // [ 73] 'I'
+       false, // [ 74] 'J'
+       false, // [ 75] 'K'
+       false, // [ 76] 'L'
+       false, // [ 77] 'M'
+       true,  // [ 78] 'N' (Nucleotide)
+       false, // [ 79] 'O'
+       false, // [ 80] 'P'
+       false, // [ 81] 'Q'
+       false, // [ 82] 'R'
+       false, // [ 83] 'S'
+       true,  // [ 84] 'T' (Nucleotide)
+       true,  // [ 85] 'U' (Nucleotide)
+       false, // [ 86] 'V'
+       false, // [ 87] 'W'
+       false, // [ 88] 'X'
+       false, // [ 89] 'Y'
+       false, // [ 90] 'Z'
+       false, // [ 91] '['
+       false, // [ 92] '\'
+       false, // [ 93] ']'
+       false, // [ 94] '^'
+       false, // [ 95] '_'
+       false, // [ 96] '`'
+       true,  // [ 97] 'a' (Nucleotide)
+       false, // [ 98] 'b'
+       true,  // [ 99] 'c' (Nucleotide)
+       false, // [100] 'd'
+       false, // [101] 'e'
+       false, // [102] 'f'
+       true,  // [103] 'g' (Nucleotide)
+       false, // [104] 'h'
+       false, // [105] 'i'
+       false, // [106] 'j'
+       false, // [107] 'k'
+       false, // [108] 'l'
+       false, // [109] 'm'
+       true,  // [110] 'n' (Nucleotide)
+       false, // [111] 'o'
+       false, // [112] 'p'
+       false, // [113] 'q'
+       false, // [114] 'r'
+       false, // [115] 's'
+       true,  // [116] 't' (Nucleotide)
+       true,  // [117] 'u' (Nucleotide)
+       false, // [118] 'v'
+       false, // [119] 'w'
+       false, // [120] 'x'
+       false, // [121] 'y'
+       false, // [122] 'z'
+       false, // [123] '{'
+       false, // [124] '|'
+       false, // [125] '}'
+       false, // [126] '~'
+       false, // [127] 0x7f
+       false, // [128] 0x80
+       false, // [129] 0x81
+       false, // [130] 0x82
+       false, // [131] 0x83
+       false, // [132] 0x84
+       false, // [133] 0x85
+       false, // [134] 0x86
+       false, // [135] 0x87
+       false, // [136] 0x88
+       false, // [137] 0x89
+       false, // [138] 0x8a
+       false, // [139] 0x8b
+       false, // [140] 0x8c
+       false, // [141] 0x8d
+       false, // [142] 0x8e
+       false, // [143] 0x8f
+       false, // [144] 0x90
+       false, // [145] 0x91
+       false, // [146] 0x92
+       false, // [147] 0x93
+       false, // [148] 0x94
+       false, // [149] 0x95
+       false, // [150] 0x96
+       false, // [151] 0x97
+       false, // [152] 0x98
+       false, // [153] 0x99
+       false, // [154] 0x9a
+       false, // [155] 0x9b
+       false, // [156] 0x9c
+       false, // [157] 0x9d
+       false, // [158] 0x9e
+       false, // [159] 0x9f
+       false, // [160] 0xa0
+       false, // [161] 0xa1
+       false, // [162] 0xa2
+       false, // [163] 0xa3
+       false, // [164] 0xa4
+       false, // [165] 0xa5
+       false, // [166] 0xa6
+       false, // [167] 0xa7
+       false, // [168] 0xa8
+       false, // [169] 0xa9
+       false, // [170] 0xaa
+       false, // [171] 0xab
+       false, // [172] 0xac
+       false, // [173] 0xad
+       false, // [174] 0xae
+       false, // [175] 0xaf
+       false, // [176] 0xb0
+       false, // [177] 0xb1
+       false, // [178] 0xb2
+       false, // [179] 0xb3
+       false, // [180] 0xb4
+       false, // [181] 0xb5
+       false, // [182] 0xb6
+       false, // [183] 0xb7
+       false, // [184] 0xb8
+       false, // [185] 0xb9
+       false, // [186] 0xba
+       false, // [187] 0xbb
+       false, // [188] 0xbc
+       false, // [189] 0xbd
+       false, // [190] 0xbe
+       false, // [191] 0xbf
+       false, // [192] 0xc0
+       false, // [193] 0xc1
+       false, // [194] 0xc2
+       false, // [195] 0xc3
+       false, // [196] 0xc4
+       false, // [197] 0xc5
+       false, // [198] 0xc6
+       false, // [199] 0xc7
+       false, // [200] 0xc8
+       false, // [201] 0xc9
+       false, // [202] 0xca
+       false, // [203] 0xcb
+       false, // [204] 0xcc
+       false, // [205] 0xcd
+       false, // [206] 0xce
+       false, // [207] 0xcf
+       false, // [208] 0xd0
+       false, // [209] 0xd1
+       false, // [210] 0xd2
+       false, // [211] 0xd3
+       false, // [212] 0xd4
+       false, // [213] 0xd5
+       false, // [214] 0xd6
+       false, // [215] 0xd7
+       false, // [216] 0xd8
+       false, // [217] 0xd9
+       false, // [218] 0xda
+       false, // [219] 0xdb
+       false, // [220] 0xdc
+       false, // [221] 0xdd
+       false, // [222] 0xde
+       false, // [223] 0xdf
+       false, // [224] 0xe0
+       false, // [225] 0xe1
+       false, // [226] 0xe2
+       false, // [227] 0xe3
+       false, // [228] 0xe4
+       false, // [229] 0xe5
+       false, // [230] 0xe6
+       false, // [231] 0xe7
+       false, // [232] 0xe8
+       false, // [233] 0xe9
+       false, // [234] 0xea
+       false, // [235] 0xeb
+       false, // [236] 0xec
+       false, // [237] 0xed
+       false, // [238] 0xee
+       false, // [239] 0xef
+       false, // [240] 0xf0
+       false, // [241] 0xf1
+       false, // [242] 0xf2
+       false, // [243] 0xf3
+       false, // [244] 0xf4
+       false, // [245] 0xf5
+       false, // [246] 0xf6
+       false, // [247] 0xf7
+       false, // [248] 0xf8
+       false, // [249] 0xf9
+       false, // [250] 0xfa
+       false, // [251] 0xfb
+       false, // [252] 0xfc
+       false, // [253] 0xfd
+       false, // [254] 0xfe
+       false, // [255] 0xff
+       };
+
+// Lookup table indexed by byte value (0..255): true only for the letters
+// A, C, G, T and U in upper or lower case. Unlike g_IsNucleoChar, the
+// wildcard N is false here.
+bool g_IsACGTU[256] =
+       {
+       false, // [  0] 0x00
+       false, // [  1] 0x01
+       false, // [  2] 0x02
+       false, // [  3] 0x03
+       false, // [  4] 0x04
+       false, // [  5] 0x05
+       false, // [  6] 0x06
+       false, // [  7] 0x07
+       false, // [  8] 0x08
+       false, // [  9] 0x09
+       false, // [ 10] 0x0a
+       false, // [ 11] 0x0b
+       false, // [ 12] 0x0c
+       false, // [ 13] 0x0d
+       false, // [ 14] 0x0e
+       false, // [ 15] 0x0f
+       false, // [ 16] 0x10
+       false, // [ 17] 0x11
+       false, // [ 18] 0x12
+       false, // [ 19] 0x13
+       false, // [ 20] 0x14
+       false, // [ 21] 0x15
+       false, // [ 22] 0x16
+       false, // [ 23] 0x17
+       false, // [ 24] 0x18
+       false, // [ 25] 0x19
+       false, // [ 26] 0x1a
+       false, // [ 27] 0x1b
+       false, // [ 28] 0x1c
+       false, // [ 29] 0x1d
+       false, // [ 30] 0x1e
+       false, // [ 31] 0x1f
+       false, // [ 32] ' '
+       false, // [ 33] '!'
+       false, // [ 34] '"'
+       false, // [ 35] '#'
+       false, // [ 36] '$'
+       false, // [ 37] '%'
+       false, // [ 38] '&'
+       false, // [ 39] '''
+       false, // [ 40] '('
+       false, // [ 41] ')'
+       false, // [ 42] '*'
+       false, // [ 43] '+'
+       false, // [ 44] ','
+       false, // [ 45] '-'
+       false, // [ 46] '.'
+       false, // [ 47] '/'
+       false, // [ 48] '0'
+       false, // [ 49] '1'
+       false, // [ 50] '2'
+       false, // [ 51] '3'
+       false, // [ 52] '4'
+       false, // [ 53] '5'
+       false, // [ 54] '6'
+       false, // [ 55] '7'
+       false, // [ 56] '8'
+       false, // [ 57] '9'
+       false, // [ 58] ':'
+       false, // [ 59] ';'
+       false, // [ 60] '<'
+       false, // [ 61] '='
+       false, // [ 62] '>'
+       false, // [ 63] '?'
+       false, // [ 64] '@'
+       true,  // [ 65] 'A' (ACGTU)
+       false, // [ 66] 'B'
+       true,  // [ 67] 'C' (ACGTU)
+       false, // [ 68] 'D'
+       false, // [ 69] 'E'
+       false, // [ 70] 'F'
+       true,  // [ 71] 'G' (ACGTU)
+       false, // [ 72] 'H'
+       false, // [ 73] 'I'
+       false, // [ 74] 'J'
+       false, // [ 75] 'K'
+       false, // [ 76] 'L'
+       false, // [ 77] 'M'
+       false, // [ 78] 'N'
+       false, // [ 79] 'O'
+       false, // [ 80] 'P'
+       false, // [ 81] 'Q'
+       false, // [ 82] 'R'
+       false, // [ 83] 'S'
+       true,  // [ 84] 'T' (ACGTU)
+       true,  // [ 85] 'U' (ACGTU)
+       false, // [ 86] 'V'
+       false, // [ 87] 'W'
+       false, // [ 88] 'X'
+       false, // [ 89] 'Y'
+       false, // [ 90] 'Z'
+       false, // [ 91] '['
+       false, // [ 92] '\'
+       false, // [ 93] ']'
+       false, // [ 94] '^'
+       false, // [ 95] '_'
+       false, // [ 96] '`'
+       true,  // [ 97] 'a' (ACGTU)
+       false, // [ 98] 'b'
+       true,  // [ 99] 'c' (ACGTU)
+       false, // [100] 'd'
+       false, // [101] 'e'
+       false, // [102] 'f'
+       true,  // [103] 'g' (ACGTU)
+       false, // [104] 'h'
+       false, // [105] 'i'
+       false, // [106] 'j'
+       false, // [107] 'k'
+       false, // [108] 'l'
+       false, // [109] 'm'
+       false, // [110] 'n'
+       false, // [111] 'o'
+       false, // [112] 'p'
+       false, // [113] 'q'
+       false, // [114] 'r'
+       false, // [115] 's'
+       true,  // [116] 't' (ACGTU)
+       true,  // [117] 'u' (ACGTU)
+       false, // [118] 'v'
+       false, // [119] 'w'
+       false, // [120] 'x'
+       false, // [121] 'y'
+       false, // [122] 'z'
+       false, // [123] '{'
+       false, // [124] '|'
+       false, // [125] '}'
+       false, // [126] '~'
+       false, // [127] 0x7f
+       false, // [128] 0x80
+       false, // [129] 0x81
+       false, // [130] 0x82
+       false, // [131] 0x83
+       false, // [132] 0x84
+       false, // [133] 0x85
+       false, // [134] 0x86
+       false, // [135] 0x87
+       false, // [136] 0x88
+       false, // [137] 0x89
+       false, // [138] 0x8a
+       false, // [139] 0x8b
+       false, // [140] 0x8c
+       false, // [141] 0x8d
+       false, // [142] 0x8e
+       false, // [143] 0x8f
+       false, // [144] 0x90
+       false, // [145] 0x91
+       false, // [146] 0x92
+       false, // [147] 0x93
+       false, // [148] 0x94
+       false, // [149] 0x95
+       false, // [150] 0x96
+       false, // [151] 0x97
+       false, // [152] 0x98
+       false, // [153] 0x99
+       false, // [154] 0x9a
+       false, // [155] 0x9b
+       false, // [156] 0x9c
+       false, // [157] 0x9d
+       false, // [158] 0x9e
+       false, // [159] 0x9f
+       false, // [160] 0xa0
+       false, // [161] 0xa1
+       false, // [162] 0xa2
+       false, // [163] 0xa3
+       false, // [164] 0xa4
+       false, // [165] 0xa5
+       false, // [166] 0xa6
+       false, // [167] 0xa7
+       false, // [168] 0xa8
+       false, // [169] 0xa9
+       false, // [170] 0xaa
+       false, // [171] 0xab
+       false, // [172] 0xac
+       false, // [173] 0xad
+       false, // [174] 0xae
+       false, // [175] 0xaf
+       false, // [176] 0xb0
+       false, // [177] 0xb1
+       false, // [178] 0xb2
+       false, // [179] 0xb3
+       false, // [180] 0xb4
+       false, // [181] 0xb5
+       false, // [182] 0xb6
+       false, // [183] 0xb7
+       false, // [184] 0xb8
+       false, // [185] 0xb9
+       false, // [186] 0xba
+       false, // [187] 0xbb
+       false, // [188] 0xbc
+       false, // [189] 0xbd
+       false, // [190] 0xbe
+       false, // [191] 0xbf
+       false, // [192] 0xc0
+       false, // [193] 0xc1
+       false, // [194] 0xc2
+       false, // [195] 0xc3
+       false, // [196] 0xc4
+       false, // [197] 0xc5
+       false, // [198] 0xc6
+       false, // [199] 0xc7
+       false, // [200] 0xc8
+       false, // [201] 0xc9
+       false, // [202] 0xca
+       false, // [203] 0xcb
+       false, // [204] 0xcc
+       false, // [205] 0xcd
+       false, // [206] 0xce
+       false, // [207] 0xcf
+       false, // [208] 0xd0
+       false, // [209] 0xd1
+       false, // [210] 0xd2
+       false, // [211] 0xd3
+       false, // [212] 0xd4
+       false, // [213] 0xd5
+       false, // [214] 0xd6
+       false, // [215] 0xd7
+       false, // [216] 0xd8
+       false, // [217] 0xd9
+       false, // [218] 0xda
+       false, // [219] 0xdb
+       false, // [220] 0xdc
+       false, // [221] 0xdd
+       false, // [222] 0xde
+       false, // [223] 0xdf
+       false, // [224] 0xe0
+       false, // [225] 0xe1
+       false, // [226] 0xe2
+       false, // [227] 0xe3
+       false, // [228] 0xe4
+       false, // [229] 0xe5
+       false, // [230] 0xe6
+       false, // [231] 0xe7
+       false, // [232] 0xe8
+       false, // [233] 0xe9
+       false, // [234] 0xea
+       false, // [235] 0xeb
+       false, // [236] 0xec
+       false, // [237] 0xed
+       false, // [238] 0xee
+       false, // [239] 0xef
+       false, // [240] 0xf0
+       false, // [241] 0xf1
+       false, // [242] 0xf2
+       false, // [243] 0xf3
+       false, // [244] 0xf4
+       false, // [245] 0xf5
+       false, // [246] 0xf6
+       false, // [247] 0xf7
+       false, // [248] 0xf8
+       false, // [249] 0xf9
+       false, // [250] 0xfa
+       false, // [251] 0xfb
+       false, // [252] 0xfc
+       false, // [253] 0xfd
+       false, // [254] 0xfe
+       false, // [255] 0xff
+       };
+
+// One frequency per amino acid for the 20 residues named in the trailing
+// comments, listed in alphabetical order of their one-letter codes.
+// The 20 values add up to approximately 1 (0.9998).
+// NOTE(review): presumably a background composition indexed by the amino
+// letter code (0 = 'A' ... 19 = 'Y') -- confirm against the letter tables
+// and the callers.
+float g_AminoFreqs[20] =
+       {
+       0.0777f, // 'A' = Ala
+       0.0161f, // 'C' = Cys
+       0.0527f, // 'D' = Asp
+       0.0631f, // 'E' = Glu
+       0.0417f, // 'F' = Phe
+       0.0718f, // 'G' = Gly
+       0.0238f, // 'H' = His
+       0.0606f, // 'I' = Ile
+       0.0601f, // 'K' = Lys
+       0.0906f, // 'L' = Leu
+       0.0233f, // 'M' = Met
+       0.0439f, // 'N' = Asn
+       0.0456f, // 'P' = Pro
+       0.0368f, // 'Q' = Gln
+       0.0526f, // 'R' = Arg
+       0.0639f, // 'S' = Ser
+       0.0570f, // 'T' = Thr
+       0.0712f, // 'V' = Val
+       0.0134f, // 'W' = Trp
+       0.0339f, // 'Y' = Tyr
+       };
diff --git a/uchime_src/alpha.h b/uchime_src/alpha.h

new file mode 100644 (file)

index 0000000..e021b7f
--- /dev/null
+++ b/uchime_src/alpha.h
@@ -0,0 +1,50 @@
+#ifndef alpha_h\r
+#define alpha_h\r
+\r
+#include <limits.h>\r
+#include <string>\r
+\r
+using namespace std;\r
+\r
+const unsigned INVALID_LETTER = 0;\r
+const unsigned char INVALID_CHAR = '?';\r
+\r
+extern unsigned g_CharToLetterAmino[];\r
+extern unsigned g_CharToLetterAminoStop[];\r
+extern unsigned char g_LetterToCharAmino[];\r
+extern unsigned g_CharToLetterNucleo[];\r
+extern unsigned char g_LetterToCharNucleo[];\r
+extern unsigned g_CodonWordToAminoLetter[];\r
+extern char g_CodonWordToAminoChar[];\r
+extern unsigned char g_CharToCompChar[];\r
+extern unsigned g_CharToCompLetter[];\r
+extern bool g_IsAminoChar[];\r
+extern bool g_IsNucleoChar[];\r
+extern bool g_IsACGTU[];\r
+extern float g_AminoFreqs[];\r
+\r
+extern unsigned g_CharToLetterRed[];\r
+extern unsigned char g_LetterToCharRed[];\r
+extern unsigned g_RedAlphaSize;\r
+\r
+void LogRedAlphaRed();\r
+void ReadRedAlphaFromFile(const string &FileName);\r
+unsigned char GetAminoCharFrom3NucChars(unsigned char c1, unsigned char c2,\r
+  unsigned char c3);\r
+\r
+// Returns true when Letter is the amino letter code used for the start codon.\r
+// NOTE(review): 10 is presumably the index of 'M' (Met) in the alphabetical\r
+// amino alphabet -- confirm against g_LetterToCharAmino.\r
+static inline bool AminoLetterIsStartCodon(unsigned char Letter)\r
+       {\r
+       const unsigned char StartLetter = 10;\r
+       return StartLetter == Letter;\r
+       }\r
+\r
+// Returns true when Letter is the amino letter code used for a stop codon.\r
+// NOTE(review): 20 is presumably the first code after the 20 amino-acid\r
+// letters 0..19 -- confirm against g_CharToLetterAminoStop.\r
+static inline bool AminoLetterIsStopCodon(unsigned char Letter)\r
+       {\r
+       const unsigned char StopLetter = 20;\r
+       return StopLetter == Letter;\r
+       }\r
+\r
+const char *WordToStr(unsigned Word, unsigned WordLength, bool Nucleo);\r
+const char *WordToStrNucleo(unsigned Word, unsigned WordLength);\r
+const char *WordToStrAmino(unsigned Word, unsigned WordLength);\r
+const char *WordToStrAmino2(unsigned Word, unsigned WordLength, char *Str);\r
+\r
+#endif // alpha_h\r
diff --git a/uchime_src/alpha2.cpp b/uchime_src/alpha2.cpp

new file mode 100644 (file)

index 0000000..26bc1c6
--- /dev/null
+++ b/uchime_src/alpha2.cpp
@@ -0,0 +1,100 @@
+#include "myutils.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+\r
+// Returns true when c is one of the two gap symbols, '-' or '.'.\r
+bool isgap(byte c)\r
+       {\r
+       if (c == '-')\r
+               return true;\r
+       return c == '.';\r
+       }\r
+\r
+const char *WordToStrAmino(unsigned Word, unsigned WordLength)\r
+       {\r
+       static char Str[32];\r
+       for (unsigned i = 0; i < WordLength; ++i)\r
+               {\r
+               unsigned Letter = Word%20;\r
+               Str[WordLength-i-1] = g_LetterToCharAmino[Letter];\r
+               Word /= 20;\r
+               }\r
+       Str[WordLength] = 0;\r
+       return Str;\r
+       }\r
+\r
+const char *WordToStrAmino2(unsigned Word, unsigned WordLength, char *Str)\r
+       {\r
+       for (unsigned i = 0; i < WordLength; ++i)\r
+               {\r
+               unsigned Letter = Word%20;\r
+               Str[WordLength-i-1] = g_LetterToCharAmino[Letter];\r
+               Word /= 20;\r
+               }\r
+       Str[WordLength] = 0;\r
+       return Str;\r
+       }\r
+\r
+const char *WordToStrNucleo(unsigned Word, unsigned WordLength)\r
+       {\r
+       static char Str[32];\r
+       for (unsigned i = 0; i < WordLength; ++i)\r
+               {\r
+               unsigned Letter = Word%4;\r
+               Str[WordLength-i-1] = g_LetterToCharNucleo[Letter];\r
+               Word /= 4;\r
+               }\r
+       Str[WordLength] = 0;\r
+       return Str;\r
+       }\r
+\r
+// Dispatches to WordToStrNucleo when Nucleo is true, otherwise to\r
+// WordToStrAmino, and returns that function's result.\r
+const char *WordToStr(unsigned Word, unsigned WordLength, bool Nucleo)\r
+       {\r
+       if (Nucleo)\r
+               return WordToStrNucleo(Word, WordLength);\r
+       return WordToStrAmino(Word, WordLength);\r
+       }\r
+\r
+byte *RevCompAlloc(const byte *Seq, unsigned L)\r
+       {\r
+       byte *RCSeq = MYALLOC(byte, L, Alpha);\r
+\r
+       for (unsigned i = 0; i < L; ++i)\r
+               RCSeq[L-i-1] = g_CharToCompChar[Seq[i]];\r
+\r
+       return RCSeq;\r
+       }\r
+\r
+void RevCompInPlace(byte *Seq, unsigned L)\r
+       {\r
+       unsigned L1 = L - 1;\r
+       unsigned L2 = L/2;\r
+       for (unsigned i = 0; i < L2; ++i)\r
+               {\r
+               unsigned j = L1 - i;\r
+               unsigned ci = Seq[i];\r
+               unsigned cj = Seq[j];\r
+\r
+               unsigned ri = g_CharToCompChar[ci];\r
+               unsigned rj = g_CharToCompChar[cj];\r
+\r
+               Seq[i] = rj;\r
+               Seq[j] = ri;\r
+               }\r
+\r
+       if (L%2 == 1)\r
+               Seq[L2] = g_CharToCompChar[Seq[L2]];\r
+       }\r
+\r
+// Writes the reverse complement of the L bytes in Seq into RCSeq, which\r
+// must have room for L bytes. Byte i of Seq is mapped through\r
+// g_CharToCompChar and stored at position L-1-i. No terminator is written.\r
+void RevComp(const byte *Seq, unsigned L, byte *RCSeq)\r
+       {\r
+       unsigned Dest = L;\r
+       for (unsigned Src = 0; Src < L; ++Src)\r
+               RCSeq[--Dest] = g_CharToCompChar[Seq[Src]];\r
+       }\r
+\r
+unsigned char GetAminoCharFrom3NucChars(unsigned char c1, unsigned char c2,\r
+  unsigned char c3)\r
+       {\r
+       unsigned Letter1 = g_CharToLetterNucleo[c1];\r
+       unsigned Letter2 = g_CharToLetterNucleo[c2];\r
+       unsigned Letter3 = g_CharToLetterNucleo[c3];\r
+       unsigned Word = Letter1*(4*4) + Letter2*4 + Letter3;\r
+\r
+       unsigned Letter = g_CodonWordToAminoLetter[Word];\r
+       return g_LetterToCharAmino[Letter];\r
+       }\r
diff --git a/uchime_src/chainer.h b/uchime_src/chainer.h

new file mode 100644 (file)

index 0000000..a954dc0
--- /dev/null
+++ b/uchime_src/chainer.h
@@ -0,0 +1,79 @@
+#ifndef chainer_h\r
+#define chainer_h\r
+\r
+#include "hsp.h"\r
+#include "seq.h"\r
+#include <list>\r
+\r
+const float BAD_SCORE = -9e9f;\r
+\r
+// Plain record for one hit: a target index, a lo/hi range on that target,\r
+// a query frame and a score.\r
+// NOTE(review): whether TargetLo/TargetHi are 0-based and inclusive is not\r
+// visible here -- confirm with the code that fills the struct.\r
+struct TargetHit\r
+       {\r
+       unsigned TargetIndex;\r
+       unsigned TargetLo;\r
+       unsigned TargetHi;\r
+       int QueryFrame;\r
+       float RawScore; // SOMETIMES USED FOR BIT SCORE!!!\r
+//     unsigned TargetLength;\r
+\r
+       // Writes lo, hi, frame and score to the log on one line.\r
+       // TargetIndex is not printed.\r
+       void LogMe() const\r
+               {\r
+               Log("lo %u, hi %u, frame %d, score %.1f\n",\r
+                 TargetLo, TargetHi, QueryFrame, RawScore);\r
+               }\r
+       };\r
+\r
+// Plain record with four fields: an HSP index, two "hi" coordinates and a score.\r
+// NOTE(review): presumably the state of one chain (its last HSP, the end\r
+// coordinates on sequences A and B, and the accumulated score) -- confirm\r
+// against the chaining code; this header does not use the struct itself.\r
+struct ChainData\r
+       {\r
+       unsigned LastHSPIndex;\r
+       unsigned Ahi;\r
+       unsigned Bhi;\r
+       float Score;\r
+       };\r
+\r
+// Declares the HSP chaining object. Only the interface is visible here;\r
+// the method bodies live in another file.\r
+// NOTE(review): the per-member notes below are inferred from names and the\r
+// existing comments -- confirm against the implementation.\r
+class Chainer\r
+       {\r
+public:\r
+       HSPData **m_HSPs; // memory owned elsewhere\r
+       unsigned m_HSPCount;\r
+       unsigned m_MaxHSPCount;\r
+\r
+       BPData *m_BPs;\r
+\r
+       unsigned *m_PrevHSPIndexes;             // Predecessor in chain\r
+       float *m_HSPIndexToChainScore;\r
+\r
+       list<unsigned> m_Chains;                // Live HSP indexes\r
+\r
+public:\r
+       Chainer();\r
+       ~Chainer();\r
+       void Reset();\r
+       void Clear(bool ctor = false);\r
+       // Takes HSPCount HSP pointers; OptChain and OptChainLength are outputs.\r
+       // NOTE(review): presumably returns the score of the best chain -- confirm.\r
+       float Chain(HSPData **HSPs, unsigned HSPCount, HSPData **OptChain,\r
+         unsigned &OptChainLength);\r
+       bool ResolveOverlaps(const SeqData &SA, const SeqData &SB, double MinScore,\r
+         const float * const *SubstMx, HSPData **InHSPs, unsigned InHSPCount,\r
+         HSPData **OutHSPs, unsigned &OutHSPCount);\r
+       void ResolveOverlap(HSPData &HSP1, HSPData &HSP2);\r
+\r
+       // Same signature as Chain().\r
+       // NOTE(review): presumably an exhaustive reference implementation -- confirm.\r
+       float ChainBrute(HSPData **HSPs, unsigned HSPCount, HSPData **OptChain,\r
+         unsigned &OptChainLength);\r
+       void LogMe() const;\r
+       void LogHSPs(HSPData **HSPs, unsigned HSPCount) const;\r
+       void LogBPs() const;\r
+\r
+       // Static helpers that operate on an explicit HSP array, not on members.\r
+       static bool IsValidChain(HSPData **HSPs, unsigned HSPCount);\r
+       static void AssertValidChain(HSPData **HSPs, unsigned HSPCount);\r
+       static void LogChain(HSPData **HSPs, unsigned HSPCount);\r
+       static void LogChain2(HSPData **HSPs, unsigned HSPCount);\r
+       static float GetChainScore(HSPData **HSPs, unsigned HSPCount);\r
+\r
+private:\r
+       void AllocHSPCount(unsigned MaxHSPCount);\r
+       void SetBPs();\r
+       void SortBPs();\r
+       unsigned FindBestChainLT(unsigned Ahi, unsigned Bhi);\r
+       };\r
+\r
+#endif // chainer_h\r
diff --git a/uchime_src/chime.h b/uchime_src/chime.h

new file mode 100644 (file)

index 0000000..1b0662a
--- /dev/null
+++ b/uchime_src/chime.h
@@ -0,0 +1,104 @@
+#ifndef chime_h\r
+#define chime_h\r
+\r
+#include "seq.h"\r
+\r
+// Result record for one chimera test: three labels (Q, A, B), three\r
+// strings (Q3, A3, B3), percent identities, column and query coordinates,\r
+// the Div / Score / H values, six CS_* counts and three Ab* values.\r
+// NOTE(review): Q is presumably the query and A/B the two candidate\r
+// parents -- confirm against the code that fills the record.\r
+struct ChimeHit2\r
+       {\r
+       string QLabel;\r
+       string ALabel;\r
+       string BLabel;\r
+       string Q3;\r
+       string A3;\r
+       string B3;\r
+\r
+       //unsigned LY, LN, LA, LD;\r
+       //unsigned RY, RN, RA, RD;\r
+       double PctIdQT, PctIdQA, PctIdQB, PctIdQM, PctIdAB;\r
+\r
+       unsigned ColLo;\r
+       unsigned ColXLo;\r
+       unsigned ColXHi;\r
+       unsigned ColHi;\r
+       unsigned QXLo;\r
+       unsigned QXHi;\r
+\r
+       double Div;\r
+       double Score;\r
+       double H;\r
+\r
+       unsigned CS_LY, CS_LN, CS_LA, CS_RY, CS_RN, CS_RA;\r
+\r
+       float AbQ;\r
+       float AbA;\r
+       float AbB;\r
+\r
+       // A new record starts out in the cleared state.\r
+       ChimeHit2()\r
+               {\r
+               Clear();\r
+               }\r
+\r
+       // Resets every field: strings become empty, unsigned fields become\r
+       // UINT_MAX and all floating-point fields become -1.\r
+       void Clear()\r
+               {\r
+               QLabel.clear();\r
+               ALabel.clear();\r
+               BLabel.clear();\r
+               Q3.clear();\r
+               A3.clear();\r
+               B3.clear();\r
+\r
+               ColLo = UINT_MAX;\r
+               ColXLo = UINT_MAX;\r
+               ColXHi = UINT_MAX;\r
+               ColHi = UINT_MAX;\r
+               QXLo = UINT_MAX;\r
+               QXHi = UINT_MAX;\r
+\r
+               CS_LY = UINT_MAX;\r
+               CS_LN = UINT_MAX;\r
+               CS_LA = UINT_MAX;\r
+               CS_RY = UINT_MAX;\r
+               CS_RN = UINT_MAX;\r
+               CS_RA = UINT_MAX;\r
+\r
+               PctIdQT = -1.0;\r
+               PctIdQA = -1.0;\r
+               PctIdQB = -1.0;\r
+               PctIdQM = -1.0;\r
+               PctIdAB = -1.0;\r
+\r
+               Div = -1.0;\r
+               Score = -1.0;\r
+               H = -1.0;\r
+\r
+               AbQ = -1.0f;\r
+               AbA = -1.0f;\r
+               AbB = -1.0f;\r
+               }\r
+\r
+       // True when Score, Div and both CS_LY and CS_RY reach the opt_minh,\r
+       // opt_mindiv and opt_mindiffs thresholds. Written as !(x >= y) so a\r
+       // NaN value is rejected, exactly as a chained && of >= tests would.\r
+       bool Accept() const\r
+               {\r
+               if (!(Score >= opt_minh))\r
+                       return false;\r
+               if (!(Div >= opt_mindiv))\r
+                       return false;\r
+               if (!(CS_LY >= opt_mindiffs))\r
+                       return false;\r
+               return CS_RY >= opt_mindiffs;\r
+               }\r
+\r
+       // Writes a one-line summary of the hit to the log. The leading flag\r
+       // uses fixed thresholds of 1.0 for Score and Div, not the opt_*\r
+       // thresholds used by Accept().\r
+       void LogMe() const\r
+               {\r
+               const bool Flag = (Score >= 1.0 && Div >= 1.0);\r
+               Log("@L %c ", yon(Flag));\r
+               Log(" %.4f", Score);\r
+               Log(" LY %u LN %u LA %u", CS_LY, CS_LN, CS_LA);\r
+               Log(" RY %u RN %u RA %u", CS_RY, CS_RN, CS_RA);\r
+               Log(" Div %.1f%%", Div);\r
+               Log(" Q=%s", QLabel.c_str());\r
+               Log(" A=%s", ALabel.c_str());\r
+               Log(" B=%s", BLabel.c_str());\r
+               Log(" QA %.1f%% QB=%.1f%% AB=%.1f%% QM=%.1f%%", PctIdQA, PctIdQB, PctIdAB, PctIdQM);\r
+               Log("\n");\r
+               }\r
+\r
+       // Sorts the highest Score first; equal scores are ordered by highest Div.\r
+       bool operator<(const ChimeHit2 &rhs) const\r
+               {\r
+               if (Score != rhs.Score)\r
+                       return Score > rhs.Score;\r
+               return Div > rhs.Div;\r
+               }\r
+       };\r
+\r
+// Returns true for the upper-case letters 'A', 'C', 'G' and 'T' only;\r
+// lower-case letters and 'U' give false.\r
+static inline bool isacgt(char c)\r
+       {\r
+       switch (c)\r
+               {\r
+       case 'A':\r
+       case 'C':\r
+       case 'G':\r
+       case 'T':\r
+               return true;\r
+       default:\r
+               return false;\r
+               }\r
+       }\r
+\r
+// Returns true when c is one of the two gap symbols, '-' or '.'.\r
+static bool inline isgap(char c)\r
+       {\r
+       if (c == '-')\r
+               return true;\r
+       return c == '.';\r
+       }\r
+\r
+void GetChunkInfo(unsigned L, unsigned &Length, vector<unsigned> &Los);\r
+float GetAbFromLabel(const string &Label);\r
+void WriteChimeHitCS(FILE *f, const ChimeHit2 &Hit);\r
+void WriteChimeHit(FILE *f, const ChimeHit2 &Hit);\r
+void WriteChimeFileHdr(FILE *f);\r
+\r
+#endif // chime_h\r
diff --git a/uchime_src/counters.h b/uchime_src/counters.h

new file mode 100644 (file)

index 0000000..a433cc8
--- /dev/null
+++ b/uchime_src/counters.h
@@ -0,0 +1,39 @@
+// List of counter names, one C(Name) invocation per line.\r
+// NOTE(review): C is not defined in this file, so presumably every file\r
+// that includes it defines C() first (X-macro pattern) -- confirm at the\r
+// include sites before adding, removing or reordering entries.\r
+C(Search)\r
+C(SearchBlast)\r
+C(HotHits)\r
+C(HotHits2)\r
+C(WindexAccepts)\r
+C(WindexRejects)\r
+C(AlnAccepts)\r
+C(AlnRejects)\r
+C(Seqs)\r
+C(FilterAccepts)\r
+C(FilterRejects)\r
+C(DiagRejects)\r
+C(DPTooBig)\r
+C(HotHitCut)\r
+C(FastRejects)\r
+C(FastRejects2)\r
+C(Step)\r
+C(HSPConflict)\r
+C(DPArea)\r
+C(DPArea2)\r
+C(DPArea3)\r
+C(DPArea4)\r
+C(DPArea5)\r
+C(HSPIdRejects)\r
+C(NoHSPRejects)\r
+C(NoHSPAccepts)\r
+C(BandRejects)\r
+C(FractIdBestSeg)\r
+C(FractIdHSPs)\r
+C(Excludes)\r
+C(NonExcludes)\r
+C(AlignQueryToSeed)\r
+C(PWA_Align)\r
+C(HitExtends)\r
+C(FailedExtends)\r
+C(HitExtendLetters)\r
+C(FailedExtendLetters)\r
+C(AddWords)\r
+C(AddWordGrows)\r
diff --git a/uchime_src/diagbox.h b/uchime_src/diagbox.h

new file mode 100644 (file)

index 0000000..0c5846c
--- /dev/null
+++ b/uchime_src/diagbox.h
@@ -0,0 +1,193 @@
+#ifndef diagbox_h\r
+#define diagbox_h\r
+\r
+struct DiagBox;\r
+\r
+void GetDiagBox(unsigned LA, unsigned LB, unsigned DiagLo, unsigned DiagHi, DiagBox &Box);\r
+void GetDiagRange(unsigned LA, unsigned LB, unsigned d,\r
+  unsigned &mini, unsigned &minj, unsigned &maxi, unsigned &maxj);\r
+void GetDiagLoHi(unsigned LA, unsigned LB, const char *Path,\r
+  unsigned &dlo, unsigned &dhi);\r
+\r
+struct DiagBox\r
+       {\r
+       DiagBox()\r
+               {\r
+               }\r
+\r
+       DiagBox(unsigned LA_, unsigned LB_, unsigned DiagLo, unsigned DiagHi)\r
+               {\r
+               //GetDiagBox(LA, LB, DiagLo, DiagHi, *this);\r
+               //Validate();\r
+               Init(LA_, LB_, DiagLo, DiagHi);\r
+               }\r
+\r
+       void Init(unsigned LA_, unsigned LB_, unsigned DiagLo, unsigned DiagHi)\r
+               {\r
+               GetDiagBox(LA_, LB_, DiagLo, DiagHi, *this);\r
+               Validate();\r
+               }\r
+\r
+       unsigned LA;\r
+       unsigned LB;\r
+\r
+       unsigned dlo;\r
+       unsigned dhi;\r
+\r
+       unsigned dlo_mini;\r
+       unsigned dlo_minj;\r
+\r
+       unsigned dlo_maxi;\r
+       unsigned dlo_maxj;\r
+\r
+       unsigned dhi_mini;\r
+       unsigned dhi_minj;\r
+\r
+       unsigned dhi_maxi;\r
+       unsigned dhi_maxj;\r
+\r
+       unsigned GetDiag(unsigned i, unsigned j) const\r
+               {\r
+               return LA - i + j;\r
+               }\r
+\r
+// i, j are positions 0..LA-1, 0..LB-1.\r
+       bool InBox(unsigned i, unsigned j) const\r
+               {\r
+               unsigned d = GetDiag(i, j);\r
+               return d >= dlo && d <= dhi;\r
+               }\r
+\r
+/***\r
+i, j are 0-based prefix lengths 0..LA, 0..LB.\r
+\r
+A full path is in the box iff all match pairs are in the box.\r
+\r
+A partial path (that aligns a prefix of A to a prefix of B as\r
+in D.P.) is in the box iff it is the prefix of at least\r
+one full path that is in the box.\r
+\r
+A D.P. matrix entry X[i][j] is in the box iff there is at\r
+least one full path aligning the first i letters of A and\r
+the first j letters of B ending in a column of type X, i.e.\r
+if there exists a partial path in the box that ends in X.\r
+\r
+Assume terminals appear in all paths, and DI/ID forbidden.\r
+\r
+Intuitively seems that by these definitions D is in box iff\r
+DM or MD is in box, I is in box iff IM or MI is in box.\r
+Don't have proof..\r
+***/\r
+       bool InBoxDPM(unsigned i, unsigned j) const\r
+               {\r
+       // Special case for M[0][0]\r
+               if (i == 0 && j == 0)\r
+                       return true;\r
+               if (i == 0 || j == 0)\r
+                       return false;\r
+               unsigned d = GetDiag(i-1, j-1);\r
+               return d >= dlo && d <= dhi;\r
+               }\r
+\r
+       bool InBoxDPD(unsigned i, unsigned j) const\r
+               {\r
+               bool MD = i == 0 ? false : InBoxDPM(i-1, j);\r
+               bool DM = (i == LA || j == LB) ? false : InBoxDPM(i+1, j+1);\r
+               return MD || DM;\r
+               }\r
+\r
+       bool InBoxDPI(unsigned i, unsigned j) const\r
+               {\r
+               bool MI = j == 0 ? false : InBoxDPM(i, j-1);\r
+               bool IM = (i == LA || j == LB) ? false : InBoxDPM(i+1, j+1);\r
+               return MI || IM;\r
+               }\r
+\r
+       // d = LA - i + j = 1 .. LA+LB-1\r
+       void Validate() const\r
+               {\r
+               asserta(dlo <= dhi);\r
+               asserta(dlo >= GetDiag(LA-1, 0));\r
+               asserta(dhi <= GetDiag(0, LB-1));\r
+\r
+               asserta(GetDiag(dlo_mini, dlo_minj) == dlo);\r
+               asserta(GetDiag(dlo_maxi, dlo_maxj) == dlo);\r
+               asserta(GetDiag(dhi_mini, dhi_minj) == dhi);\r
+               asserta(GetDiag(dhi_maxi, dhi_maxj) == dhi);\r
+\r
+               asserta(dlo_mini >= dhi_mini);\r
+               asserta(dlo_minj <= dhi_minj);\r
+               asserta(dlo_maxi >= dhi_maxi);\r
+               asserta(dlo_maxj <= dhi_maxj);\r
+               }\r
+\r
+       unsigned GetMini() const\r
+               {\r
+               return dhi_mini;\r
+               }\r
+\r
+       unsigned GetMaxi() const\r
+               {\r
+               return dlo_maxi;\r
+               }\r
+\r
+       unsigned GetMinj() const\r
+               {\r
+               return dlo_minj;\r
+               }\r
+\r
+       unsigned GetMaxj() const\r
+               {\r
+               return dhi_maxj;\r
+               }\r
+/***\r
+       i = 0..LA-1\r
+       j = 0..LB-1\r
+       d = LA - i + j = 1 .. LA+LB-1\r
+       j = d - LA + i\r
+       i = LA - d + j\r
+***/\r
+       void GetRange_j(unsigned i, unsigned &Startj, unsigned &Endj) const\r
+               {\r
+       // j = d - LA + i\r
+               if (dlo + i >= LA)\r
+                       Startj = dlo + i - LA;\r
+               else\r
+                       Startj = 0;\r
+\r
+               if (Startj >= LB)\r
+                       Startj = LB - 1;\r
+\r
+               if (dhi + i + 1 >= LA)\r
+                       Endj = dhi + i + 1 - LA;\r
+               else\r
+                       Endj = 0;\r
+\r
+               if (Endj > LB)\r
+                       Endj = LB;\r
+\r
+               asserta(Endj >= Startj);\r
+               }\r
+\r
+       void LogMe() const\r
+               {\r
+               Log("LA=%u LB=%d dlo(%u): (%u,%u)-(%u,%u) dhi(%u): (%u,%u)-(%u,%u) i=[%u-%u] j=[%u-%u]\n",\r
+                 LA, LB,\r
+                 dlo,\r
+                 dlo_mini, dlo_minj,\r
+                 dlo_maxi, dlo_maxj,\r
+                 dhi,\r
+                 dhi_mini, dhi_minj,\r
+                 dhi_maxi, dhi_maxj,\r
+                 GetMini(), GetMaxi(),\r
+                 GetMinj(), GetMaxj());\r
+               }\r
+       };\r
+\r
+typedef const char *(*NWDIAG)(const byte *A, unsigned LA, const byte *B, unsigned LB,
+  unsigned DiagLo, unsigned DiagHi, bool LeftTerm, bool RightTerm);
+
+const char *NWBandWrap(NWDIAG NW, const byte *A, unsigned LA, const byte *B, unsigned LB,
+  unsigned DiagLo, unsigned DiagHi, bool LeftTerm, bool RightTerm);
+\r
+#endif // diagbox_h\r
diff --git a/uchime_src/dp.h b/uchime_src/dp.h

new file mode 100644 (file)

index 0000000..c771538
--- /dev/null
+++ b/uchime_src/dp.h
@@ -0,0 +1,164 @@
+#ifndef dp_h\r
+#define dp_h\r
+\r
+#define SAVE_FAST      0\r
+\r
+#include "myutils.h"\r
+#include "mx.h"\r
+#include "seqdb.h"\r
+#include "diagbox.h"\r
+#include "path.h"\r
+#include "alnparams.h"\r
+#include "alnheuristics.h"\r
+#include "hspfinder.h"\r
+\r
+typedef void (*OnPathFn)(const string &Path, bool Full);\r
+\r
+enum XType\r
+       {\r
+       XType_Full=1,\r
+       XType_Fwd=2,\r
+       XType_Bwd=3,\r
+       };\r
+\r
+// public\r
+float ViterbiBrute(const byte *A, unsigned LA, const byte *B, unsigned LB, \r
+  unsigned DiagLo, unsigned DiagHi, const AlnParams &AP, PathData &PD);\r
+\r
+float ViterbiSimple(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, PathData &PD);\r
+\r
+float ViterbiSimpleBand(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, unsigned DiagLo, unsigned DiagHi, PathData &PD);\r
+\r
+float ViterbiFast(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, PathData &PD);\r
+\r
+float ViterbiFastBand(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  unsigned DiagLo, unsigned DiagHi, const AlnParams &AP, PathData &PD);\r
+\r
+float ViterbiFastMainDiag(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  unsigned BandRadius, const AlnParams &AP, PathData &PD);\r
+\r
+float XDropFwdSimple(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+\r
+float XDropBwdSimple(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+\r
+float XDropFwdFast(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+\r
+float XDropBwdFast(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+\r
+void XDropAlign(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  unsigned AncLoi, unsigned AncLoj, unsigned AncLen, const AlnParams &AP,\r
+  float XDrop, HSPData &HSP, PathData &PD);\r
+\r
+float SWSimple(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, unsigned &Loi, unsigned &Leni, unsigned &Lenj,\r
+  unsigned &Hij, PathData &PD);\r
+\r
+float SWFast(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, unsigned &Loi, unsigned &Leni, unsigned &Lenj,\r
+  unsigned &Hij, PathData &PD);\r
+\r
+void SWFast2(const SeqData &SA, const SeqData &SB, const AlnParams &AP,\r
+  HSPData &HSP, PathData &PD);\r
+\r
+void SWSimple2(const SeqData &SA, const SeqData &SB, const AlnParams &AP,\r
+  HSPData &HSP, PathData &PD);\r
+\r
+float SWUngapped(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const float * const *SubstMx, unsigned &LoA, unsigned &LoB, unsigned &Len);\r
+\r
+void SWUngapped2(const SeqData &SA, const SeqData &SB, const AlnParams &AP,\r
+  HSPData &HSP);\r
+\r
+float SWFastNTB(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP);\r
+\r
+void GlobalAlignBand(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+  const AlnParams &AP, unsigned BandRadius, PathData &PD);\r
+\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, const AlnParams &AP,\r
+  const AlnHeuristics &AH, HSPFinder &HF, float MinFractId, float &HSPFractId,\r
+  PathData &PD);\r
+\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path);\r
+\r
+void GetBruteMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+void GetSimpleDPMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+void GetSimpleBandMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+void GetXDropFwdSimpleDPMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+#if    SAVE_FAST\r
+void GetFastMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+void GetFastBandMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+#endif\r
+\r
+// private\r
+void TraceBackBit(unsigned LA, unsigned LB, char State, PathData &PD);\r
+void TraceBackBitSW(unsigned LA, unsigned LB, unsigned Besti, unsigned Bestj,\r
+  unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+void EnumPaths(unsigned L1, unsigned L2, bool SubPaths, OnPathFn OnPath);\r
+void AllocBit(unsigned LA, unsigned LB);\r
+\r
+const byte TRACEBITS_DM = 0x01;\r
+const byte TRACEBITS_IM = 0x02;\r
+const byte TRACEBITS_MD = 0x04;\r
+const byte TRACEBITS_MI = 0x08;\r
+const byte TRACEBITS_SM = 0x10;\r
+const byte TRACEBITS_UNINIT = ~0x1f;\r
+\r
+extern Mx<byte> g_Mx_TBBit;\r
+extern float *g_DPRow1;\r
+extern float *g_DPRow2;\r
+extern byte **g_TBBit;\r
+\r
+static inline void Max_xM(float &Score, float MM, float DM, float IM, byte &State)\r
+       {\r
+       Score = MM;\r
+       State = 'M';\r
+\r
+       if (DM > Score)\r
+               {\r
+               Score = DM;\r
+               State = 'D';\r
+               }\r
+       if (IM > Score)\r
+               {\r
+               Score = IM;\r
+               State = 'I';\r
+               }\r
+       }\r
+\r
+static inline void Max_xD(float &Score, float MD, float DD, byte &State)\r
+       {\r
+       if (MD >= DD)\r
+               {\r
+               Score = MD;\r
+               State = 'M';\r
+               }\r
+       else\r
+               {\r
+               Score = DD;\r
+               State = 'D';\r
+               }\r
+       }\r
+\r
+static inline void Max_xI(float &Score, float MI, float II, byte &State)\r
+       {\r
+       if (MI >= II)\r
+               {\r
+               Score = MI;\r
+               State = 'M';\r
+               }\r
+       else\r
+               {\r
+               Score = II;\r
+               State = 'I';\r
+               }\r
+       }\r
+\r
+#endif // dp_h\r
diff --git a/uchime_src/evalue.h b/uchime_src/evalue.h

new file mode 100644 (file)

index 0000000..c9308db
--- /dev/null
+++ b/uchime_src/evalue.h
@@ -0,0 +1,25 @@
+#ifndef evalue_h\r
+#define evalue_h\r
+\r
+#include <float.h>\r
+\r
+void SetKarlin(double GappedLambda, double UngappedLambda,\r
+  double GappedK, double UngappedK, double DBLength);\r
+\r
+double GetKarlinDBLength();\r
+void SetKarlinDBLength(double DBLength);\r
+void LogKarlin();\r
+void SetKarlinAmino(double DBLength);\r
+void SetKarlinNucleo(double DBLength);\r
+void SetKarlin(double DBLength, bool Nucleo);\r
+double ComputeBitScoreGapped(double Score);\r
+double ComputeBitScoreUngapped(double Score);\r
+double ComputeEvalueGapped(double Score, unsigned QueryLength);\r
+double ComputeEvalueUngapped(double Score, unsigned QueryLength);\r
+double ComputeMinScoreGivenEvalueAGapped(double Evalue, unsigned Area);\r
+double ComputeMinScoreGivenEvalueAUngapped(double Evalue, unsigned Area);\r
+double ComputeMinScoreGivenEvalueQGapped(double Evalue, unsigned QueryLength);\r
+double ComputeMinScoreGivenEvalueQUngapped(double Evalue, unsigned QueryLength);\r
+double ComputeEvalueGappedFromBitScore(double BitScore, unsigned QueryLength);\r
+\r
+#endif // evalue_h\r
diff --git a/uchime_src/fractid.cpp b/uchime_src/fractid.cpp

new file mode 100644 (file)

index 0000000..f298877
--- /dev/null
+++ b/uchime_src/fractid.cpp
@@ -0,0 +1,449 @@
+#include "myutils.h"\r
+#include "alpha.h"\r
+\r
+//unsigned g_MaxL = 0;\r
+\r
+static bool *g_IsChar = g_IsAminoChar;\r
+\r
+// Term gaps allowed in query (A) only\r
+static double GetFractIdGivenPathDerep(const byte *A, const byte *B, const char *Path,\r
+  char *ptrDesc)\r
+       {\r
+       if (*Path == 'D')\r
+               {\r
+               if (ptrDesc != 0)\r
+                       sprintf(ptrDesc, "(term gap in Query)");\r
+               return 0;\r
+               }\r
+\r
+       const char *LastM = 0;\r
+       for (const char *p = Path; *p; ++p)\r
+               if (*p == 'M')\r
+                       LastM = p;\r
+\r
+       unsigned PosA = 0;\r
+       unsigned PosB = 0;\r
+       unsigned Ids = 0;\r
+       unsigned Diffs = 0;\r
+       unsigned Cols = 0;\r
+       for (const char *p = Path; *p && p != LastM; ++p)\r
+               {\r
+               ++Cols;\r
+               char c = *p;\r
+               if (c == 'M')\r
+                       {\r
+                       byte a = toupper(A[PosA]);\r
+                       byte b = toupper(B[PosB]);\r
+                       if (g_IsChar[a] && g_IsChar[b])\r
+                               {\r
+                               if (a == b)\r
+                                       ++Ids;\r
+                               else\r
+                                       ++Diffs;\r
+                               }\r
+                       else\r
+                               --Cols;\r
+                       }\r
+               if (c == 'D' || c == 'I')\r
+                       ++Diffs;\r
+               if (c == 'M' || c == 'D')\r
+                       ++PosA;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PosB;\r
+               }\r
+\r
+       double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols));\r
+       if (ptrDesc != 0)\r
+               sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
+       return FractId;\r
+       }\r
+\r
+static double GetFractIdGivenPathAllDiffs(const byte *A, const byte *B, const char *Path,\r
+  char *ptrDesc)\r
+       {\r
+       unsigned PosA = 0;\r
+       unsigned PosB = 0;\r
+       unsigned Ids = 0;\r
+       unsigned Diffs = 0;\r
+       unsigned Cols = 0;\r
+       for (const char *p = Path; *p; ++p)\r
+               {\r
+               ++Cols;\r
+               char c = *p;\r
+               if (c == 'M')\r
+                       {\r
+                       byte a = toupper(A[PosA]);\r
+                       byte b = toupper(B[PosB]);\r
+                       if (g_IsChar[a] && g_IsChar[b])\r
+                               {\r
+                               if (a == b)\r
+                                       ++Ids;\r
+                               else\r
+                                       ++Diffs;\r
+                               }\r
+                       else\r
+                               --Cols;\r
+                       }\r
+               if (c == 'D' || c == 'I')\r
+                       ++Diffs;\r
+               if (c == 'M' || c == 'D')\r
+                       ++PosA;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PosB;\r
+               }\r
+\r
+       double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols));\r
+       if (ptrDesc != 0)\r
+               sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
+       return FractId;\r
+       }\r
+\r
+static double GetFractIdGivenPathInternalDiffs(const byte *A, const byte *B,\r
+  const char *Path, char *ptrDesc)\r
+       {\r
+       unsigned i = 0;\r
+       unsigned FirstM = UINT_MAX;\r
+       unsigned LastM = UINT_MAX;\r
+       for (const char *p = Path; *p; ++p)\r
+               {\r
+               if (*p == 'M')\r
+                       {\r
+                       if (FirstM == UINT_MAX)\r
+                               FirstM = i;\r
+                       LastM = i;\r
+                       }\r
+               ++i;\r
+               }\r
+       if (FirstM == UINT_MAX)\r
+               {\r
+               if (ptrDesc != 0)\r
+                       strcpy(ptrDesc, "(no matches)");\r
+               return 0.0;\r
+               }\r
+\r
+       unsigned PosA = 0;\r
+       unsigned PosB = 0;\r
+       unsigned Ids = 0;\r
+       unsigned Diffs = 0;\r
+       unsigned Cols = 0;\r
+       for (unsigned i = 0; i < FirstM; ++i)\r
+               {\r
+               char c = Path[i];\r
+               if (c == 'M' || c == 'D')\r
+                       ++PosA;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PosB;\r
+               }\r
+\r
+       for (unsigned i = FirstM; i <= LastM; ++i)\r
+               {\r
+               ++Cols;\r
+               char c = Path[i];\r
+               if (c == 'M')\r
+                       {\r
+                       byte a = toupper(A[PosA]);\r
+                       byte b = toupper(B[PosB]);\r
+                       if (g_IsChar[a] && g_IsChar[b])\r
+                               {\r
+                               if (a == b)\r
+                                       ++Ids;\r
+                               else\r
+                                       ++Diffs;\r
+                               }\r
+                       else\r
+                               --Cols;\r
+                       }\r
+               if (c == 'D' || c == 'I')\r
+                       ++Diffs;\r
+               if (c == 'M' || c == 'D')\r
+                       ++PosA;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PosB;\r
+               }\r
+\r
+       double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols));\r
+       if (ptrDesc != 0)\r
+               sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
+       return FractId;\r
+       }\r
+\r
+static double GetFractIdGivenPathMBL(const byte *A, const byte *B, const char *Path,\r
+  char *ptrDesc)\r
+       {\r
+       unsigned PosA = 0;\r
+       unsigned PosB = 0;\r
+       unsigned Mismatches = 0;\r
+       unsigned Gaps = 0;\r
+       for (const char *p = Path; *p; ++p)\r
+               {\r
+               char c = *p;\r
+               if (c == 'M' && toupper(A[PosA]) != toupper(B[PosB]))\r
+                       ++Mismatches;\r
+               if (c == 'D' || c == 'I' && (p == Path || p[-1] == 'M'))\r
+                       ++Gaps;\r
+               if (c == 'M' || c == 'D')\r
+                       ++PosA;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PosB;\r
+               }\r
+       unsigned Diffs = Gaps + Mismatches;\r
+       double FractDiffs = (PosB == 0 ? 0.0 : double(Diffs)/double(PosB));\r
+       if (ptrDesc != 0)\r
+               sprintf(ptrDesc, "Gap opens %u, Id=1 - [(diffs=%u)/(target_length=%u)]",\r
+                 Gaps, Diffs, PosB);\r
+       double FractId = 1.0 - FractDiffs;\r
+       if (FractId < 0.0)\r
+               return 0.0;\r
+       return FractId;\r
+       }\r
+\r
+static double GetFractIdGivenPathBLAST(const byte *A, const byte *B, const char *Path,\r
+  char *ptrDesc)\r
+       {\r
+       unsigned PosA = 0;\r
+       unsigned PosB = 0;\r
+       unsigned Ids = 0;\r
+       unsigned Wilds = 0;\r
+       unsigned Cols = 0;\r
+       for (const char *p = Path; *p; ++p)\r
+               {\r
+               ++Cols;\r
+               char c = *p;\r
+               if (c == 'M')\r
+                       {\r
+                       byte a = toupper(A[PosA]);\r
+                       byte b = toupper(B[PosB]);\r
+                       if (g_IsChar[a] && g_IsChar[b])\r
+                               {\r
+                               if (a == b)\r
+                                       ++Ids;\r
+                               }\r
+                       else\r
+                               ++Wilds;\r
+                       }\r
+               if (c == 'M' || c == 'D')\r
+                       ++PosA;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PosB;\r
+               }\r
+       asserta(Cols >= Wilds);\r
+       Cols -= Wilds;\r
+       double FractId = Cols == 0 ? 0.0f : float(Ids)/float(Cols);\r
+       if (ptrDesc != 0)\r
+               sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
+       return FractId;\r
+       }\r
+\r
+static double GetFractIdGivenPathDefault(const byte *A, const byte *B, const char *Path,\r
+  char *ptrDesc)\r
+       {\r
+       unsigned PosA = 0;\r
+       unsigned PosB = 0;\r
+       unsigned Ids = 0;\r
+       unsigned Wilds = 0;\r
+       for (const char *p = Path; *p; ++p)\r
+               {\r
+               char c = *p;\r
+               if (c == 'M')\r
+                       {\r
+                       byte a = toupper(A[PosA]);\r
+                       byte b = toupper(B[PosB]);\r
+                       if (g_IsChar[a] && g_IsChar[b])\r
+                               {\r
+                               if (a == b)\r
+                                       ++Ids;\r
+                               }\r
+                       else\r
+                               ++Wilds;\r
+                       }\r
+               if (c == 'M' || c == 'D')\r
+                       ++PosA;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PosB;\r
+               }\r
+       unsigned MinLen = min(PosA, PosB) - Wilds;\r
+       double FractId = (MinLen == 0 ? 0.0 : double(Ids)/double(MinLen));\r
+       if (ptrDesc != 0)\r
+               sprintf(ptrDesc, "(ids=%u/shorter_length=%u)", Ids, MinLen);\r
+       return FractId;\r
+       }\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path,\r
+  bool Nucleo, char *ptrDesc, unsigned IdDef)  // IdDef selects the identity definition dispatched in the switch below (0..5).\r
+       {\r
+       if (Nucleo)\r
+               g_IsChar = g_IsACGTU;\r
+       else\r
+               g_IsChar = g_IsAminoChar;\r
+\r
+       if (Path == 0)\r
+               {\r
+               if (ptrDesc != 0)\r
+                       strcpy(ptrDesc, "(NULL path)");\r
+               return 0.0;\r
+               }\r
+\r
+       unsigned ColCount = (unsigned) strlen(Path);\r
+       if (ColCount == 0)\r
+               return 0.0;\r
+\r
+       if (opt_leftjust)\r
+               {\r
+               if (Path[0] != 'M' || Path[ColCount-1] == 'D')\r
+                       {\r
+                       if (ptrDesc != 0)\r
+                               strcpy(ptrDesc, "(leftjust)");\r
+                       return 0.0;\r
+                       }\r
+               }\r
+\r
+       if (opt_rightjust)\r
+               {\r
+               if (Path[0] == 'D' || Path[ColCount-1] != 'M')\r
+                       {\r
+                       if (ptrDesc != 0)\r
+                               strcpy(ptrDesc, "(rightjust)");\r
+                       return 0.0;\r
+                       }\r
+               }\r
+\r
+       double FractId = 0.0;\r
+       //if (opt_idprefix > 0)\r
+       //      {\r
+       //      for (unsigned i = 0; i < opt_idprefix; ++i)\r
+       //              {\r
+       //              char c = Path[i];\r
+       //              if (c != 'M' || toupper(A[i]) != toupper(B[i]))\r
+       //                      {\r
+       //                      if (ptrDesc != 0)\r
+       //                              sprintf(ptrDesc, "Prefix ids %u < idprefix(%u)",\r
+       //                                i, opt_idprefix);\r
+       //                      return 0.0;\r
+       //                      }\r
+       //              }\r
+       //      }\r
+\r
+       //if (opt_idsuffix > 0)\r
+       //      {\r
+       //      unsigned Cols = strlen(Path);\r
+       //      for (unsigned i = 0; i < opt_idsuffix && i > Cols; ++i)\r
+       //              {\r
+       //              unsigned k = Cols - 1 - i;\r
+       //              char c = Path[k];\r
+       //              if (c != 'M' || toupper(A[k]) != toupper(B[k]))\r
+       //                      {\r
+       //                      if (ptrDesc != 0)\r
+       //                              sprintf(ptrDesc, "Suffix ids %u < idsuffix(%u)",\r
+       //                                i, opt_idsuffix);\r
+       //                      return 0.0;\r
+       //                      }\r
+       //              }\r
+       //      }\r
+\r
+       if (opt_maxqgap > 0 || opt_maxtgap > 0)\r
+               {\r
+               unsigned L = 0;\r
+               const char *LastM = 0;\r
+               for (const char *p = Path; *p; ++p)\r
+                       if (*p == 'M')\r
+                               LastM = p;\r
+\r
+//             g_MaxL = 0;\r
+               for (const char *p = Path; *p && p != LastM; ++p)\r
+                       {\r
+                       char c = *p;\r
+                       switch (c)\r
+                               {\r
+                       case 'M':\r
+                               if (L > 0)\r
+                                       {\r
+                                       if (p[-1] == 'D')\r
+                                               {\r
+                                               if (L > opt_maxtgap)\r
+                                                       {\r
+                                                       if (ptrDesc != 0)\r
+                                                               sprintf(ptrDesc, "(maxtgap)");\r
+                                                       return 0.0;\r
+                                                       }\r
+                                               }\r
+                                       else if (p[-1] == 'I')\r
+                                               {\r
+                                               if (L > opt_maxqgap)\r
+                                                       {\r
+                                                       if (ptrDesc != 0)\r
+                                                               sprintf(ptrDesc, "(maxqgap)");\r
+                                                       return 0.0;\r
+                                                       }\r
+                                               }\r
+                                       else\r
+                                               asserta(false);\r
+                                       }\r
+                               L = 0;\r
+                               break;\r
+\r
+                       case 'D':\r
+                       case 'I':\r
+                               ++L;\r
+                               //if (L > g_MaxL)\r
+                               //      g_MaxL = L;\r
+                               break;\r
+\r
+                       default:\r
+                               asserta(false);\r
+                               }\r
+                       }\r
+               }\r
+\r
+       switch (IdDef)\r
+               {\r
+       case 0:\r
+               FractId = GetFractIdGivenPathDefault(A, B, Path, ptrDesc);\r
+               break;\r
+\r
+       case 1:\r
+               FractId = GetFractIdGivenPathAllDiffs(A, B, Path, ptrDesc);\r
+               break;\r
+\r
+       case 2:\r
+               FractId = GetFractIdGivenPathInternalDiffs(A, B, Path, ptrDesc);\r
+               break;\r
+\r
+       case 3:\r
+               FractId = GetFractIdGivenPathMBL(A, B, Path, ptrDesc);\r
+               break;\r
+\r
+       case 4:\r
+               FractId = GetFractIdGivenPathBLAST(A, B, Path, ptrDesc);\r
+               break;\r
+\r
+       case 5:\r
+               FractId = GetFractIdGivenPathDerep(A, B, Path, ptrDesc);\r
+               break;\r
+\r
+       default:\r
+               Die("--iddef %u invalid", IdDef);       // report the value actually switched on, not the global option\r
+               }\r
+\r
+       return FractId;\r
+       }\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path,\r
+  bool Nucleo, char *ptrDesc)\r
+       {\r
+       return GetFractIdGivenPath(A, B, Path, Nucleo, ptrDesc, opt_iddef);\r
+       }\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo)\r
+       {\r
+       return GetFractIdGivenPath(A, B, Path, Nucleo, (char *) 0);\r
+       }\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const string &Path)\r
+       {\r
+       return GetFractIdGivenPath(A, B, Path.c_str(), true);\r
+       }\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path)\r
+       {\r
+       return GetFractIdGivenPath(A, B, Path, true);\r
+       }\r
diff --git a/uchime_src/getparents.cpp b/uchime_src/getparents.cpp

new file mode 100644 (file)

index 0000000..d82f902
--- /dev/null
+++ b/uchime_src/getparents.cpp
@@ -0,0 +1,89 @@
+#include "myutils.h"\r
+#include "chime.h"\r
+#include "ultra.h"\r
+#include <set>\r
+\r
+void AddTargets(Ultra &U, const SeqData &Query, set<unsigned> &TargetIndexes);\r
+\r
+void GetChunkInfo(unsigned L, unsigned &Length, vector<unsigned> &Los)\r
+       {\r
+       Los.clear();\r
+\r
+       if (L <= opt_minchunk)\r
+               {\r
+               Length = L;\r
+               Los.push_back(0);\r
+               return;\r
+               }\r
+\r
+       Length = (L - 1)/opt_chunks + 1;\r
+       if (Length < opt_minchunk)\r
+               Length = opt_minchunk;\r
+\r
+       unsigned Lo = 0;\r
+       for (;;)\r
+               {\r
+               if (Lo + Length >= L)\r
+                       {\r
+                       Lo = L - Length - 1;\r
+                       Los.push_back(Lo);\r
+                       return;\r
+                       }\r
+               Los.push_back(Lo);\r
+               Lo += Length;\r
+               }\r
+       }\r
+\r
+void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
+  vector<unsigned> &Parents)\r
+       {\r
+       Parents.clear();\r
+\r
+       set<unsigned> TargetIndexes;\r
+\r
+       unsigned QL = QSD.L;\r
+\r
+       SeqData QuerySD = QSD;\r
+\r
+       unsigned ChunkLength;\r
+       vector<unsigned> ChunkLos;\r
+       GetChunkInfo(QL, ChunkLength, ChunkLos);\r
+       unsigned ChunkCount = SIZE(ChunkLos);\r
+       for (unsigned ChunkIndex = 0; ChunkIndex < ChunkCount; ++ChunkIndex)\r
+               {\r
+               unsigned Lo = ChunkLos[ChunkIndex];\r
+               asserta(Lo + ChunkLength <= QL);\r
+\r
+               const byte *Chunk = QSD.Seq + Lo;\r
+\r
+       // THIS MESSES UP --self!!\r
+               //char Prefix[32];\r
+               //sprintf(Prefix, "%u|", Lo);\r
+               //string ChunkLabel = string(Prefix) + string(QSD.Label);\r
+\r
+               //QuerySD.Label = ChunkLabel.c_str();\r
+               QuerySD.Seq = Chunk;\r
+               QuerySD.L = ChunkLength;\r
+\r
+               AddTargets(U, QuerySD, TargetIndexes);\r
+\r
+               Lo += ChunkLength;\r
+               }\r
+\r
+       for (set<unsigned>::const_iterator p = TargetIndexes.begin();\r
+         p != TargetIndexes.end(); ++p)\r
+               {\r
+               unsigned TargetIndex = *p;\r
+               bool Accept = true;\r
+               if (AbQ > 0.0f)\r
+                       {\r
+                       const char *TargetLabel = U.GetSeedLabel(TargetIndex);\r
+                       float AbT = GetAbFromLabel(string(TargetLabel));\r
+                       if (AbT > 0.0f && AbT < opt_abskew*AbQ)\r
+                               Accept = false;\r
+                       }\r
+\r
+               if (Accept)\r
+                       Parents.push_back(TargetIndex);\r
+               }\r
+       }\r
diff --git a/uchime_src/globalalign2.cpp b/uchime_src/globalalign2.cpp

new file mode 100644 (file)

index 0000000..2adfb71
--- /dev/null
+++ b/uchime_src/globalalign2.cpp
@@ -0,0 +1,45 @@
+#if    UCHIMES
+
+#include "dp.h"
+#include "seq.h"
+
+static AlnParams g_AP;
+static bool g_APInitDone = false;
+
+// Align Query to Target with ViterbiFast(), leaving the result in PD.\r
+// The file-level parameters g_AP are initialised from the command line on\r
+// the first call only. Always returns true.\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, PathData &PD)\r
+       {\r
+       const bool NeedInit = !g_APInitDone;\r
+       if (NeedInit)\r
+               {\r
+               g_AP.InitFromCmdLine(true);\r
+               g_APInitDone = true;\r
+               }\r
+\r
+       ViterbiFast(Query.Seq, Query.L, Target.Seq, Target.L, g_AP, PD);\r
+       return true;\r
+       }\r
+\r
+// Convenience overload: runs the PathData overload and copies the\r
+// zero-terminated path at PD.Start into Path. Always returns true.\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path)\r
+       {\r
+       PathData Result;\r
+       GlobalAlign(Query, Target, Result);\r
+       Path.assign(Result.Start);\r
+       return true;\r
+       }\r
+\r
+// Overload with the heuristic-aligner signature. All heuristic arguments are\r
+// ignored here; the alignment comes from the string overload and is copied,\r
+// zero-terminated, into a buffer allocated in PD.\r
+// Returns false (with PD cleared) only if the string overload reports failure.\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, const AlnParams &/*AP*/,\r
+  const AlnHeuristics &AH, HSPFinder &/*HF*/, float /*MinFractId*/, float &/*HSPId*/, PathData &PD)\r
+       {\r
+       PD.Clear();\r
+\r
+       string Path;\r
+       if (!GlobalAlign(Query, Target, Path))\r
+               return false;\r
+\r
+       const unsigned PathLength = SIZE(Path);\r
+       PD.Alloc(PathLength+1);\r
+       PD.Start = PD.Front;\r
+       memcpy(PD.Start, Path.c_str(), PathLength);\r
+       PD.Start[PathLength] = 0;\r
+       return true;\r
+       }\r
+\r
+#endif // UCHIMES\r
diff --git a/uchime_src/help.h b/uchime_src/help.h

new file mode 100644 (file)

index 0000000..9d7a89f
--- /dev/null
+++ b/uchime_src/help.h
@@ -0,0 +1,127 @@
+"\n"
+"Usage\n"
+"-----\n"
+"\n"
+"uchime --input query.fasta [--db db.fasta] [--uchimeout results.uchime]\n"
+"    [--uchimealns results.alns]\n"
+"\n"
+"Options\n"
+"-------\n"
+"\n"
+"--input filename\n"
+"    Query sequences in FASTA format.\n"
+"    If the --db option is not specified, uchime uses de novo\n"
+"    detection. In de novo mode, relative abundance must be given\n"
+"    by a string /ab=xxx/ somewhere in the label, where xxx is a\n"
+"    floating-point number, e.g. >F00QGH67HG/ab=1.2/.\n"
+"\n"
+"--db filename\n"
+"    Reference database in FASTA format.\n"
+"    Optional, if not specified uchime uses de novo mode.\n"
+"\n"
+"    ***WARNING*** The database is searched ONLY on the plus strand.\n"
+"    You MUST include reverse-complemented sequences in the database\n"
+"    if you want both strands to be searched.\n"
+"\n"
+"--abskew x\n"
+"    Minimum abundance skew. Default 1.9. De novo mode only.\n"
+"    Abundance skew is:\n"
+"        min [ abund(parent1), abund(parent2) ] / abund(query).\n"
+"\n"
+"--uchimeout filename\n"
+"    Output in tabbed format with one record per query sequence.\n"
+"    First field is score (h), second field is query label.\n"
+"    For details, see manual.\n"
+"\n"
+"--uchimealns filename\n"
+"    Multiple alignments of query sequences to parents in human-\n"
+"    readable format. Alignments show columns with differences\n"
+"    that support or contradict a chimeric model.\n"
+"\n"
+"--minh h\n"
+"    Minimum score to report chimera. Default 0.3. Values from 0.1\n"
+"    to 5 might be reasonable. Lower values increase sensitivity\n"
+"    but may report more false positives. If you decrease --xn,\n"
+"    you may need to increase --minh, and vice versa.\n"
+"\n"
+"--mindiv div\n"
+"    Minimum divergence ratio, default 0.5. Div ratio is 100%% - \n"
+"    %%identity between query sequence and the closest candidate for\n"
+"    being a parent. If you don't care about very close chimeras,\n"
+"    then you could increase --mindiv to, say, 1.0 or 2.0, and\n"
+"    also decrease --minh, say to 0.1, to increase sensitivity.\n"
+"    How well this works will depend on your data. Best is to\n"
+"    tune parameters on a good benchmark.\n"
+"\n"
+"--xn beta\n"
+"    Weight of a no vote, also called the beta parameter. Default 8.0.\n"
+"    Decreasing this weight to around 3 or 4 may give better\n"
+"    performance on denoised data.\n"
+"\n"
+"--dn n\n"
+"    Pseudo-count prior on number of no votes. Default 1.4. Probably\n"
+"    no good reason to change this unless you can retune to a good\n"
+"    benchmark for your data. Reasonable values are probably in the\n"
+"    range from 0.2 to 2.\n"
+"\n"
+"--xa w\n"
+"    Weight of an abstain vote. Default 1. So far, results do not\n"
+"    seem to be very sensitive to this parameter, but if you have\n"
+"    a good training set might be worth trying. Reasonable values\n"
+"    might range from 0.1 to 2.\n"
+"\n"
+"--chunks n\n"
+"    Number of chunks to extract from the query sequence when searching\n"
+"    for parents. Default 4.\n"
+"\n"
+"--[no]ovchunks\n"
+"    [Do not] use overlapping chunks. Default do not.\n"
+"\n"
+"--minchunk n\n"
+"    Minimum length of a chunk. Default 64.\n"
+"\n"
+"--idsmoothwindow w\n"
+"    Length of id smoothing window. Default 32.\n"
+"\n"
+"--minsmoothid f\n"
+"    Minimum fractional identity over smoothed window of candidate parent.\n"
+"    Default 0.95.\n"
+"\n"
+"--maxp n\n"
+"    Maximum number of candidate parents to consider. Default 2. In tests so\n"
+"    far, increasing --maxp gives only a very small improvement in sensitivity\n"
+"    but tends to increase the error rate quite a bit.\n"
+"\n"
+"--[no]skipgaps\n"
+"--[no]skipgaps2\n"
+"    These options control how gapped columns affect counting of diffs.\n"
+"    If --skipgaps is specified, columns containing gaps do not count as diffs.\n"
+"    If --skipgaps2 is specified, if column is immediately adjacent to\n"
+"    a column containing a gap, it is not counted as a diff.\n"
+"    Default is --skipgaps --skipgaps2.\n"
+"\n"
+"--minlen L\n"
+"--maxlen L\n"
+"    Minimum and maximum sequence length. Defaults 10, 10000.\n"
+"    Applies to both query and reference sequences.\n"
+"\n"
+"--ucl\n"
+"    Use local-X alignments. Default is global-X. On tests so far, global-X\n"
+"    is always better; this option is retained because it just might work\n"
+"    well on some future type of data.\n"
+"\n"
+"--queryfract f\n"
+"    Minimum fraction of the query sequence that must be covered by a local-X\n"
+"    alignment. Default 0.5. Applies only when --ucl is specified.\n"
+"\n"
+"--quiet\n"
+"    Do not display progress messages on stderr.\n"
+"\n"
+"--log filename\n"
+"    Write miscellaneous information to the log file. Mostly of interest\n"
+"    to me (the algorithm developer). Use --verbose to get more info.\n"
+"\n"
+"--self\n"
+"    In reference database mode, exclude a reference sequence if it has\n"
+"    the same label as the query. This is useful for benchmarking by using\n"
+"    the ref db as a query to test for false positives.\n"
diff --git a/uchime_src/hsp.h b/uchime_src/hsp.h

new file mode 100644 (file)

index 0000000..339256f
--- /dev/null
+++ b/uchime_src/hsp.h
@@ -0,0 +1,114 @@
+#ifndef hsp_h\r
+#define hsp_h  1\r
+\r
+// A pair of segments, one in each of two sequences: positions\r
+// [Loi, Loi+Leni) of the first ("A"/i) sequence and [Loj, Loj+Lenj) of the\r
+// second ("B"/j) sequence, together with a score and a caller-defined tag.\r
+struct HSPData\r
+       {\r
+       unsigned Loi;           // first position in sequence A\r
+       unsigned Loj;           // first position in sequence B\r
+       unsigned Leni;          // segment length in sequence A\r
+       unsigned Lenj;          // segment length in sequence B\r
+       float Score;\r
+       unsigned User;          // not interpreted by this struct\r
+\r
+       // Common length of the two segments; dies if they differ.\r
+       unsigned GetLength() const\r
+               {\r
+               if (Leni != Lenj)\r
+                       Die("HSP::GetLength(): Leni %u, Lenj %u, Loi %u, Loj %u, Score %.1f",\r
+                         Leni, Lenj, Loi, Loj, Score);\r
+\r
+               return Leni;\r
+               }\r
+\r
+       // Last (inclusive) position of the segment in sequence A.\r
+       unsigned GetHii() const\r
+               {\r
+               assert(Leni > 0);\r
+               return Loi + Leni - 1;\r
+               }\r
+\r
+       // Last (inclusive) position of the segment in sequence B.\r
+       unsigned GetHij() const\r
+               {\r
+               assert(Lenj > 0);\r
+               return Loj + Lenj - 1;\r
+               }\r
+\r
+       // True if the segment starts at position 0 of sequence A.\r
+       bool LeftA() const\r
+               {\r
+               return Loi == 0;\r
+               }\r
+\r
+       // True if the segment starts at position 0 of sequence B.\r
+       bool LeftB() const\r
+               {\r
+               return Loj == 0;\r
+               }\r
+\r
+       // True if the segment ends exactly at the end of A, whose length is LA.\r
+       bool RightA(unsigned LA) const\r
+               {\r
+               return Loi + Leni == LA;\r
+               }\r
+\r
+       // True if the segment ends exactly at the end of B, whose length is LB.\r
+       bool RightB(unsigned LB) const\r
+               {\r
+               return Loj + Lenj == LB;\r
+               }\r
+\r
+       // Number of offsets k in [0, GetLength()) at which A[Loi+k] and\r
+       // B[Loj+k] are equal ignoring case.\r
+       unsigned GetIdCount(const byte *A, const byte *B) const\r
+               {\r
+               unsigned Count = 0;\r
+               unsigned K = GetLength();\r
+               for (unsigned k = 0; k < K; ++k)\r
+                       {\r
+                       byte a = A[Loi+k];\r
+                       byte b = B[Loj+k];\r
+                       if (toupper(a) == toupper(b))\r
+                               Count++;\r
+                       }\r
+               return Count;\r
+               }\r
+\r
+       // Overlap with another HSP as a fraction of this HSP's Leni*Lenj area.\r
+       // Returns 0.0 when this HSP is empty in either sequence.\r
+       double OverlapFract(const HSPData &HSP) const\r
+               {\r
+               if (Leni == 0 || Lenj == 0)\r
+                       return 0.0;\r
+\r
+               unsigned MaxLoi = max(Loi, HSP.Loi);\r
+               unsigned MaxLoj = max(Loj, HSP.Loj);\r
+               unsigned MinHii = min(GetHii(), HSP.GetHii());\r
+               unsigned MinHij = min(GetHij(), HSP.GetHij());\r
+\r
+               // NOTE(review): the Hi coordinates are inclusive, so MinHi - MaxLo\r
+               // is one less than the number of shared positions and an HSP\r
+               // compared with itself scores below 1.0. Left unchanged here;\r
+               // confirm whether that is intended.\r
+               unsigned Ovi = (MinHii < MaxLoi) ? 0 : MinHii - MaxLoi;\r
+               unsigned Ovj = (MinHij < MaxLoj) ? 0 : MinHij - MaxLoj;\r
+\r
+               asserta(Ovi <= Leni && Ovj <= Lenj);\r
+               // Multiply as doubles: the unsigned products can wrap around for\r
+               // long segments, which would corrupt the ratio.\r
+               return (double(Ovi)*double(Ovj))/(double(Leni)*double(Lenj));\r
+               }\r
+\r
+       // Orders HSPs by their start position in sequence A.\r
+       bool operator<(const HSPData &rhs) const\r
+               {\r
+               return Loi < rhs.Loi;\r
+               }\r
+\r
+       // Log start positions, lengths and score on one line.\r
+       void LogMe() const\r
+               {\r
+               Log("Loi=%u Loj=%u Li=%u Lj=%u Score=%.1f\n", Loi, Loj, Leni, Lenj, Score);\r
+               }\r
+\r
+       // Log the inclusive coordinate ranges and score in compact form.\r
+       void LogMe2() const\r
+               {\r
+               Log("(%u-%u,%u-%u/%.1f)", Loi, GetHii(), Loj, GetHij(), Score);\r
+               }\r
+       };\r
+\r
+// Bendpoint\r
+// One end of an interval: a position, a flag saying whether it is the low\r
+// or the high end, and an index value.\r
+struct BPData\r
+       {\r
+       unsigned Pos;\r
+       bool IsLo;\r
+       unsigned Index;\r
+\r
+       // Log this record as "BPlo"/"BPhi" followed by Pos and Index.\r
+       void LogMe() const\r
+               {\r
+               const char *End = IsLo ? "lo" : "hi";\r
+               Log("BP%s Pos %u Ix %u", End, Pos, Index);\r
+               }\r
+       };\r
+\r
+#endif // hsp_h\r
diff --git a/uchime_src/hspfinder.h b/uchime_src/hspfinder.h

new file mode 100644 (file)

index 0000000..2b8e9d8
--- /dev/null
+++ b/uchime_src/hspfinder.h
@@ -0,0 +1,13 @@
+#ifndef hspfinder_h
+#define hspfinder_h
+
+#include "seq.h"
+
+// Stub HSP finder: SetA() and SetB() accept a sequence and do nothing.
+class HSPFinder
+       {
+public:
+       // No-op; the sequence argument is ignored.
+       void SetA(const SeqData &/*SD*/) {}
+       // No-op; the sequence argument is ignored.
+       void SetB(const SeqData &/*SD*/) {}
+       };
+
+#endif // hspfinder_h
diff --git a/uchime_src/make3way.cpp b/uchime_src/make3way.cpp

new file mode 100644 (file)

index 0000000..ce88f86
--- /dev/null
+++ b/uchime_src/make3way.cpp
@@ -0,0 +1,173 @@
+#include "myutils.h"\r
+#include "sfasta.h"\r
+#include "path.h"\r
+#include "dp.h"\r
+\r
+#if    DEBUG\r
+// Debug-only check that Path consumes exactly LQ query letters (M and D\r
+// columns) and LT target letters (M and I columns).\r
+static void AssertPathLengths(const string &Path, unsigned LQ, unsigned LT)\r
+       {\r
+       unsigned QLen = 0;\r
+       unsigned TLen = 0;\r
+       for (unsigned i = 0; i < SIZE(Path); ++i)\r
+               {\r
+               char c = Path[i];\r
+               if (c == 'M' || c == 'D')\r
+                       ++QLen;\r
+               if (c == 'M' || c == 'I')\r
+                       ++TLen;\r
+               }\r
+       asserta(QLen == LQ);\r
+       asserta(TLen == LT);\r
+       }\r
+#endif\r
+\r
+// For each query position 0..LQ, count the 'I' columns of Path that occur\r
+// just before that position; index LQ holds the trailing inserts.\r
+static void CountInsertsPerQueryPos(const string &Path, unsigned LQ,\r
+  vector<unsigned> &Counts)\r
+       {\r
+       Counts.clear();\r
+       Counts.resize(LQ+1, 0);\r
+       unsigned QPos = 0;\r
+       for (unsigned i = 0; i < SIZE(Path); ++i)\r
+               {\r
+               char c = Path[i];\r
+               if (c == 'M' || c == 'D')\r
+                       ++QPos;\r
+               else\r
+                       {\r
+                       asserta(c == 'I');\r
+                       asserta(QPos <= LQ);\r
+                       ++(Counts[QPos]);\r
+                       }\r
+               }\r
+       }\r
+\r
+// Append to Row the target sequence T (length LT) laid out against the\r
+// merged insert counts, following its pairwise Path against the query.\r
+static void BuildParentRow(const byte *T, unsigned LT, const string &Path,\r
+  unsigned LQ, const vector<unsigned> &InsertCounts, string &Row)\r
+       {\r
+       unsigned QPos = 0;\r
+       unsigned TPos = 0;\r
+       unsigned is = 0;        // inserts emitted since the last query column\r
+       for (unsigned i = 0; i < SIZE(Path); ++i)\r
+               {\r
+               char c = Path[i];\r
+               if (c == 'M' || c == 'D')\r
+                       {\r
+                       // Pad up to the widest insert run at this query position.\r
+                       asserta(QPos < LQ);\r
+                       asserta(is <= InsertCounts[QPos]);\r
+                       Row.append(InsertCounts[QPos] - is, '-');\r
+                       is = 0;\r
+                       ++QPos;\r
+                       }\r
+               if (c == 'M')\r
+                       {\r
+                       asserta(TPos < LT);\r
+                       Row.push_back(toupper(T[TPos++]));\r
+                       }\r
+               else if (c == 'D')\r
+                       Row.push_back('-');\r
+               else if (c == 'I')\r
+                       {\r
+                       ++is;\r
+                       asserta(TPos < LT);\r
+                       Row.push_back(toupper(T[TPos++]));\r
+                       }\r
+               }\r
+       asserta(is <= InsertCounts[LQ]);\r
+       Row.append(InsertCounts[LQ] - is, '-');\r
+       asserta(QPos == LQ);\r
+       asserta(TPos == LT);\r
+       }\r
+\r
+// Build a three-row alignment of a query Q and two targets A and B from the\r
+// pairwise paths Q-A and Q-B.\r
+//\r
+// Path letters: 'M' consumes a letter of both sequences, 'D' a query letter\r
+// only, 'I' a target letter only. At every query position the row set is\r
+// widened to the larger of the two targets' insert runs, so all rows end up\r
+// the same length. Letters are upper-cased and gaps are written as '-'.\r
+//\r
+//   QSD, ASD, BSD  - query and the two target sequences.\r
+//   PathQA, PathQB - pairwise paths of Q against A and Q against B.\r
+//   Q3, A3, B3     - output rows; cleared first.\r
+void Make3Way(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
+  const string &PathQA, const string &PathQB,\r
+  string &Q3, string &A3, string &B3)\r
+       {\r
+       Q3.clear();\r
+       A3.clear();\r
+       B3.clear();\r
+\r
+#if    DEBUG\r
+       AssertPathLengths(PathQA, QSD.L, ASD.L);\r
+       AssertPathLengths(PathQB, QSD.L, BSD.L);\r
+#endif\r
+\r
+       const byte *Q = QSD.Seq;\r
+       const unsigned LQ = QSD.L;\r
+\r
+       vector<unsigned> InsertCountsA;\r
+       CountInsertsPerQueryPos(PathQA, LQ, InsertCountsA);\r
+\r
+       vector<unsigned> InsertCountsB;\r
+       CountInsertsPerQueryPos(PathQB, LQ, InsertCountsB);\r
+\r
+       // Widest insert run of either target at each query position.\r
+       vector<unsigned> InsertCounts(LQ+1, 0);\r
+       for (unsigned i = 0; i <= LQ; ++i)\r
+               InsertCounts[i] = max(InsertCountsA[i], InsertCountsB[i]);\r
+\r
+// Q\r
+       for (unsigned i = 0; i < LQ; ++i)\r
+               {\r
+               Q3.append(InsertCounts[i], '-');\r
+               Q3.push_back(toupper(Q[i]));\r
+               }\r
+       Q3.append(InsertCounts[LQ], '-');\r
+\r
+// A\r
+       BuildParentRow(ASD.Seq, ASD.L, PathQA, LQ, InsertCounts, A3);\r
+\r
+// B\r
+       BuildParentRow(BSD.Seq, BSD.L, PathQB, LQ, InsertCounts, B3);\r
+\r
+       asserta(SIZE(Q3) == SIZE(A3));\r
+       asserta(SIZE(Q3) == SIZE(B3));\r
+       }\r
diff --git a/uchime_src/mk b/uchime_src/mk

new file mode 100755 (executable)

index 0000000..24aeba0
--- /dev/null
+++ b/uchime_src/mk
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Build script for the uchime binary: compiles each listed source file with
+# g++ and links the resulting objects into ./uchime.
+# ENV_GCC_OPTS and ENV_LINK_OPTS are not set here; they are taken from the
+# environment and expanded unquoted, so they may hold several options.
+#
+# ObjNames must list the same files as CPPNames, with a .o suffix.
+CPPNames='addtargets2 alignchime alignchimel alnparams alpha alpha2 fractid getparents globalalign2 make3way mx myutils path searchchime seqdb setnucmx sfasta tracebackbit uchime_main usort viterbifast writechhit'
+ObjNames='addtargets2.o alignchime.o alignchimel.o alnparams.o alpha.o alpha2.o fractid.o getparents.o globalalign2.o make3way.o mx.o myutils.o path.o searchchime.o seqdb.o setnucmx.o sfasta.o tracebackbit.o uchime_main.o usort.o viterbifast.o writechhit.o'
+
+# Start from a clean slate: old objects and old build logs are removed.
+rm -f *.o mk.stdout mk.stderr tmp.stderr
+
+# Compile each source. The file name is echoed to the terminal, compiler
+# stdout is appended to mk.stdout, and compiler stderr is both shown and
+# appended to mk.stderr.
+# NOTE(review): the g++ exit status is not checked, so a failed compile does
+# not stop the loop or the link step below.
+for CPPName in $CPPNames
+do
+  echo $CPPName >> /dev/tty
+  g++ $ENV_GCC_OPTS -c -O3 -msse2 -mfpmath=sse -D_FILE_OFFSET_BITS=64 -DNDEBUG=1 -DUCHIMES=1 $CPPName.cpp -o $CPPName.o  >> mk.stdout 2>> tmp.stderr
+       cat tmp.stderr
+       cat tmp.stderr >> mk.stderr
+       rm -f tmp.stderr
+done
+
+# Link statically on Linux only; other systems get no extra link option.
+LINK_OPTS= 
+if [ `uname -s` == Linux ] ; then
+    LINK_OPTS=-static
+fi
+g++ $LINK_OPTS $ENV_LINK_OPTS -g -o uchime $ObjNames >> mk.stdout 2>> tmp.stderr
+cat tmp.stderr
+cat tmp.stderr >> mk.stderr
+rm -f tmp.stderr
+
+# Strip symbols from the binary, then report its size and checksum.
+strip uchime
+ls -lh uchime
+sum uchime
diff --git a/uchime_src/mx.cpp b/uchime_src/mx.cpp

new file mode 100644 (file)

index 0000000..48c347e
--- /dev/null
+++ b/uchime_src/mx.cpp
@@ -0,0 +1,294 @@
+#include "myutils.h"\r
+#include "mx.h"\r
+#include "seqdb.h"\r
+#include "seq.h"\r
+\r
+char ProbToChar(float p);\r
+\r
+list<MxBase *> *MxBase::m_Matrices = 0;\r
+unsigned MxBase::m_AllocCount;\r
+unsigned MxBase::m_ZeroAllocCount;\r
+unsigned MxBase::m_GrowAllocCount;\r
+double MxBase::m_TotalBytes;\r
+double MxBase::m_MaxBytes;\r
+\r
+// Parse s as a number and return its natural logarithm formatted by\r
+// TypeToStr<float>() (the result lives in that function's static buffer).\r
+static const char *LogizeStr(const char *s)\r
+       {\r
+       const double Value = atof(s);\r
+       const double LogValue = log(Value);\r
+       return TypeToStr<float>(float(LogValue));\r
+       }\r
+\r
+// Parse s as a number and return e raised to that value, formatted by\r
+// TypeToStr<float>() (the result lives in that function's static buffer).\r
+static const char *ExpizeStr(const char *s)\r
+       {\r
+       const double Value = atof(s);\r
+       const double ExpValue = exp(Value);\r
+       return TypeToStr<float>(float(ExpValue));\r
+       }\r
+\r
+// Register Mx in the global list of matrices, creating the list on first use.\r
+void MxBase::OnCtor(MxBase *Mx)\r
+       {\r
+       if (!m_Matrices)\r
+               m_Matrices = new list<MxBase *>;\r
+       asserta(m_Matrices != 0);\r
+\r
+       list<MxBase *> &Registry = *m_Matrices;\r
+       Registry.push_front(Mx);\r
+       }\r
+\r
+// Remove Mx from the global list of matrices. When the list becomes empty\r
+// it is deleted and the pointer reset, so that a later OnCtor() creates a\r
+// fresh list instead of pushing onto freed memory.\r
+// Issues a warning if no list exists or Mx is not in it.\r
+void MxBase::OnDtor(MxBase *Mx)\r
+       {\r
+       if (m_Matrices == 0)\r
+               {\r
+               Warning("MxBase::OnDtor, m_Matrices = 0");\r
+               return;\r
+               }\r
+       for (list<MxBase*>::iterator p = m_Matrices->begin();\r
+         p != m_Matrices->end(); ++p)\r
+               {\r
+               if (*p == Mx)\r
+                       {\r
+                       m_Matrices->erase(p);\r
+                       if (m_Matrices->empty())\r
+                               {\r
+                               delete m_Matrices;\r
+                               // Must not stay dangling: OnCtor() tests it against 0.\r
+                               m_Matrices = 0;\r
+                               }\r
+                       return;\r
+                       }\r
+               }\r
+       Warning("MxBase::OnDtor, not found");\r
+       }\r
+\r
+//float **MxBase::Getf(const string &Name)\r
+//     {\r
+//     Mx<float> *m = (Mx<float> *) Get(Name);\r
+//     asserta(m->GetTypeSize() == sizeof(float));\r
+//     return m->GetData();\r
+//     }\r
+//\r
+//double **MxBase::Getd(const string &Name)\r
+//     {\r
+//     Mx<double> *m = (Mx<double> *) Get(Name);\r
+//     asserta(m->GetTypeSize() == sizeof(double));\r
+//     return m->GetData();\r
+//     }\r
+//\r
+//char **MxBase::Getc(const string &Name)\r
+//     {\r
+//     Mx<char> *m = (Mx<char> *) Get(Name);\r
+//     asserta(m->GetTypeSize() == sizeof(char));\r
+//     return m->GetData();\r
+//     }\r
+\r
+// Allocate for two sequences identified by their ids IdA and IdB in DB.\r
+// Forwards to the full overload with null SeqData pointers.\r
+void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+  const SeqDB *DB, unsigned IdA, unsigned IdB)\r
+       {\r
+       Alloc(Name, RowCount, ColCount, DB, IdA, IdB, 0, 0);\r
+       }\r
+\r
+// Allocate for two sequences given directly as SeqData pointers.\r
+// Forwards to the full overload with no database and both ids set to UINT_MAX.\r
+void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+  const SeqData *SA, const SeqData *SB)\r
+       {\r
+       Alloc(Name, RowCount, ColCount, 0, UINT_MAX, UINT_MAX, SA, SB);\r
+       }\r
+\r
+// Make the matrix at least RowCount x ColCount and record its name and the\r
+// sequences it refers to.\r
+//\r
+// Storage is reallocated only when the request exceeds the currently\r
+// allocated rows or columns. The new storage is square, N x N, where N is the\r
+// largest of RowCount+16, ColCount+16 and the allocated dimensions as read\r
+// after FreeData(). The class-wide allocation counters and byte totals are\r
+// updated along the way.\r
+//\r
+//   Name       - copied into m_Name, truncated to fit.\r
+//   DB/IdA/IdB - optional database and sequence ids; when DB is non-null the\r
+//                ids must not be UINT_MAX and the requested size must be at\r
+//                least sequence length + 1 in each dimension.\r
+//   SA/SB      - optional sequence pointers, stored as given.\r
+void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+  const SeqDB *DB, unsigned IdA, unsigned IdB, const SeqData *SA, const SeqData *SB)\r
+       {\r
+       StartTimer(MxBase_Alloc);\r
+\r
+       ++m_AllocCount;\r
+       if (m_AllocatedRowCount == 0)\r
+               ++m_ZeroAllocCount;\r
+\r
+       if (DB != 0)\r
+               {\r
+               asserta(IdA != UINT_MAX);\r
+               asserta(IdB != UINT_MAX);\r
+               asserta(RowCount >= DB->GetSeqLength(IdA) + 1);\r
+               asserta(ColCount >= DB->GetSeqLength(IdB) + 1);\r
+               }\r
+       if (RowCount > m_AllocatedRowCount || ColCount > m_AllocatedColCount)\r
+               {\r
+               // A grow (as opposed to a first allocation) is counted and\r
+               // optionally logged.\r
+               if (m_AllocatedRowCount > 0)\r
+                       {\r
+                       if (opt_logmemgrows)\r
+                               Log("MxBase::Alloc grow %s %u x %u -> %u x %u, %s bytes\n",\r
+                                 Name, m_AllocatedRowCount, m_AllocatedColCount,\r
+                                 RowCount, ColCount,\r
+                                 IntToStr(GetBytes()));\r
+                       ++m_GrowAllocCount;\r
+                       }\r
+\r
+               // Take the old storage out of the running byte total before freeing.\r
+               m_TotalBytes -= GetBytes();\r
+\r
+               // The Alloc timer is paused so that FreeData is timed separately.\r
+               PauseTimer(MxBase_Alloc);\r
+               StartTimer(MxBase_FreeData);\r
+               FreeData();\r
+               EndTimer(MxBase_FreeData);\r
+               StartTimer(MxBase_Alloc);\r
+\r
+               // 16 extra rows/columns of slack; the larger dimension is used for both.\r
+               unsigned N = max(RowCount + 16, m_AllocatedRowCount);\r
+               unsigned M = max(ColCount + 16, m_AllocatedColCount);\r
+               N = max(N, M);\r
+\r
+               PauseTimer(MxBase_Alloc);\r
+               StartTimer(MxBase_AllocData);\r
+               AllocData(N, N);\r
+               EndTimer(MxBase_AllocData);\r
+               StartTimer(MxBase_Alloc);\r
+\r
+               // Add the new storage and track the high-water mark.\r
+               m_TotalBytes += GetBytes();\r
+               if (m_TotalBytes > m_MaxBytes)\r
+                       m_MaxBytes = m_TotalBytes;\r
+               }\r
+       \r
+       // Bounded copy of the name, explicitly zero-terminated.\r
+       unsigned n = sizeof(m_Name)-1;\r
+       strncpy(m_Name, Name, n);\r
+       m_Name[n] = 0;\r
+       m_RowCount = RowCount;\r
+       m_ColCount = ColCount;\r
+       m_SeqDB = DB;\r
+       m_IdA = IdA;\r
+       m_IdB = IdB;\r
+       m_SA = SA;\r
+       m_SB = SB;\r
+\r
+       EndTimer(MxBase_Alloc);\r
+       }\r
+\r
+// Write a description of the matrix to the log and, if WithData is true and\r
+// the matrix is non-empty, its contents.\r
+//\r
+//   WithData - false logs the header line only.\r
+//   Opts     - bitwise OR of OPT_EXP / OPT_LOG / OPT_ZERO_BASED. OPT_EXP and\r
+//              OPT_LOG transform each printed cell with exp() or log()\r
+//              (OPT_EXP wins in the header tag, OPT_LOG wins per cell).\r
+//              OPT_ZERO_BASED aligns sequence letters with index 0 instead\r
+//              of index 1.\r
+//\r
+// Three layouts are used: a table over m_Alpha if it is non-empty, else a\r
+// table over m_Alpha2 if that is non-empty, else the full row/column grid\r
+// annotated with the letters of sequences A and B when they are known.\r
+void MxBase::LogMe(bool WithData, int Opts) const\r
+       {\r
+       Log("\n");\r
+       if (Opts & OPT_EXP)\r
+               Log("Exp ");\r
+       else if (Opts & OPT_LOG)\r
+               Log("Log ");\r
+       bool ZeroBased = ((Opts & OPT_ZERO_BASED) != 0);\r
+       // Header: name, address, used/allocated dimensions and sequence labels.\r
+       Log("%s(%p) Rows %u/%u, Cols %u/%u",\r
+         m_Name, this,\r
+         m_RowCount, m_AllocatedRowCount,\r
+         m_ColCount, m_AllocatedColCount);\r
+       if (m_SeqDB != 0 && m_IdA != UINT_MAX)\r
+               Log(", A=%s", m_SeqDB->GetLabel(m_IdA));\r
+       else if (m_SA != 0)\r
+               Log(", A=%s", m_SA->Label);\r
+       if (m_SeqDB != 0 && m_IdB != UINT_MAX)\r
+               Log(", B=%s", m_SeqDB->GetLabel(m_IdB));\r
+       else if (m_SB != 0)\r
+               Log(", B=%s", m_SB->Label);\r
+       Log("\n");\r
+       if (!WithData || m_RowCount == 0 || m_ColCount == 0)\r
+               return;\r
+\r
+       // The printed width of cell (0,0) is used as the column width.\r
+       const char *z = GetAsStr(0, 0);\r
+       unsigned Width = strlen(z);\r
+       // Mod = 10^Width, used to keep column numbers within Width digits.\r
+       // NOTE(review): where unsigned is 32 bits this product wraps once\r
+       // Width reaches 10 -- confirm that is harmless.\r
+       unsigned Mod = 1;\r
+       for (unsigned i = 0; i < Width; ++i)\r
+               Mod *= 10;\r
+\r
+       if (m_Alpha[0] != 0)\r
+               {\r
+               // Table over m_Alpha; cells are looked up by character code.\r
+               Log("// Alphabet=%s\n", m_Alpha);\r
+               Log("//      ");\r
+               unsigned n = strlen(m_Alpha);\r
+               for (unsigned j = 0; j < n; ++j)\r
+                       Log(" %*c", Width, m_Alpha[j]);\r
+               Log("\n");\r
+               for (unsigned i = 0; i < n; ++i)\r
+                       {\r
+                       Log("/* %c */ {", m_Alpha[i]);\r
+                       unsigned ci = m_Alpha[i];\r
+                       for (unsigned j = 0; j < n; ++j)\r
+                               {\r
+                               unsigned cj = m_Alpha[j];\r
+                               Log("%s,", GetAsStr(ci, cj));\r
+                               }\r
+                       Log("},  // %c\n", m_Alpha[i]);\r
+                       }\r
+               return;\r
+               }\r
+       else if (m_Alpha2[0] != 0)\r
+               {\r
+               // Table over m_Alpha2; cells are looked up by letter position.\r
+               unsigned n = strlen(m_Alpha2);\r
+               Log("// Alphabet=%s\n", m_Alpha2);\r
+               Log("//      ");\r
+               for (unsigned j = 0; j < n; ++j)\r
+                       Log(" %*c", Width, m_Alpha2[j]);\r
+               Log("\n");\r
+               for (unsigned i = 0; i < n; ++i)\r
+                       {\r
+                       Log("/* %c */ {", m_Alpha2[i]);\r
+                       // NOTE(review): ci is never used in this branch.\r
+                       unsigned ci = m_Alpha2[i];\r
+                       for (unsigned j = 0; j < n; ++j)\r
+                               Log("%s,", GetAsStr(i, j));\r
+                       Log("},  // %c\n", m_Alpha2[i]);\r
+                       }\r
+               return;\r
+               }\r
+\r
+       // Resolve the letters of A and B: the database takes precedence over\r
+       // the directly supplied SeqData pointers.\r
+       const byte *A = 0;\r
+       const byte *B = 0;\r
+       if (m_SeqDB != 0 && m_IdA != UINT_MAX)\r
+               A = m_SeqDB->GetSeq(m_IdA);\r
+       else if (m_SA != 0)\r
+               A = m_SA->Seq;\r
+       if (m_SeqDB != 0 && m_IdB != UINT_MAX)\r
+               B = m_SeqDB->GetSeq(m_IdB);\r
+       else if (m_SB != 0)\r
+               B = m_SB->Seq;\r
+\r
+       // Column header of B's letters. One-based layout leaves column 0\r
+       // blank and puts B[j-1] above column j.\r
+       if (B != 0)\r
+               {\r
+               if (A != 0)\r
+                       Log("  ");\r
+               Log("%5.5s", "");\r
+               if (ZeroBased)\r
+                       for (unsigned j = 0; j < m_ColCount; ++j)\r
+                               Log("%*c", Width, B[j]);\r
+               else\r
+                       for (unsigned j = 0; j < m_ColCount; ++j)\r
+                               Log("%*c", Width, j == 0 ? ' ' : B[j-1]);\r
+               Log("\n");\r
+               }\r
+\r
+       // Column header of column numbers.\r
+       if (A != 0)\r
+               Log("  ");\r
+       Log("%5.5s", "");\r
+       for (unsigned j = 0; j < m_ColCount; ++j)\r
+               Log("%*u", Width, j%Mod);\r
+       Log("\n");\r
+\r
+       // One line per row: optional letter of A, row number, then the cells.\r
+       for (unsigned i = 0; i < m_RowCount; ++i)\r
+               {\r
+               if (A != 0)\r
+                       {\r
+                       if (ZeroBased)\r
+                               Log("%c ", A[i]);\r
+                       else\r
+                               Log("%c ", i == 0 ? ' ' : A[i-1]);\r
+                       }\r
+               Log("%4u ", i);\r
+               \r
+               for (unsigned j = 0; j < m_ColCount; ++j)\r
+                       {\r
+                       const char *s = GetAsStr(i, j);\r
+                       if (Opts & OPT_LOG)\r
+                               s = LogizeStr(s);\r
+                       else if (Opts & OPT_EXP)\r
+                               s = ExpizeStr(s);\r
+                       Log("%s", s);\r
+                       }\r
+               Log("\n");\r
+               }\r
+       }\r
+static unsigned g_MatrixFileCount;\r
+\r
+// Log the class-wide allocation statistics as a small table: number of\r
+// Alloc() calls, calls made with nothing allocated yet, grows, current\r
+// bytes and peak bytes.\r
+void MxBase::LogCounts()\r
+       {\r
+       Log("\n");\r
+       Log("MxBase::LogCounts()\n");\r
+       Log("      What           N\n");\r
+       Log("----------  ----------\n");\r
+       Log("    Allocs  %10u\n", m_AllocCount);\r
+       Log("ZeroAllocs  %10u\n", m_ZeroAllocCount);\r
+       Log("     Grows  %10u\n", m_GrowAllocCount);\r
+       Log("     Bytes  %10.10s\n", MemBytesToStr(m_TotalBytes));\r
+       Log(" Max bytes  %10.10s\n", MemBytesToStr(m_MaxBytes));\r
+       }\r
diff --git a/uchime_src/mx.h b/uchime_src/mx.h

new file mode 100644 (file)

index 0000000..1438900
--- /dev/null
+++ b/uchime_src/mx.h
@@ -0,0 +1,454 @@
+#ifndef mx_h\r
+#define mx_h\r
+\r
+#include <list>\r
+#include <limits.h>\r
+#include <math.h>\r
+#include "timing.h"\r
+#include "myutils.h"\r
+\r
+const int OPT_LOG = 0x01;\r
+const int OPT_EXP = 0x02;\r
+const int OPT_ZERO_BASED = 0x04;\r
+const float MINUS_INFINITY = -9e9f;\r
+const float UNINIT = -8e8f;\r
+\r
+struct SeqData;\r
+\r
+template<class T> const char *TypeToStr(T t)\r
+       {\r
+       Die("Unspecialised TypeToStr() called");\r
+       ureturn(0);\r
+       }\r
+\r
+template<> inline const char *TypeToStr<unsigned short>(unsigned short f)\r
+       {\r
+       static char s[16];\r
+\r
+       sprintf(s, "%12u", f);\r
+       return s;\r
+       }\r
+\r
+template<> inline const char *TypeToStr<short>(short f)\r
+       {\r
+       static char s[16];\r
+\r
+       sprintf(s, "%12d", f);\r
+       return s;\r
+       }\r
+\r
+template<> inline const char *TypeToStr<int>(int f)\r
+       {\r
+       static char s[16];\r
+\r
+       sprintf(s, "%5d", f);\r
+       return s;\r
+       }\r
+\r
+template<> inline const char *TypeToStr<float>(float f)\r
+       {\r
+       static char s[16];\r
+\r
+       if (f == UNINIT)\r
+               sprintf(s, "%12.12s", "?");\r
+       else if (f < MINUS_INFINITY/2)\r
+               sprintf(s, "%12.12s", "*");\r
+       else if (f == 0.0f)\r
+               sprintf(s, "%12.12s", ".");\r
+       else if (f >= -1e5 && f <= 1e5)\r
+               sprintf(s, "%12.5f", f);\r
+       else\r
+               sprintf(s, "%12.4g", f);\r
+       return s;\r
+       }\r
+\r
+template<> inline const char *TypeToStr<double>(double f)\r
+       {\r
+       static char s[16];\r
+\r
+       if (f < -1e9)\r
+               sprintf(s, "%12.12s", "*");\r
+       else if (f == 0.0f)\r
+               sprintf(s, "%12.12s", ".");\r
+       else if (f >= -1e-5 && f <= 1e5)\r
+               sprintf(s, "%12.5f", f);\r
+       else\r
+               sprintf(s, "%12.4g", f);\r
+       return s;\r
+       }\r
+\r
+static inline const char *FloatToStr(float f, string &s)\r
+       {\r
+       s = TypeToStr<float>(f);\r
+       return s.c_str();\r
+       }\r
+\r
+template<> inline const char *TypeToStr<char>(char c)\r
+       {\r
+       static char s[2];\r
+       s[0] = c;\r
+       return s;\r
+       }\r
+\r
+template<> inline const char *TypeToStr<byte>(byte c)\r
+       {\r
+       static char s[2];\r
+       s[0] = c;\r
+       return s;\r
+       }\r
+\r
+template<> inline const char *TypeToStr<bool>(bool tof)\r
+       {\r
+       static char s[2];\r
+       s[0] = tof ? 'T' : 'F';\r
+       return s;\r
+       }\r
+\r
+struct SeqDB;\r
+\r
+struct MxBase\r
+       {\r
+private:\r
+       MxBase(const MxBase &rhs);\r
+       MxBase &operator=(const MxBase &rhs);\r
+\r
+public:\r
+       char m_Name[32];\r
+       char m_Alpha[32];\r
+       char m_Alpha2[32];\r
+       unsigned m_RowCount;\r
+       unsigned m_ColCount;\r
+       unsigned m_AllocatedRowCount;\r
+       unsigned m_AllocatedColCount;\r
+       const SeqDB *m_SeqDB;\r
+       unsigned m_IdA;\r
+       unsigned m_IdB;\r
+       const SeqData *m_SA;\r
+       const SeqData *m_SB;\r
+\r
+       static list<MxBase *> *m_Matrices;\r
+       //static MxBase *Get(const string &Name);\r
+       //static float **Getf(const string &Name);\r
+       //static double **Getd(const string &Name);\r
+       //static char **Getc(const string &Name);\r
+\r
+       static unsigned m_AllocCount;\r
+       static unsigned m_ZeroAllocCount;\r
+       static unsigned m_GrowAllocCount;\r
+       static double m_TotalBytes;\r
+       static double m_MaxBytes;\r
+\r
+       static void OnCtor(MxBase *Mx);\r
+       static void OnDtor(MxBase *Mx);\r
+\r
+       MxBase()\r
+               {\r
+               m_AllocatedRowCount = 0;\r
+               m_AllocatedColCount = 0;\r
+               m_RowCount = 0;\r
+               m_ColCount = 0;\r
+               m_IdA = UINT_MAX;\r
+               m_IdB = UINT_MAX;\r
+               m_SeqDB = 0;\r
+               OnCtor(this);\r
+               }\r
+       virtual ~MxBase()\r
+               {\r
+               OnDtor(this);\r
+               }\r
+\r
+       virtual unsigned GetTypeSize() const = 0;\r
+       virtual unsigned GetBytes() const = 0;\r
+\r
+       void Clear()\r
+               {\r
+               FreeData();\r
+               m_AllocatedRowCount = 0;\r
+               m_AllocatedColCount = 0;\r
+               m_RowCount = 0;\r
+               m_ColCount = 0;\r
+               m_IdA = UINT_MAX;\r
+               m_IdB = UINT_MAX;\r
+               m_SA = 0;\r
+               m_SB = 0;\r
+               }\r
+\r
+       bool Empty() const\r
+               {\r
+               return m_RowCount == 0;\r
+               }\r
+\r
+       virtual void AllocData(unsigned RowCount, unsigned ColCount) = 0;\r
+       virtual void FreeData() = 0;\r
+       virtual const char *GetAsStr(unsigned i, unsigned j) const = 0;\r
+\r
+       void SetAlpha(const char *Alpha)\r
+               {\r
+               unsigned n = sizeof(m_Alpha);\r
+               strncpy(m_Alpha, Alpha, n);\r
+               m_Alpha[n] = 0;\r
+               }\r
+\r
+       void Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+         const SeqDB *DB, unsigned IdA, unsigned IdB,\r
+         const SeqData *SA, const SeqData *SB);\r
+\r
+       void Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+         const SeqDB *DB = 0, unsigned IdA = UINT_MAX, unsigned IdB = UINT_MAX);\r
+\r
+       void Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+         const SeqData *SA, const SeqData *SB);\r
+\r
+       static void LogAll()\r
+               {\r
+               Log("\n");\r
+               if (m_Matrices == 0)\r
+                       {\r
+                       Log("MxBase::m_Matrices=0\n");\r
+                       return;\r
+                       }\r
+               Log("\n");\r
+               Log("AllRows  AllCols    Sz        MB  Name\n");\r
+               Log("-------  -------  ----  --------  ----\n");\r
+               double TotalMB = 0;\r
+               for (list<MxBase *>::const_iterator p = m_Matrices->begin();\r
+                 p != m_Matrices->end(); ++p)\r
+                       {\r
+                       const MxBase *Mx = *p;\r
+                       if (Mx == 0)\r
+                               continue;\r
+                       //if (Mx->m_RowCount != 0 || ShowEmpty)\r
+                       //      Mx->LogMe(WithData);\r
+                       unsigned ar = Mx->m_AllocatedRowCount;\r
+                       if (ar == 0)\r
+                               continue;\r
+                       unsigned ac = Mx->m_AllocatedColCount;\r
+                       unsigned sz = Mx->GetTypeSize();\r
+                       double MB = (double) ar*(double) ac*(double) sz/1e6;\r
+                       TotalMB += MB;\r
+                       Log("%7u  %7u  %4u  %8.2f  %s\n", ar, ac, sz, MB, Mx->m_Name);\r
+                       }\r
+               Log("                        --------\n");\r
+               Log("%7.7s  %7.7s  %4.4s  %8.2f\n", "", "", "", TotalMB);\r
+               }\r
+\r
+       void LogMe(bool WithData = true, int Opts = 0) const;\r
+       static void LogCounts();\r
+       };\r
+\r
+template<class T> struct Mx : public MxBase\r
+       {\r
+// Disable unimplemented stuff\r
+private:\r
+       Mx(Mx &rhs);\r
+       Mx &operator=(Mx &rhs);\r
+       // const Mx &operator=(const Mx &rhs) const;\r
+\r
+public:\r
+       T **m_Data;\r
+\r
+       Mx()\r
+               {\r
+               m_Data = 0;\r
+               }\r
+       \r
+       ~Mx()\r
+               {\r
+               FreeData();\r
+               }\r
+\r
+       virtual void AllocData(unsigned RowCount, unsigned ColCount)\r
+               {\r
+               if (opt_logmemgrows)\r
+                       Log("MxBase::AllocData(%u,%u) %s bytes, Name=%s\n",\r
+                         RowCount, ColCount, IntToStr(GetBytes()), m_Name);\r
+               // m_Data = myalloc<T *>(RowCount);\r
+               m_Data = MYALLOC(T *, RowCount, Mx);\r
+               for (unsigned i = 0; i < RowCount; ++i)\r
+                       // m_Data[i] = myalloc<T>(ColCount);\r
+                       m_Data[i] = MYALLOC(T, ColCount, Mx);\r
+               AddBytes("Mx_AllocData", RowCount*sizeof(T *) + RowCount*ColCount*sizeof(T));\r
+\r
+               m_AllocatedRowCount = RowCount;\r
+               m_AllocatedColCount = ColCount;\r
+               }\r
+\r
+       virtual void FreeData()\r
+               {\r
+               for (unsigned i = 0; i < m_AllocatedRowCount; ++i)\r
+                       MYFREE(m_Data[i], m_AllocatedColCount, Mx);\r
+               MYFREE(m_Data, m_AllocatedRowCount, Mx);\r
+               SubBytes("Mx_AllocData",\r
+                 m_AllocatedRowCount*sizeof(T *) + m_AllocatedRowCount*m_AllocatedColCount*sizeof(T));\r
+\r
+               m_Data = 0;\r
+               m_RowCount = 0;\r
+               m_ColCount = 0;\r
+               m_AllocatedRowCount = 0;\r
+               m_AllocatedColCount = 0;\r
+               }\r
+\r
+       T **GetData()\r
+               {\r
+               return (T **) m_Data;\r
+               }\r
+\r
+       T Get(unsigned i, unsigned j) const\r
+               {\r
+               assert(i < m_RowCount);\r
+               assert(j < m_ColCount);\r
+               return m_Data[i][j];\r
+               }\r
+\r
+       void Put(unsigned i, unsigned j, T x) const\r
+               {\r
+               assert(i < m_RowCount);\r
+               assert(j < m_ColCount);\r
+               m_Data[i][j] = x;\r
+               }\r
+\r
+       T GetOffDiagAvgs(vector<T> &Avgs) const\r
+               {\r
+               if (m_RowCount != m_ColCount)\r
+                       Die("GetOffDiagAvgs, not symmetrical");\r
+               Avgs.clear();\r
+               T Total = T(0);\r
+               for (unsigned i = 0; i < m_RowCount; ++i)\r
+                       {\r
+                       T Sum = T(0);\r
+                       for (unsigned j = 0; j < m_ColCount; ++j)\r
+                               {\r
+                               if (j == i)\r
+                                       continue;\r
+                               Sum += m_Data[i][j];\r
+                               }\r
+                       T Avg = Sum/(m_RowCount-1);\r
+                       Total += Avg;\r
+                       Avgs.push_back(Avg);\r
+                       }\r
+               return m_RowCount == 0 ? T(0) : Total/m_RowCount;\r
+               }\r
+\r
+       unsigned GetTypeSize() const\r
+               {\r
+               return sizeof(T);\r
+               }\r
+\r
+       virtual unsigned GetBytes() const\r
+               {\r
+               return m_AllocatedRowCount*m_AllocatedColCount*GetTypeSize() +\r
+                 m_AllocatedRowCount*sizeof(T *);\r
+               }\r
+\r
+       const char *GetAsStr(unsigned i, unsigned j) const\r
+               {\r
+               return TypeToStr<T>(Get(i, j));\r
+               }\r
+\r
+       const T *const *const GetData() const\r
+               {\r
+               return (const T *const *) m_Data;\r
+               }\r
+\r
+       void Copy(const Mx<T> &rhs)\r
+               {\r
+               Alloc("Copy", rhs.m_RowCount, rhs.m_ColCount, rhs.m_SeqDB, rhs.m_IdA, rhs.m_IdB);\r
+               const T * const *Data = rhs.GetData();\r
+               for (unsigned i = 0; i < m_RowCount; ++i)\r
+                       for (unsigned j = 0; j < m_ColCount; ++j)\r
+                               m_Data[i][j] = Data[i][j];\r
+               }\r
+\r
+       void Assign(T v)\r
+               {\r
+               for (unsigned i = 0; i < m_RowCount; ++i)\r
+                       for (unsigned j = 0; j < m_ColCount; ++j)\r
+                               m_Data[i][j] = v;\r
+               }\r
+\r
+       bool Eq(const Mx &rhs, bool Bwd = false) const\r
+               {\r
+               if (rhs.m_ColCount != m_ColCount)\r
+                       return false;\r
+               if (rhs.m_RowCount != m_RowCount)\r
+                       return false;\r
+               const T * const*d = rhs.GetData();\r
+               int i1 = Bwd ? m_RowCount : 0;\r
+               int j1 = Bwd ? m_ColCount : 0;\r
+               int i2 = Bwd ? -1 : m_RowCount;\r
+               int j2 = Bwd ? -1 : m_ColCount;\r
+               for (int i = i1; i != i2; Bwd ? --i : ++i)\r
+                       for (int j = j1; j != j2; Bwd ? --j : ++j)\r
+                               {\r
+                               float x = m_Data[i][j];\r
+                               float y = d[i][j];\r
+                               if (x < -1e10 && y < -1e10)\r
+                                       continue;\r
+                               if (!feq(x, y))\r
+                                       {\r
+                                       Warning("%s[%d][%d] = %g, %s = %g",\r
+                                         m_Name, i, j, x, rhs.m_Name, y);\r
+                                       return false;\r
+                                       }\r
+                               }\r
+               return true;\r
+               }\r
+\r
+       bool EqMask(const Mx &rhs, const Mx<bool> &Mask) const\r
+               {\r
+               if (rhs.m_ColCount != m_ColCount)\r
+                       return false;\r
+               if (rhs.m_RowCount != m_RowCount)\r
+                       return false;\r
+\r
+               if (Mask.m_ColCount != m_ColCount)\r
+                       return false;\r
+               if (Mask.m_RowCount != m_RowCount)\r
+                       return false;\r
+\r
+               const T * const*d = rhs.GetData();\r
+               bool Bwd = false;\r
+               int i1 = Bwd ? m_RowCount : 0;\r
+               int j1 = Bwd ? m_ColCount : 0;\r
+               int i2 = Bwd ? -1 : m_RowCount;\r
+               int j2 = Bwd ? -1 : m_ColCount;\r
+               for (int i = i1; i != i2; Bwd ? --i : ++i)\r
+                       for (int j = j1; j != j2; Bwd ? --j : ++j)\r
+                               {\r
+                               if (!Mask.m_Data[i][j])\r
+                                       continue;\r
+                               float x = m_Data[i][j];\r
+                               float y = d[i][j];\r
+                               if (x < -1e10 && y < -1e10)\r
+                                       continue;\r
+                               if (!feq(x, y))\r
+                                       {\r
+                                       Warning("%s[%d][%d] = %g, %s = %g",\r
+                                         m_Name, i, j, x, rhs.m_Name, y);\r
+                                       return false;\r
+                                       }\r
+                               }\r
+               return true;\r
+               }\r
+\r
+       void Init(T v)\r
+               {\r
+               for (unsigned i = 0; i < m_RowCount; ++i)\r
+                       for (unsigned j = 0; j < m_ColCount; ++j)\r
+                               m_Data[i][j] = v;\r
+               }\r
+       };\r
+\r
+void WriteMx(const string &Name, Mx<float> &Mxf);\r
+\r
+template<class T> void ReserveMx(Mx<T> &Mxf, unsigned N = UINT_MAX)\r
+       {\r
+       if (Mxf.m_AllocatedRowCount > 0)\r
+               return;\r
+       extern unsigned g_MaxInputSeqLength;\r
+       if (N == UINT_MAX)\r
+               N = g_MaxInputSeqLength+1;\r
+       Mxf.Alloc("(Reserved)", N, N);\r
+       }\r
+\r
+#endif // mx_h\r
diff --git a/uchime_src/myopts.h b/uchime_src/myopts.h

new file mode 100644 (file)

index 0000000..ba901ea
--- /dev/null
+++ b/uchime_src/myopts.h
@@ -0,0 +1,190 @@
+#ifndef MY_VERSION\r
+#define MY_VERSION     "4.2"\r
+#endif\r
+\r
+STR_OPT(       input,                                  0)\r
+STR_OPT(       query,                                  0)\r
+STR_OPT(       db,                                             0)\r
+STR_OPT(       sort,                                   0)\r
+STR_OPT(       output,                                 0)\r
+STR_OPT(       uc,                                             0)\r
+STR_OPT(       clstr2uc,                               0)\r
+STR_OPT(       uc2clstr,                               0)\r
+STR_OPT(       uc2fasta,                               0)\r
+STR_OPT(       uc2fastax,                              0)\r
+STR_OPT(       mergesort,                              0)\r
+STR_OPT(       tmpdir,                                 ".")\r
+STR_OPT(       staralign,                              0)\r
+STR_OPT(       sortuc,                                 0)\r
+STR_OPT(       blastout,                               0)\r
+STR_OPT(       blast6out,                              0)\r
+STR_OPT(       fastapairs,                             0)\r
+STR_OPT(       idchar,                                 "|")\r
+STR_OPT(       diffchar,                               " ")\r
+STR_OPT(       uchime,                                 0)\r
+STR_OPT(       gapopen,                                0)\r
+STR_OPT(       gapext,                                 0)\r
+STR_OPT(       uhire,                                  0)\r
+STR_OPT(       ids,                                    "99,98,95,90,85,80,70,50,35")\r
+STR_OPT(       seeds,                                  0)\r
+STR_OPT(       clump,                                  0)\r
+STR_OPT(       clumpout,                               0)\r
+STR_OPT(       clump2fasta,                    0)\r
+STR_OPT(       clumpfasta,                             0)\r
+STR_OPT(       hireout,                                0)\r
+STR_OPT(       mergeclumps,                    0)\r
+STR_OPT(       alpha,                                  0)\r
+STR_OPT(       hspalpha,                               0)\r
+STR_OPT(       probmx,                                 0)\r
+STR_OPT(       matrix,                                 0)\r
+STR_OPT(       tracestate,                             0)\r
+STR_OPT(       chainout,                               0)\r
+STR_OPT(       cluster,                                0)\r
+STR_OPT(       computekl,                              0)\r
+STR_OPT(       userout,                                0)\r
+STR_OPT(       userfields,                             0)\r
+STR_OPT(       seedsout,                               0)\r
+STR_OPT(       chainhits,                              0)\r
+STR_OPT(       findorfs,                               0)\r
+STR_OPT(       strand,                                 0)\r
+STR_OPT(       getseqs,                                0)\r
+STR_OPT(       labels,                                 0)\r
+STR_OPT(       doug,                                   0)\r
+STR_OPT(       makeindex,                              0)\r
+STR_OPT(       indexstats,                             0)\r
+STR_OPT(       uchimeout,                              0)\r
+STR_OPT(       uchimealns,                             0)\r
+STR_OPT(       xframe,                                 0)\r
+STR_OPT(       mkctest,                                0)\r
+STR_OPT(       allpairs,                               0)\r
+STR_OPT(       fastq2fasta,                    0)\r
+STR_OPT(       otusort,                                0)\r
+STR_OPT(       sparsedist,                             0)\r
+STR_OPT(       sparsedistparams,               0)\r
+STR_OPT(       mcc,                                    0)\r
+STR_OPT(       utax,                                   0)\r
+STR_OPT(       simcl,                                  0)\r
+STR_OPT(       absort,                                 0)\r
+STR_OPT(       cc,                                             0)\r
+STR_OPT(       uslink,                                 0)\r
+\r
+UNS_OPT(       band,                                   16,                     0,                      UINT_MAX)\r
+UNS_OPT(       minlen,                                 10,                     1,                      UINT_MAX)\r
+UNS_OPT(       maxlen,                                 10000,          1,                      UINT_MAX)\r
+UNS_OPT(       w,                                              0,                      1,                      UINT_MAX)\r
+UNS_OPT(       k,                                              0,                      1,                      UINT_MAX)\r
+UNS_OPT(       stepwords,                              8,                      0,                      UINT_MAX)\r
+UNS_OPT(       maxaccepts,                             1,                      0,                      UINT_MAX)\r
+UNS_OPT(       maxrejects,                             8,                      0,                      UINT_MAX)\r
+UNS_OPT(       maxtargets,                             0,                      0,                      UINT_MAX)\r
+UNS_OPT(       minhsp,                                 32,                     1,                      UINT_MAX)\r
+UNS_OPT(       bump,                                   50,                     0,                      100)\r
+UNS_OPT(       rowlen,                                 64,                     8,                      UINT_MAX)\r
+UNS_OPT(       idprefix,                               0,                      0,                      UINT_MAX)\r
+UNS_OPT(       idsuffix,                               0,                      0,                      UINT_MAX)\r
+UNS_OPT(       chunks,                                 4,                      2,                      UINT_MAX)\r
+UNS_OPT(       minchunk,                               64,                     2,                      UINT_MAX)\r
+UNS_OPT(       maxclump,                               1000,           1,                      UINT_MAX)\r
+UNS_OPT(       iddef,                                  0,                      0,                      UINT_MAX)\r
+UNS_OPT(       mincodons,                              20,                     1,                      UINT_MAX)\r
+UNS_OPT(       maxovd,                                 8,                      0,                      UINT_MAX)\r
+UNS_OPT(       max2,                                   40,                     0,                      UINT_MAX)\r
+UNS_OPT(       querylen,                               500,            0,                      UINT_MAX)\r
+UNS_OPT(       targetlen,                              500,            0,                      UINT_MAX)\r
+UNS_OPT(       orfstyle,                               (1+2+4),        0,                      UINT_MAX)\r
+UNS_OPT(       dbstep,                                 1,                      1,                      UINT_MAX)\r
+UNS_OPT(       randseed,                               1,                      0,                      UINT_MAX)\r
+UNS_OPT(       maxp,                                   2,                      2,                      UINT_MAX)\r
+UNS_OPT(       idsmoothwindow,                 32,                     1,                      UINT_MAX)\r
+UNS_OPT(       mindiffs,                               3,                      1,                      UINT_MAX)\r
+UNS_OPT(       maxspan1,                               24,                     1,                      UINT_MAX)\r
+UNS_OPT(       maxspan2,                               24,                     1,                      UINT_MAX)\r
+UNS_OPT(       minorfcov,                              16,                     1,                      UINT_MAX)\r
+UNS_OPT(       hashsize,                               4195879,        1,                      UINT_MAX)\r
+UNS_OPT(       maxpoly,                                0,                      0,                      UINT_MAX)\r
+UNS_OPT(       droppct,                                50,                     0,                      100)\r
+UNS_OPT(       secs,                                   10,                     0,                      UINT_MAX)\r
+UNS_OPT(       maxqgap,                                0,                      0,                      UINT_MAX)\r
+UNS_OPT(       maxtgap,                                0,                      0,                      UINT_MAX)\r
+\r
+INT_OPT(       frame,                                  0,                      -3,                     +3)\r
+\r
+TOG_OPT(       trace,                                  false)\r
+TOG_OPT(       logmemgrows,                    false)\r
+TOG_OPT(       trunclabels,                    false)\r
+TOG_OPT(       verbose,                                false)\r
+TOG_OPT(       wordcountreject,                true)\r
+TOG_OPT(       rev,                                    false)\r
+TOG_OPT(       output_rejects,                 false)\r
+TOG_OPT(       blast_termgaps,                 false)\r
+TOG_OPT(       fastalign,                              true)\r
+TOG_OPT(       flushuc,                                false)\r
+TOG_OPT(       stable_sort,                    false)\r
+TOG_OPT(       minus_frames,                   true)\r
+TOG_OPT(       usort,                                  true)\r
+TOG_OPT(       nb,                                             false)\r
+TOG_OPT(       twohit,                                 true)\r
+TOG_OPT(       ssort,                                  false)\r
+TOG_OPT(       log_query,                              false)\r
+TOG_OPT(       log_hothits,                    false)\r
+TOG_OPT(       logwordstats,                   false)\r
+TOG_OPT(       ucl,                                    false)\r
+TOG_OPT(       skipgaps2,                              true)\r
+TOG_OPT(       skipgaps,                               true)\r
+TOG_OPT(       denovo,                                 false)\r
+TOG_OPT(       cartoon_orfs,                   false)\r
+TOG_OPT(       label_ab,                               false)\r
+TOG_OPT(       wordweight,                             false)\r
+TOG_OPT(       isort,                                  false)\r
+TOG_OPT(       selfid,                                 false)\r
+TOG_OPT(       leftjust,                               false)\r
+TOG_OPT(       rightjust,                              false)\r
+\r
+FLT_OPT(       id,                                             0.0,            0.0,            1.0)\r
+FLT_OPT(       weak_id,                                0.0,            0.0,            1.0)\r
+FLT_OPT(       match,                                  1.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       mismatch,                               -2.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       split,                                  1000.0,         1.0,            FLT_MAX)\r
+FLT_OPT(       evalue,                                 10.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       weak_evalue,                    10.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       evalue_g,                               10.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       chain_evalue,                   10.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       xdrop_u,                                16.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       xdrop_g,                                32.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       xdrop_ug,                               16.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       xdrop_nw,                               16.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       ka_gapped_lambda,               0.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       ka_ungapped_lambda,             0.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       ka_gapped_k,                    0.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       ka_ungapped_k,                  0.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       ka_dbsize,                              0.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       chain_targetfract,              0.0,            0.0,            1.0)\r
+FLT_OPT(       targetfract,                    0.0,            0.0,            1.0)\r
+FLT_OPT(       queryfract,                             0.0,            0.0,            1.0)\r
+FLT_OPT(       fspenalty,                              16.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       sspenalty,                              20.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       seedt1,                                 13.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       seedt2,                                 11.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       lopen,                                  11.0,           0.0,            FLT_MAX)\r
+FLT_OPT(       lext,                                   1.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       minh,                                   0.3,            0.0,            FLT_MAX)\r
+FLT_OPT(       xn,                                             8.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       dn,                                             1.4,            0.0,            FLT_MAX)\r
+FLT_OPT(       xa,                                             1.0,            0.0,            FLT_MAX)\r
+FLT_OPT(       mindiv,                                 0.5,            0.0,            100.0)\r
+FLT_OPT(       abskew,                                 2,                      0.0,            100.0)\r
+FLT_OPT(       abx,                                    8.0,            0.0,            100.0)\r
+FLT_OPT(       minspanratio1,                  0.7,            0.0,            1.0)\r
+FLT_OPT(       minspanratio2,                  0.7,            0.0,            1.0)\r
+\r
+FLAG_OPT(      usersort)\r
+FLAG_OPT(      exact)\r
+FLAG_OPT(      optimal)\r
+FLAG_OPT(      self)\r
+FLAG_OPT(      ungapped)\r
+FLAG_OPT(      global)\r
+FLAG_OPT(      local)\r
+FLAG_OPT(      xlat)\r
+FLAG_OPT(      realign)\r
+FLAG_OPT(      hash)\r
+FLAG_OPT(      derep)\r
diff --git a/uchime_src/myutils.cpp b/uchime_src/myutils.cpp

new file mode 100644 (file)

index 0000000..4fa92b1
--- /dev/null
+++ b/uchime_src/myutils.cpp
@@ -0,0 +1,1844 @@
+#include <time.h>\r
+#include <stdarg.h>\r
+#include <sys/stat.h>\r
+#include <errno.h>\r
+#include <string.h>\r
+#include <ctype.h>\r
+#include <string>\r
+#include <vector>\r
+#include <set>\r
+#include <map>\r
+#include <signal.h>\r
+#include <float.h>\r
+\r
+#ifdef _MSC_VER\r
+#include <crtdbg.h>\r
+#include <process.h>\r
+#include <windows.h>\r
+#include <psapi.h>\r
+#include <io.h>\r
+#else\r
+#include <sys/time.h>\r
+#include <sys/resource.h>\r
+#include <unistd.h>\r
+#include <errno.h>\r
+#include <fcntl.h>\r
+#include <stdlib.h>\r
+#endif\r
+\r
+#include "myutils.h"\r
+\r
+const char *SVN_VERSION =\r
+#include "svnversion.h"\r
+;\r
+\r
+#define        TEST_UTILS                      0\r
+\r
+using namespace std;\r
+\r
+const unsigned MY_IO_BUFSIZ = 32000;\r
+const unsigned MAX_FORMATTED_STRING_LENGTH = 64000;\r
+\r
+static char *g_IOBuffers[256];\r
+static time_t g_StartTime = time(0);\r
+static vector<string> g_Argv;\r
+static double g_PeakMemUseBytes;\r
+\r
+#if    TEST_UTILS\r
+void TestUtils()\r
+       { /* Manual exercise of ProgressStep/Progress; only compiled when TEST_UTILS is non-zero. */\r
+       const int C = 100000000;\r
+       for (int i = 0; i < C; ++i)\r
+               ProgressStep(i, C, "something or other");\r
+\r
+       Progress("\n");\r
+       Progress("Longer message\r"); /* messages ending in '\r' are overwritten by the next one */\r
+       Sleep(1000);\r
+       Progress("Short\r");\r
+       Sleep(1000);\r
+       Progress("And longer again\r");\r
+       Sleep(1000);\r
+       Progress("Shrt\n");\r
+       Sleep(1000);\r
+       const unsigned N = 10;\r
+       unsigned M = 10;\r
+       for (unsigned i = 0; i < N; ++i)\r
+               {\r
+               ProgressStep(i, N, "Allocating 1MB blocks");\r
+               for (unsigned j = 0; j < M; ++j)\r
+                       {\r
+                       ProgressStep(j, M, "Inner loop"); \r
+                       malloc(100000); /* result discarded, so never freed; 100000 bytes per call, not the 1MB in the message */\r
+                       Sleep(500);\r
+                       }\r
+               }\r
+       }\r
+#endif // TEST_UTILS\r
+\r
+static void AllocBuffer(FILE *f)\r
+       { /* Attach a MY_IO_BUFSIZ-byte, fully buffered (_IOFBF) buffer to f, cached per descriptor in g_IOBuffers. */\r
+       int fd = fileno(f);\r
+       if (fd < 0 || fd >= 256) /* g_IOBuffers has 256 slots; other descriptors are left untouched */\r
+               return;\r
+       if (g_IOBuffers[fd] == 0) /* allocate lazily the first time this slot is used */\r
+               g_IOBuffers[fd] = myalloc(char, MY_IO_BUFSIZ);\r
+       setvbuf(f, g_IOBuffers[fd], _IOFBF, MY_IO_BUFSIZ); /* return value is not checked */\r
+       }\r
+\r
+static void FreeBuffer(FILE *f)\r
+       { /* Release the g_IOBuffers slot belonging to f's descriptor, if one was allocated. */\r
+       int fd = fileno(f); /* NOTE(review): fileno() needs a stream that is still open - confirm at call sites */\r
+       if (fd < 0 || fd >= 256) /* same slot range as AllocBuffer */\r
+               return;\r
+       if (g_IOBuffers[fd] == 0)\r
+               return;\r
+       myfree(g_IOBuffers[fd]);\r
+       g_IOBuffers[fd] = 0; /* mark the slot empty so AllocBuffer allocates afresh */\r
+       }\r
+\r
+unsigned GetElapsedSecs()\r
+       { /* Seconds elapsed since g_StartTime, which is set by time(0) during static initialisation. */\r
+       return (unsigned) (time(0) - g_StartTime);\r
+       }\r
+\r
+static unsigned g_NewCalls;\r
+static unsigned g_FreeCalls;\r
+static double g_InitialMemUseBytes;\r
+static double g_TotalAllocBytes;\r
+static double g_TotalFreeBytes;\r
+static double g_NetBytes;\r
+static double g_MaxNetBytes;\r
+\r
+void LogAllocStats()\r
+       { /* Write the file-scope allocation counters to the log; a no-op when no log file is open (see Log). */\r
+       Log("\n");\r
+       Log("       Allocs  %u\n", g_NewCalls);\r
+       Log("        Frees  %u\n", g_FreeCalls);\r
+       Log("Initial alloc  %s\n", MemBytesToStr(g_InitialMemUseBytes)); /* one MemBytesToStr per Log call: it returns a shared static buffer */\r
+       Log("  Total alloc  %s\n", MemBytesToStr(g_TotalAllocBytes));\r
+       Log("   Total free  %s\n", MemBytesToStr(g_TotalFreeBytes));\r
+       Log("    Net bytes  %s\n", MemBytesToStr(g_NetBytes));\r
+       Log("Max net bytes  %s\n", MemBytesToStr(g_MaxNetBytes));\r
+       Log("   Peak total  %s\n", MemBytesToStr(g_MaxNetBytes + g_InitialMemUseBytes)); /* peak net plus initial usage */\r
+       }\r
+\r
+bool StdioFileExists(const string &FileName)\r
+       { /* True when stat() succeeds on FileName; any stat() failure is reported as "does not exist". */\r
+       struct stat SD;\r
+       int i = stat(FileName.c_str(), &SD); /* SD itself is not inspected, only the status */\r
+       return i == 0;\r
+       }\r
+\r
+void myassertfail(const char *Exp, const char *File, unsigned Line)\r
+       { /* Assertion-failure handler: reports "File(Line) assert failed: Exp" through Die(), which exits. */\r
+       Die("%s(%u) assert failed: %s", File, Line, Exp);\r
+       }\r
+\r
+bool myisatty(int fd)\r
+       { /* bool wrapper around isatty(): true when isatty(fd) returns non-zero. */\r
+       return isatty(fd) != 0;\r
+       }\r
+\r
+#ifdef _MSC_VER\r
+#include <io.h>\r
+int fseeko(FILE *stream, off_t offset, int whence)\r
+       {\r
+       off_t FilePos = _fseeki64(stream, offset, whence);\r
+       return (FilePos == -1L) ? -1 : 0;\r
+       }\r
+#define ftello(fm) (off_t) _ftelli64(fm)\r
+#endif\r
+\r
+void LogStdioFileState(FILE *f)\r
+       {\r
+       unsigned long tellpos = (unsigned long) ftello(f);\r
+       long fseek_pos = fseek(f, 0, SEEK_CUR);\r
+       int fd = fileno(f);\r
+       Log("FILE *     %p\n", f);\r
+       Log("fileno     %d\n", fd);\r
+       Log("feof       %d\n", feof(f));\r
+       Log("ferror     %d\n", ferror(f));\r
+       Log("ftell      %ld\n", tellpos);\r
+       Log("fseek      %ld\n", fseek_pos);\r
+#if    !defined(_GNU_SOURCE) && !defined(__APPLE_CC__)\r
+       fpos_t fpos;\r
+       int fgetpos_retval = fgetpos(f, &fpos);\r
+       Log("fpos       %ld (retval %d)\n", (long) fpos, fgetpos_retval);\r
+//     Log("eof        %d\n", _eof(fd));\r
+#endif\r
+#ifdef _MSC_VER\r
+       __int64 pos64 = _ftelli64(f);\r
+       Log("_ftelli64  %lld\n", pos64);\r
+#endif\r
+       }\r
+\r
+FILE *OpenStdioFile(const string &FileName)\r
+       {\r
+       const char *Mode = "rb";\r
+       FILE *f = fopen(FileName.c_str(), Mode);\r
+       if (f == 0)\r
+               {\r
+               if (errno == EFBIG)\r
+                       {\r
+                       if (sizeof(off_t) == 4)\r
+                               Die("File too big, off_t is 32 bits, recompile needed");\r
+                       else\r
+                               Die("Cannot open '%s', file too big (off_t=%u bits)",\r
+                                 FileName.c_str(), sizeof(off_t)*8);\r
+                       }\r
+               Die("Cannot open %s, errno=%d %s",\r
+                 FileName.c_str(), errno, strerror(errno));\r
+               }\r
+       AllocBuffer(f);\r
+       return f;\r
+       }\r
+\r
+FILE *CreateStdioFile(const string &FileName)\r
+       { /* Open FileName with mode "wb+" and give it a large I/O buffer; Die() if it cannot be opened. */\r
+       FILE *f = fopen(FileName.c_str(), "wb+");\r
+       if (0 == f)\r
+               Die("Cannot create %s, errno=%d %s",\r
+                 FileName.c_str(), errno, strerror(errno));\r
+       AllocBuffer(f);\r
+       return f;\r
+       }\r
+\r
+void SetStdioFilePos(FILE *f, off_t Pos)\r
+       { /* Seek f to absolute offset Pos and verify with ftello; any failure ends in Die(). */\r
+       if (0 == f)\r
+               Die("SetStdioFilePos failed, f=NULL");\r
+       int Ok = fseeko(f, Pos, SEEK_SET);\r
+       off_t NewPos = ftello(f); /* read back so a seek that "succeeds" at the wrong place is caught too */\r
+       if (Ok != 0 || Pos != NewPos)\r
+               {\r
+               LogStdioFileState(f);\r
+               Die("SetStdioFilePos(%d) failed, Ok=%d NewPos=%d",\r
+                 (int) Pos, Ok, (int) NewPos); /* NOTE(review): (int) casts truncate offsets that do not fit in int */\r
+               }\r
+       }\r
+\r
+void ReadStdioFile(FILE *f, off_t Pos, void *Buffer, unsigned Bytes)\r
+       { /* Seek to Pos, then read exactly Bytes bytes into Buffer; a short read or NULL stream ends in Die(). */\r
+       if (0 == f)\r
+               Die("ReadStdioFile failed, f=NULL");\r
+       SetStdioFilePos(f, Pos);\r
+       unsigned BytesRead = fread(Buffer, 1, Bytes, f); /* item size 1, so the return value is a byte count */\r
+       if (BytesRead != Bytes)\r
+               {\r
+               LogStdioFileState(f);\r
+               Die("ReadStdioFile failed, attempted %d bytes, read %d bytes, errno=%d",\r
+                 (int) Bytes, (int) BytesRead, errno);\r
+               }\r
+       }\r
+\r
+void ReadStdioFile(FILE *f, void *Buffer, unsigned Bytes)\r
+       { /* Read exactly Bytes bytes from the current position into Buffer; a short read or NULL stream ends in Die(). */\r
+       if (0 == f)\r
+               Die("ReadStdioFile failed, f=NULL");\r
+       unsigned BytesRead = fread(Buffer, 1, Bytes, f); /* item size 1, so the return value is a byte count */\r
+       if (BytesRead != Bytes)\r
+               {\r
+               LogStdioFileState(f);\r
+               Die("ReadStdioFile failed, attempted %d bytes, read %d bytes, errno=%d",\r
+                 (int) Bytes, (int) BytesRead, errno);\r
+               }\r
+       }\r
+\r
+// Return values from functions like lseek, ftell, fgetpos are\r
+// "undefined" for files that cannot seek. Attempt to detect\r
+// whether a file can seek by checking for error returns.\r
+bool CanSetStdioFilePos(FILE *f)\r
+       {\r
+// Common special cases\r
+       if (f == stdin || f == stdout || f == stderr)\r
+               return false;\r
+\r
+       fpos_t CurrPos;\r
+       int ok1 = fgetpos(f, &CurrPos);\r
+       if (ok1 < 0)\r
+               return false;\r
+       int ok2 = fseek(f, 0, SEEK_END);\r
+       if (ok2 < 0)\r
+               return false;\r
+       fpos_t EndPos;\r
+       int ok3 = fgetpos(f, &EndPos);\r
+       int ok4 = fsetpos(f, &CurrPos);\r
+       if (!ok3 || !ok4)\r
+               return false;\r
+       return true;\r
+       }\r
+\r
+byte *ReadAllStdioFile(FILE *f, unsigned &FileSize)\r
+       {\r
+       const unsigned BUFF_SIZE = 1024*1024;\r
+\r
+       if (CanSetStdioFilePos(f))\r
+               {\r
+               off_t Pos = GetStdioFilePos(f);\r
+               off_t FileSize = GetStdioFileSize(f);\r
+               if (FileSize > UINT_MAX)\r
+                       Die("ReadAllStdioFile: file size > UINT_MAX");\r
+               SetStdioFilePos(f, 0);\r
+               byte *Buffer = myalloc(byte, unsigned(FileSize));\r
+               ReadStdioFile(f, Buffer, unsigned(FileSize));\r
+               SetStdioFilePos(f, Pos);\r
+               FileSize = unsigned(FileSize);\r
+               return Buffer;\r
+               }\r
+\r
+// Can't seek, read one buffer at a time.\r
+       FileSize = 0;\r
+\r
+// Just to initialize so that first call to realloc works.\r
+       byte *Buffer = (byte *) malloc(4);\r
+       if (Buffer == 0)\r
+               Die("ReadAllStdioFile, out of memory");\r
+       for (;;)\r
+               {\r
+               Buffer = (byte *) realloc(Buffer, FileSize + BUFF_SIZE);\r
+               unsigned BytesRead = fread(Buffer + FileSize, 1, BUFF_SIZE, f);\r
+               FileSize += BytesRead;\r
+               if (BytesRead < BUFF_SIZE)\r
+                       {\r
+                       Buffer = (byte *) realloc(Buffer, FileSize);\r
+                       return Buffer;\r
+                       }\r
+               }\r
+       }\r
+\r
+byte *ReadAllStdioFile(const std::string &FileName, off_t &FileSize)\r
+       {\r
+#if    WIN32\r
+       FILE *f = OpenStdioFile(FileName);\r
+       FileSize = GetStdioFileSize(f);\r
+       CloseStdioFile(f);\r
+\r
+       HANDLE h = CreateFile(FileName.c_str(), GENERIC_READ, FILE_SHARE_READ,\r
+         NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);\r
+       if (h == INVALID_HANDLE_VALUE)\r
+               Die("ReadAllStdioFile:Open(%s) failed", FileName.c_str());\r
+\r
+       unsigned uFileSize = (unsigned) FileSize;\r
+       if ((off_t) uFileSize != FileSize)\r
+               Die("File too big (%.1f Gb): %s", double(FileSize)/1e9, FileName.c_str());\r
+\r
+       byte *Buffer = myalloc(byte, uFileSize);\r
+       DWORD BytesRead;\r
+       ReadFile(h, Buffer, uFileSize, &BytesRead, NULL);\r
+       if (FileSize != BytesRead)\r
+               Die("ReadAllStdioFile:Error reading %s, attempted %u got %u",\r
+                 FileName.c_str(), FileSize, (unsigned) BytesRead);\r
+\r
+       CloseHandle(h);\r
+       return Buffer;\r
+#else\r
+       int h = open(FileName.c_str(), O_RDONLY);\r
+       if (h < 0)\r
+               Die("ReadAllStdioFile:Cannot open %s", FileName.c_str());\r
+       FileSize = lseek(h, 0, SEEK_END);\r
+       if (FileSize == (off_t) (-1))\r
+               Die("ReadAllStdioFile:Error seeking %s", FileName.c_str());\r
+       // byte *Buffer = myalloc<byte>(FileSize);\r
+       size_t stBytes = (size_t) FileSize;\r
+       if ((off_t) stBytes != FileSize)\r
+               Die("ReadAllStdioFile: off_t overflow");\r
+       byte *Buffer = (byte *) malloc(stBytes);\r
+       if (Buffer == 0)\r
+               Die("ReadAllStdioFile: failed to allocate %s", MemBytesToStr(stBytes));\r
+       lseek(h, 0, SEEK_SET);\r
+       size_t n = read(h, Buffer, stBytes);\r
+       if (n != FileSize)\r
+               Die("ReadAllStdioFile, Error reading %s, attempted %g got %g",\r
+                 FileName.c_str(), (double) FileSize, (double) n);\r
+       close(h);\r
+       return Buffer;\r
+#endif\r
+       }\r
+\r
+void WriteStdioFile(FILE *f, off_t Pos, const void *Buffer, unsigned Bytes)\r
+       { /* Seek to Pos, then write exactly Bytes bytes from Buffer; a short write or NULL stream ends in Die(). */\r
+       if (0 == f)\r
+               Die("WriteStdioFile failed, f=NULL");\r
+       SetStdioFilePos(f, Pos);\r
+       unsigned BytesWritten = fwrite(Buffer, 1, Bytes, f); /* item size 1, so the return value is a byte count */\r
+       if (BytesWritten != Bytes)\r
+               {\r
+               LogStdioFileState(f);\r
+               Die("WriteStdioFile failed, attempted %d bytes, wrote %d bytes, errno=%d",\r
+                 (int) Bytes, (int) BytesWritten, errno);\r
+               }\r
+       }\r
+\r
+void WriteStdioFile(FILE *f, const void *Buffer, unsigned Bytes)\r
+       { /* Write exactly Bytes bytes from Buffer at the current position; a short write or NULL stream ends in Die(). */\r
+       if (0 == f)\r
+               Die("WriteStdioFile failed, f=NULL");\r
+       unsigned BytesWritten = fwrite(Buffer, 1, Bytes, f); /* item size 1, so the return value is a byte count */\r
+       if (BytesWritten != Bytes)\r
+               {\r
+               LogStdioFileState(f);\r
+               Die("WriteStdioFile failed, attempted %d bytes, wrote %d bytes, errno=%d",\r
+                 (int) Bytes, (int) BytesWritten, errno);\r
+               }\r
+       }\r
+\r
+// Return false on EOF, true if line successfully read.\r
+bool ReadLineStdioFile(FILE *f, char *Line, unsigned Bytes)\r
+       { /* fgets-based line reader into a caller buffer of Bytes chars; strips the trailing "\n" or "\r\n". */\r
+       if (feof(f))\r
+               return false;\r
+       if ((int) Bytes < 0) /* fgets takes an int size, so reject values that go negative when cast */\r
+               Die("ReadLineStdioFile: Bytes < 0");\r
+       char *RetVal = fgets(Line, (int) Bytes, f);\r
+       if (NULL == RetVal)\r
+               {\r
+               if (feof(f)) /* nothing read because the stream ended: ordinary EOF */\r
+                       return false;\r
+               if (ferror(f))\r
+                       Die("ReadLineStdioFile: errno=%d", errno);\r
+               Die("ReadLineStdioFile: fgets=0, feof=0, ferror=0");\r
+               }\r
+\r
+       if (RetVal != Line)\r
+               Die("ReadLineStdioFile: fgets != Buffer");\r
+       unsigned n = strlen(Line);\r
+       if (n < 1 || Line[n-1] != '\n') /* fatal for an over-long line and also for a final line with no newline */\r
+               Die("ReadLineStdioFile: line too long or missing end-of-line");\r
+       if (n > 0 && (Line[n-1] == '\r' || Line[n-1] == '\n'))\r
+               Line[n-1] = 0;\r
+       if (n > 1 && (Line[n-2] == '\r' || Line[n-2] == '\n')) /* handles the '\r' of a "\r\n" ending */\r
+               Line[n-2] = 0;\r
+       return true;\r
+       }\r
+\r
+// Return false on EOF, true if line successfully read.\r
+bool ReadLineStdioFile(FILE *f, string &Line)\r
+       { /* Character-at-a-time line reader with no length limit; every '\r' is dropped and '\n' ends the line. */\r
+       Line.clear();\r
+       for (;;)\r
+               {\r
+               int c = fgetc(f);\r
+               if (c == -1) /* NOTE(review): compares against -1 rather than the EOF macro */\r
+                       {\r
+                       if (feof(f))\r
+                               {\r
+                               if (!Line.empty()) /* final line without a newline still counts as a line */\r
+                                       return true;\r
+                               return false;\r
+                               }\r
+                       Die("ReadLineStdioFile, errno=%d", errno);\r
+                       }\r
+               if (c == '\r')\r
+                       continue;\r
+               if (c == '\n')\r
+                       return true;\r
+               Line.push_back((char) c);\r
+               }\r
+       }\r
+\r
+// Copies all of fFrom regardless of current\r
+// file position, appends to fTo.\r
+void AppendStdioFileToFile(FILE *fFrom, FILE *fTo)\r
+       {\r
+       off_t SavedFromPos = GetStdioFilePos(fFrom);\r
+       off_t FileSize = GetStdioFileSize(fFrom);\r
+       const off_t BUFF_SIZE = 1024*1024;\r
+       char *Buffer = myalloc(char, BUFF_SIZE);\r
+       SetStdioFilePos(fFrom, 0);\r
+       off_t BytesRemaining = FileSize;\r
+       while (BytesRemaining > 0)\r
+               {\r
+               off_t BytesToRead = BytesRemaining;\r
+               if (BytesToRead > BUFF_SIZE)\r
+                       BytesToRead = BUFF_SIZE;\r
+               ReadStdioFile(fFrom, Buffer, (unsigned) BytesToRead);\r
+               WriteStdioFile(fTo, Buffer, (unsigned) BytesToRead);\r
+               BytesRemaining -= BytesToRead;\r
+               }\r
+       SetStdioFilePos(fFrom, SavedFromPos);\r
+       }\r
+\r
+void RenameStdioFile(const string &FileNameFrom, const string &FileNameTo)\r
+       { /* rename() wrapper: any non-zero result ends in Die() with errno and its text. */\r
+       int Ok = rename(FileNameFrom.c_str(), FileNameTo.c_str());\r
+       if (Ok != 0)\r
+               Die("RenameStdioFile(%s,%s) failed, errno=%d %s",\r
+                 FileNameFrom.c_str(), FileNameTo.c_str(), errno, strerror(errno));\r
+       }\r
+\r
+void FlushStdioFile(FILE *f)\r
+       { /* fflush() wrapper: any non-zero result ends in Die(). */\r
+       int Ok = fflush(f);\r
+       if (Ok != 0)\r
+               Die("fflush(%p)=%d,", f, Ok);\r
+       }\r
+\r
+void CloseStdioFile(FILE *f)\r
+       {\r
+       if (f == 0)\r
+               return;\r
+       int Ok = fclose(f);\r
+       if (Ok != 0)\r
+               Die("fclose(%p)=%d", f, Ok);\r
+       FreeBuffer(f);\r
+       }\r
+\r
+off_t GetStdioFilePos(FILE *f)\r
+       { /* Current offset of f as reported by ftello(); a negative result ends in Die(). */\r
+       off_t FilePos = ftello(f);\r
+       if (FilePos < 0)\r
+               Die("ftello=%d", (int) FilePos);\r
+       return FilePos;\r
+       }\r
+\r
+off_t GetStdioFileSize(FILE *f)\r
+       { /* Size of f, found by seeking to the end; the original position is restored before returning. */\r
+       off_t CurrentPos = GetStdioFilePos(f);\r
+       int Ok = fseeko(f, 0, SEEK_END);\r
+       if (Ok < 0)\r
+               Die("fseek in GetFileSize");\r
+\r
+       off_t Length = ftello(f); /* offset at end of file is the length */\r
+       if (Length < 0)\r
+               Die("ftello in GetFileSize");\r
+       SetStdioFilePos(f, CurrentPos);\r
+       return Length;\r
+       }\r
+\r
+void DeleteStdioFile(const string &FileName)\r
+       { /* remove() wrapper: any non-zero result ends in Die() with errno and its text. */\r
+       int Ok = remove(FileName.c_str());\r
+       if (Ok != 0)\r
+               Die("remove(%s) failed, errno=%d %s", FileName.c_str(), errno, strerror(errno));\r
+       }\r
+\r
+void myvstrprintf(string &Str, const char *Format, va_list ArgList)\r
+       { /* printf-style formatting into Str; output longer than the scratch buffer is cut off. */\r
+       static char szStr[MAX_FORMATTED_STRING_LENGTH]; /* one shared static scratch buffer for all callers */\r
+       vsnprintf(szStr, MAX_FORMATTED_STRING_LENGTH-1, Format, ArgList);\r
+       szStr[MAX_FORMATTED_STRING_LENGTH - 1] = '\0'; /* terminate explicitly regardless of what vsnprintf did */\r
+       Str.assign(szStr);\r
+       }\r
+\r
+void myvstrprintf(string &Str, const char *Format, ...)\r
+       { /* Variadic front end: packs the arguments into a va_list and forwards to the va_list overload. */\r
+       va_list ArgList;\r
+       va_start(ArgList, Format);\r
+       myvstrprintf(Str, Format, ArgList);\r
+       va_end(ArgList);\r
+       }\r
+\r
+FILE *g_fLog = 0;\r
+\r
+void SetLogFileName(const string &FileName)\r
+       { /* Close any current log file and start a new one at FileName; an empty name leaves logging off. */\r
+       if (g_fLog != 0)\r
+               CloseStdioFile(g_fLog);\r
+       g_fLog = 0;\r
+       if (FileName.empty())\r
+               return;\r
+       g_fLog = CreateStdioFile(FileName); /* opened with "wb+"; CreateStdioFile calls Die() on failure */\r
+       }\r
+\r
+void Log(const char *Format, ...)\r
+       { /* printf-style write to the log file, flushed after every call; does nothing when no log file is open. */\r
+       if (g_fLog == 0)\r
+               return;\r
+\r
+       static bool InLog = false; /* guard flag: a Log call made while one is in progress is dropped */\r
+       if (InLog)\r
+               return;\r
+\r
+       InLog = true;\r
+       va_list ArgList;\r
+       va_start(ArgList, Format);\r
+       vfprintf(g_fLog, Format, ArgList);\r
+       va_end(ArgList);\r
+       fflush(g_fLog);\r
+       InLog = false;\r
+       }\r
+\r
+void Die(const char *Format, ...)\r
+       { /* Fatal-error exit: prints the command line and the formatted message to stderr and the log, then exit(1). */\r
+       static bool InDie = false; /* guard flag: if Die is entered again while running, exit at once */\r
+       if (InDie)\r
+               exit(1);\r
+       InDie = true;\r
+       string Msg;\r
+\r
+       if (g_fLog != 0)\r
+               setbuf(g_fLog, 0); /* switch the log stream to unbuffered */\r
+       va_list ArgList;\r
+       va_start(ArgList, Format);\r
+       myvstrprintf(Msg, Format, ArgList);\r
+       va_end(ArgList);\r
+\r
+       fprintf(stderr, "\n\n");\r
+       Log("\n");\r
+       time_t t = time(0);\r
+       Log("%s", asctime(localtime(&t)));\r
+       for (unsigned i = 0; i < g_Argv.size(); i++) /* echo the saved command line, space separated */\r
+               {\r
+               fprintf(stderr, (i == 0) ? "%s" : " %s", g_Argv[i].c_str());\r
+               Log((i == 0) ? "%s" : " %s", g_Argv[i].c_str());\r
+               }\r
+       fprintf(stderr, "\n");\r
+       Log("\n");\r
+\r
+       time_t CurrentTime = time(0);\r
+       unsigned ElapsedSeconds = unsigned(CurrentTime - g_StartTime);\r
+       const char *sstr = SecsToStr(ElapsedSeconds);\r
+       Log("Elapsed time: %s\n", sstr);\r
+\r
+       const char *szStr = Msg.c_str();\r
+       fprintf(stderr, "\n---Fatal error---\n%s\n", szStr);\r
+       Log("\n---Fatal error---\n%s\n", szStr);\r
+\r
+#ifdef _MSC_VER\r
+       if (IsDebuggerPresent()) /* MSVC only: break into an attached debugger before exiting */\r
+               __debugbreak();\r
+       _CrtSetDbgFlag(0);\r
+#endif\r
+\r
+       exit(1);\r
+       }\r
+\r
+void Warning(const char *Format, ...)\r
+       { /* Non-fatal counterpart of Die: prints "WARNING: <message>" to stderr and to the log, then returns. */\r
+       string Msg;\r
+\r
+       va_list ArgList;\r
+       va_start(ArgList, Format);\r
+       myvstrprintf(Msg, Format, ArgList);\r
+       va_end(ArgList);\r
+\r
+       const char *szStr = Msg.c_str();\r
+\r
+       fprintf(stderr, "\nWARNING: %s\n", szStr);\r
+       if (g_fLog != stdout) /* skip the log copy when the log stream is stdout */\r
+               {\r
+               Log("\nWARNING: %s\n", szStr);\r
+               fflush(g_fLog); /* NOTE(review): g_fLog may be NULL here; fflush(NULL) flushes every open output stream */\r
+               }\r
+       }\r
+\r
+#ifdef _MSC_VER\r
+double GetMemUseBytes()\r
+       { /* MSVC build: the process's WorkingSetSize from GetProcessMemoryInfo; also updates g_PeakMemUseBytes. */\r
+       HANDLE hProc = GetCurrentProcess();\r
+       PROCESS_MEMORY_COUNTERS PMC;\r
+       BOOL bOk = GetProcessMemoryInfo(hProc, &PMC, sizeof(PMC));\r
+       if (!bOk)\r
+               return 1000000; /* placeholder value when the query fails; the peak is not updated */\r
+       double Bytes = (double) PMC.WorkingSetSize;\r
+       if (Bytes > g_PeakMemUseBytes)\r
+               g_PeakMemUseBytes = Bytes;\r
+       return Bytes;\r
+       }\r
+#elif  linux || __linux__\r
+double GetMemUseBytes()\r
+       { /* Linux build: first number in /proc/<pid>/statm times the page size; also updates g_PeakMemUseBytes. */\r
+       static char statm[64];\r
+       static int PageSize = 1;\r
+       if (0 == statm[0]) /* first call only: build the path and look up the page size */\r
+               {\r
+               PageSize = sysconf(_SC_PAGESIZE);\r
+               pid_t pid = getpid();\r
+               sprintf(statm, "/proc/%d/statm", (int) pid);\r
+               }\r
+\r
+       int fd = open(statm, O_RDONLY);\r
+       if (-1 == fd)\r
+               return 1000000; /* placeholder value when the file cannot be opened */\r
+       char Buffer[64];\r
+       int n = read(fd, Buffer, sizeof(Buffer) - 1); /* leave room for the terminator written below */\r
+       close(fd);\r
+       fd = -1;\r
+\r
+       if (n <= 0)\r
+               return 1000000;\r
+\r
+       Buffer[n] = 0;\r
+       double Pages = atof(Buffer); /* atof parses only the leading number; NOTE(review): presumably the total-size field - confirm against proc(5) */\r
+\r
+       double Bytes = Pages*PageSize;\r
+       if (Bytes > g_PeakMemUseBytes)\r
+               g_PeakMemUseBytes = Bytes;\r
+       return Bytes;\r
+       }\r
+#elif defined(__MACH__)\r
+#include <memory.h>\r
+#include <stdlib.h>\r
+#include <stdio.h>\r
+#include <unistd.h>\r
+#include <sys/types.h>\r
+#include <sys/sysctl.h>\r
+#include <sys/socket.h>\r
+#include <sys/gmon.h>\r
+#include <mach/vm_param.h>\r
+#include <netinet/in.h>\r
+#include <netinet/icmp6.h>\r
+#include <sys/vmmeter.h>\r
+#include <sys/proc.h>\r
+#include <mach/vm_statistics.h>\r
+#include <mach/task_info.h>\r
+#include <mach/task.h>\r
+#include <mach/mach_init.h>\r
+\r
+#define DEFAULT_MEM_USE        100000000.0\r
+\r
+double GetMemUseBytes()\r
+       { /* Mach build: resident_size from task_info(TASK_BASIC_INFO); also updates g_PeakMemUseBytes. */\r
+       task_t mytask = mach_task_self();\r
+       struct task_basic_info ti;\r
+       memset((void *) &ti, 0, sizeof(ti));\r
+       mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT;\r
+       kern_return_t ok = task_info(mytask, TASK_BASIC_INFO, (task_info_t) &ti, &count);\r
+       if (ok == KERN_INVALID_ARGUMENT)\r
+               return DEFAULT_MEM_USE;\r
+\r
+       if (ok != KERN_SUCCESS) /* every failure returns the same placeholder; the peak is not updated */\r
+               return DEFAULT_MEM_USE;\r
+\r
+       double Bytes = (double ) ti.resident_size;\r
+       if (Bytes > g_PeakMemUseBytes)\r
+               g_PeakMemUseBytes = Bytes;\r
+       return Bytes;\r
+       }\r
+#else\r
+double GetMemUseBytes()\r
+       { /* Fallback for platforms with no implementation above: always reports 0. */\r
+       return 0;\r
+       }\r
+#endif\r
+\r
+double GetPeakMemUseBytes()\r
+       { /* Largest value GetMemUseBytes has recorded so far; it only changes when GetMemUseBytes is called. */\r
+       return g_PeakMemUseBytes;\r
+       }\r
+\r
+const char *SecsToHHMMSS(int Secs)\r
+       {\r
+       int HH = Secs/3600;\r
+       int MM = (Secs - HH*3600)/60;\r
+       int SS = Secs%60;\r
+       static char Str[16];\r
+       if (HH == 0)\r
+               sprintf(Str, "%02d:%02d", MM, SS);\r
+       else\r
+               sprintf(Str, "%02d:%02d:%02d", HH, MM, SS);\r
+       return Str;\r
+       }\r
+\r
+const char *SecsToStr(double Secs)\r
+       { /* Human-readable duration: clock format from 10 seconds up, otherwise a value with an "s" or "ms" suffix. */\r
+       if (Secs >= 10.0)\r
+               return SecsToHHMMSS((int) Secs); /* result then lives in SecsToHHMMSS's static buffer, not Str */\r
+\r
+       static char Str[16];\r
+       if (Secs < 1e-6)\r
+               sprintf(Str, "%.2gs", Secs);\r
+       else if (Secs < 1e-3)\r
+               sprintf(Str, "%.2fms", Secs*1e3);\r
+       else\r
+               sprintf(Str, "%.3fs", Secs);\r
+       return Str;\r
+       }\r
+\r
+const char *MemBytesToStr(double Bytes)\r
+       {\r
+       static char Str[32];\r
+\r
+       if (Bytes < 1e6)\r
+               sprintf(Str, "%.1fkb", Bytes/1e3);\r
+       else if (Bytes < 10e6)\r
+               sprintf(Str, "%.1fMb", Bytes/1e6);\r
+       else if (Bytes < 1e9)\r
+               sprintf(Str, "%.0fMb", Bytes/1e6);\r
+       else if (Bytes < 10e9)\r
+               sprintf(Str, "%.1fGb", Bytes/1e9);\r
+       else if (Bytes < 100e9)\r
+               sprintf(Str, "%.0fGb", Bytes/1e9);\r
+       else\r
+               sprintf(Str, "%.3gb", Bytes);\r
+       return Str;\r
+       }\r
+\r
+const char *IntToStr(unsigned i)\r
+       { /* Format a count exactly below 10000, otherwise scaled with a k/M/G suffix. Returns a shared static buffer. */\r
+       static char Str[32];\r
+\r
+       double d = (double) i;\r
+       if (i < 10000)\r
+               sprintf(Str, "%u", i);\r
+       else if (i < 1e6)\r
+               sprintf(Str, "%.1fk", d/1e3);\r
+       else if (i < 10e6)\r
+               sprintf(Str, "%.1fM", d/1e6);\r
+       else if (i < 1e9)\r
+               sprintf(Str, "%.0fM", d/1e6);\r
+       else if (i < 10e9) /* NOTE(review): always true if unsigned is 32 bits, making the branches below unreachable */\r
+               sprintf(Str, "%.1fG", d/1e9);\r
+       else if (i < 100e9)\r
+               sprintf(Str, "%.0fG", d/1e9);\r
+       else\r
+               sprintf(Str, "%.3g", d);\r
+       return Str;\r
+       }\r
+\r
+const char *FloatToStr(double d)\r
+       {\r
+       static char Str[32];\r
+\r
+       double a = fabs(d);\r
+       if (a < 0.01)\r
+               sprintf(Str, "%.3g", a);\r
+       else if (a >= 0.01 && a < 1)\r
+               sprintf(Str, "%.3f", a);\r
+       else if (a <= 10 && a >= 1)\r
+               {\r
+               double intpart;\r
+               if (modf(a, &intpart) < 0.05)\r
+                       sprintf(Str, "%.0f", d);\r
+               else\r
+                       sprintf(Str, "%.1f", d);\r
+               }\r
+       else if (a > 10 && a < 10000)\r
+               sprintf(Str, "%.0f", d);\r
+       else if (a < 1e6)\r
+               sprintf(Str, "%.1fk", d/1e3);\r
+       else if (a < 10e6)\r
+               sprintf(Str, "%.1fM", d/1e6);\r
+       else if (a < 1e9)\r
+               sprintf(Str, "%.0fM", d/1e6);\r
+       else if (a < 10e9)\r
+               sprintf(Str, "%.1fG", d/1e9);\r
+       else if (a < 100e9)\r
+               sprintf(Str, "%.0fG", d/1e9);\r
+       else\r
+               sprintf(Str, "%.3g", d);\r
+       return Str;\r
+       }\r
+\r
+bool opt_quiet = false;\r
+bool opt_version = false;\r
+bool opt_logopts = false;\r
+bool opt_compilerinfo = false;\r
+bool opt_help = false;\r
+string opt_log = "";\r
+\r
+bool optset_quiet = false;\r
+bool optset_version = false;\r
+bool optset_logopts = false;\r
+bool optset_compilerinfo = false;\r
+bool optset_help = false;\r
+bool optset_log = false;\r
+\r
+static string g_CurrentProgressLine;\r
+static string g_ProgressDesc;\r
+static unsigned g_ProgressIndex;\r
+static unsigned g_ProgressCount;\r
+\r
+static unsigned g_CurrProgressLineLength;\r
+static unsigned g_LastProgressLineLength;\r
+static unsigned g_CountsInterval;\r
+static unsigned g_StepCalls;\r
+static time_t g_TimeLastOutputStep;\r
+\r
+static string &GetProgressPrefixStr(string &s)\r
+       { /* Build the prefix for a progress line in s: elapsed time, then memory use when it is non-zero. Returns s. */\r
+       double Bytes = GetMemUseBytes();\r
+       unsigned Secs = GetElapsedSecs();\r
+       s = string(SecsToHHMMSS(Secs));\r
+       if (Bytes > 0) /* the fallback GetMemUseBytes returns 0, in which case the memory column is omitted */\r
+               {\r
+               s.push_back(' ');\r
+               char Str[32];\r
+               sprintf(Str, "%5.5s", MemBytesToStr(Bytes)); /* "%5.5s": padded or cut to exactly 5 characters */\r
+               s += string(Str);\r
+               }\r
+       s.push_back(' ');\r
+       return s;\r
+       }\r
+\r
+void ProgressLog(const char *Format, ...)\r
+       { /* Format the message once, then send the same text to both Log() and Progress(). */\r
+       string Str;\r
+       va_list ArgList;\r
+       va_start(ArgList, Format);\r
+       myvstrprintf(Str, Format, ArgList);\r
+       va_end(ArgList);\r
+\r
+       Log("%s", Str.c_str()); /* passed through "%s" so '%' in the formatted text is not reinterpreted */\r
+       Progress("%s", Str.c_str());\r
+       }\r
+\r
+void Progress(const char *Format, ...)\r
+       { /* printf-style progress output on stderr; each new line gets a time/memory prefix. Silent when opt_quiet is set. */\r
+       if (opt_quiet)\r
+               return;\r
+\r
+       string Str;\r
+       va_list ArgList;\r
+       va_start(ArgList, Format);\r
+       myvstrprintf(Str, Format, ArgList);\r
+       va_end(ArgList);\r
+\r
+#if    0\r
+       Log("Progress(");\r
+       for (unsigned i = 0; i < Str.size(); ++i)\r
+               {\r
+               char c = Str[i];\r
+               if (c == '\r')\r
+                       Log("\\r");\r
+               else if (c == '\n')\r
+                       Log("\\n");\r
+               else\r
+                       Log("%c", c);\r
+               }\r
+       Log(")\n");\r
+#endif //0\r
+\r
+       for (unsigned i = 0; i < Str.size(); ++i)\r
+               {\r
+               if (g_CurrProgressLineLength == 0) /* at the start of a line: emit the prefix first */\r
+                       {\r
+                       string s;\r
+                       GetProgressPrefixStr(s);\r
+                       for (unsigned j = 0; j < s.size(); ++j)\r
+                               {\r
+                               fputc(s[j], stderr);\r
+                               ++g_CurrProgressLineLength;\r
+                               }\r
+                       }\r
+\r
+               char c = Str[i];\r
+               if (c == '\n' || c == '\r')\r
+                       {\r
+                       for (unsigned j = g_CurrProgressLineLength; j < g_LastProgressLineLength; ++j) /* pad with blanks to erase a longer previous line */\r
+                               fputc(' ', stderr);\r
+                       if (c == '\n')\r
+                               g_LastProgressLineLength = 0; /* moving to a fresh line: nothing left to erase */\r
+                       else\r
+                               g_LastProgressLineLength = g_CurrProgressLineLength; /* '\r': this line will be overwritten next */\r
+                       g_CurrProgressLineLength = 0;\r
+                       fputc(c, stderr);\r
+                       }\r
+               else\r
+                       {\r
+                       fputc(c, stderr);\r
+                       ++g_CurrProgressLineLength;\r
+                       }\r
+               }\r
+       }\r
+\r
+void ProgressExit()\r
+       { /* Write the end-of-run summary (finish time, elapsed time, peak memory) to the log. */\r
+       time_t Now = time(0);\r
+       struct tm *t = localtime(&Now);\r
+       const char *s = asctime(t);\r
+       unsigned Secs = GetElapsedSecs();\r
+\r
+       Log("\n");\r
+       Log("Finished %s", s); // there is a newline in s\r
+       Log("Elapsed time %s\n", SecsToHHMMSS((int) Secs));\r
+       Log("Max memory %s\n", MemBytesToStr(g_PeakMemUseBytes));\r
+#if    WIN32 && DEBUG\r
+// Skip exit(), which can be very slow in DEBUG build\r
+// VERY DANGEROUS practice, because it skips global destructors.\r
+// But if you know the rules, you can break 'em, right?\r
+       ExitProcess(0); /* in this configuration the function terminates the process and does not return */\r
+#endif\r
+       }\r
+\r
+const char *PctStr(double x, double y)\r
+       {\r
+       if (y == 0)\r
+               {\r
+               if (x == 0)\r
+                       return "100%";\r
+               else\r
+                       return "inf%";\r
+               }\r
+       static char Str[16];\r
+       double p = x*100.0/y;\r
+       sprintf(Str, "%5.1f%%", p);\r
+       return Str;\r
+       }\r
+\r
+string &GetProgressLevelStr(string &s)\r
+       { /* Build "<level> <description>" in s from g_ProgressIndex, g_ProgressCount and g_ProgressDesc. Returns s. */\r
+       unsigned Index = g_ProgressIndex;\r
+       unsigned Count = g_ProgressCount;\r
+       if (Count == UINT_MAX) /* UINT_MAX as the count: no percentage can be computed */\r
+               {\r
+               if (Index == UINT_MAX) /* UINT_MAX as the index is shown as complete */\r
+                       s = "100%";\r
+               else\r
+                       {\r
+                       char Tmp[16];\r
+                       sprintf(Tmp, "%u", Index); \r
+                       s = Tmp;\r
+                       }\r
+               }\r
+       else\r
+               s = string(PctStr(Index+1, Count)); /* Index is zero-based, so Index+1 items are accounted for */\r
+       s += string(" ") + g_ProgressDesc;\r
+       return s;\r
+       }\r
+\r
+// Reports progress for step i (zero-based) of N through Progress().\r
+//   i == 0      starts a new series: formats the description, resets the\r
+//               counters and ends any partially written progress line.\r
+//   i == UINT_MAX or i + 1 == N is the last step; it is always written and is\r
+//               followed by a newline on stderr.\r
+// Other steps are skipped while time(0) still returns the second in which the\r
+// previous step was written. Any other i >= N is a fatal error (Die).\r
+// Does nothing when opt_quiet is set. Format and the variadic arguments are\r
+// printf-style and describe the work in progress.\r
+void ProgressStep(unsigned i, unsigned N, const char *Format, ...)\r
+       {\r
+       if (opt_quiet)\r
+               return;\r
+\r
+       if (i == 0)\r
+               {\r
+               string Str;\r
+               va_list ArgList;\r
+               va_start(ArgList, Format);\r
+               myvstrprintf(Str, Format, ArgList);\r
+               va_end(ArgList);\r
+               g_ProgressDesc = Str;\r
+               g_ProgressIndex = 0;\r
+               g_ProgressCount = N;\r
+               g_CountsInterval = 1;\r
+               g_StepCalls = 0;\r
+               g_TimeLastOutputStep = 0;\r
+               if (g_CurrProgressLineLength > 0)\r
+                       Progress("\n");\r
+               }\r
+\r
+       if (i >= N && i != UINT_MAX)\r
+               Die("ProgressStep(%u,%u)", i, N);\r
+       bool IsLastStep = (i == UINT_MAX || i + 1 == N);\r
+       if (!IsLastStep)\r
+               {\r
+               ++g_StepCalls;\r
+               if (g_StepCalls%g_CountsInterval != 0)\r
+                       return;\r
+\r
+               time_t Now = time(0);\r
+               if (Now == g_TimeLastOutputStep)\r
+                       {\r
+                       // NOTE(review): starting from 1, (g_CountsInterval*3)/2 truncates\r
+                       // back to 1, so the interval does not appear to grow -- confirm\r
+                       // whether that is intended.\r
+                       if (g_CountsInterval < 128)\r
+                               g_CountsInterval = (g_CountsInterval*3)/2;\r
+                       else\r
+                               g_CountsInterval += 64;\r
+                       return;\r
+                       }\r
+               else\r
+                       {\r
+                       time_t Secs = Now - g_TimeLastOutputStep;\r
+                       if (Secs > 1)\r
+                               g_CountsInterval = unsigned(g_CountsInterval/(Secs*8));\r
+                       }\r
+\r
+               if (g_CountsInterval < 1)\r
+                       g_CountsInterval = 1;\r
+\r
+               g_TimeLastOutputStep = Now;\r
+               }\r
+\r
+       g_ProgressIndex = i;\r
+\r
+       if (i > 0)\r
+               {\r
+               va_list ArgList;\r
+               va_start(ArgList, Format);\r
+               myvstrprintf(g_ProgressDesc, Format, ArgList);\r
+               // Every va_start needs a matching va_end before the function returns.\r
+               va_end(ArgList);\r
+               }\r
+\r
+       string LevelStr;\r
+       GetProgressLevelStr(LevelStr);\r
+       Progress(" %s\r", LevelStr.c_str());\r
+\r
+       if (IsLastStep)\r
+               {\r
+               g_CountsInterval = 1;\r
+               fputc('\n', stderr);\r
+               }\r
+       }\r
+\r
+// Kinds of command-line option understood by the option parser in this file.\r
+enum OptType\r
+       {\r
+       OT_Flag, // boolean, set to true when the option is present\r
+       OT_Tog, // boolean, --name sets it to true and --noname sets it to false\r
+       OT_Int, // value stored as int\r
+       OT_Uns, // value stored as unsigned\r
+       OT_Str, // value stored as string\r
+       OT_Float, // value stored as double\r
+       OT_Enum // named value mapped to an integer through OptInfo::EnumValues\r
+       };\r
+\r
+struct OptInfo\r
+       {\r
+       void *Value;\r
+       bool *OptSet;\r
+       string LongName;\r
+       OptType Type;\r
+       int iMin;\r
+       int iMax;\r
+       unsigned uMin;\r
+       unsigned uMax;\r
+       double dMin;\r
+       double dMax;\r
+       map<string, unsigned> EnumValues;\r
+\r
+       bool bDefault;\r
+       int iDefault;\r
+       unsigned uDefault;\r
+       double dDefault;\r
+       string strDefault;\r
+\r
+       string Help;\r
+\r
+       bool operator<(const OptInfo &rhs) const\r
+               {\r
+               return LongName < rhs.LongName;\r
+               }\r
+       };\r
+\r
+// All options registered through AddOpt(), ordered by OptInfo::operator<.\r
+static set<OptInfo> g_Opts;\r
+\r
+void Help()\r
+       {\r
+       printf("\n");\r
+\r
+       void Usage();\r
+       Usage();\r
+\r
+       for (set<OptInfo>::const_iterator p = g_Opts.begin(); p != g_Opts.end(); ++p)\r
+               {\r
+               const OptInfo &Opt = *p;\r
+\r
+               printf("\n");\r
+               string LongName = Opt.LongName.c_str();\r
+               if (Opt.Type == OT_Tog)\r
+                       LongName = string("[no]") + LongName;\r
+               printf("  --%s ", LongName.c_str());\r
+\r
+               switch (Opt.Type)\r
+                       {\r
+               case OT_Flag:\r
+                       break;\r
+               case OT_Tog:\r
+                       break;\r
+               case OT_Int:\r
+                       printf("<int>");\r
+                       break;\r
+               case OT_Uns:\r
+                       printf("<uint>");\r
+                       break;\r
+               case OT_Str:\r
+                       printf("<str>");\r
+                       break;\r
+               case OT_Float:\r
+                       printf("<float>");\r
+                       break;\r
+               case OT_Enum:\r
+                       printf("<enum>");\r
+                       break;\r
+               default:\r
+                       printf("??type");\r
+                       break;\r
+                       }\r
+\r
+               printf("  ");\r
+               const string &s = Opt.Help;\r
+               for (string::const_iterator q = s.begin(); q != s.end(); ++q)\r
+                       {\r
+                       char c = *q;\r
+                       if (c == '\n')\r
+                               printf("\n   ");\r
+                       else\r
+                               printf("%c", c);\r
+                       }\r
+               printf("\n");\r
+               }\r
+       printf("\n");\r
+       exit(0);\r
+       }\r
+\r
+void CmdLineErr(const char *Format, ...)\r
+       {\r
+       va_list ArgList;\r
+       va_start(ArgList, Format);\r
+       string Str;\r
+       myvstrprintf(Str, Format, ArgList);\r
+       va_end(ArgList);\r
+       fprintf(stderr, "\n");\r
+       fprintf(stderr, "Invalid command line\n");\r
+       fprintf(stderr, "%s\n", Str.c_str());\r
+       fprintf(stderr, "For list of command-line options use --help.\n");\r
+       fprintf(stderr, "\n");\r
+       exit(1);\r
+       }\r
+\r
+static set<OptInfo>::iterator GetOptInfo(const string &LongName,\r
+  bool ErrIfNotFound)\r
+       {\r
+       for (set<OptInfo>::iterator p = g_Opts.begin();\r
+         p != g_Opts.end(); ++p)\r
+               {\r
+               const OptInfo &Opt = *p;\r
+               if (Opt.LongName == LongName)\r
+                       return p;\r
+               if (Opt.Type == OT_Tog && "no" + Opt.LongName == LongName)\r
+                       return p;\r
+               }\r
+       if (ErrIfNotFound)\r
+               CmdLineErr("Option --%s is invalid", LongName.c_str());\r
+       return g_Opts.end();\r
+       }\r
+\r
+// Adds Opt to the option registry. Aborts through Die() when an already\r
+// registered option matches Opt's long name.\r
+static void AddOpt(const OptInfo &Opt)\r
+       {\r
+       const bool AlreadyDefined = (GetOptInfo(Opt.LongName, false) != g_Opts.end());\r
+       if (AlreadyDefined)\r
+               Die("Option --%s defined twice", Opt.LongName.c_str());\r
+       g_Opts.insert(Opt);\r
+       }\r
+\r
+#ifdef _MSC_VER\r
+#pragma warning(disable: 4505) // unreferenced local function\r
+#endif\r
+\r
+static void DefineFlagOpt(const string &LongName, const string &Help,\r
+  void *Value, bool *OptSet)\r
+       {\r
+       *(bool *) Value = false;\r
+\r
+       OptInfo Opt;\r
+       Opt.Value = Value;\r
+       Opt.OptSet = OptSet;\r
+       Opt.LongName = LongName;\r
+       Opt.bDefault = false;\r
+       Opt.Help = Help;\r
+       Opt.Type = OT_Flag;\r
+       AddOpt(Opt);\r
+       }\r
+\r
+static void DefineTogOpt(const string &LongName, bool Default, const string &Help,\r
+  void *Value, bool *OptSet)\r
+       {\r
+       *(bool *) Value = Default;\r
+\r
+       OptInfo Opt;\r
+       Opt.Value = Value;\r
+       Opt.OptSet = OptSet;\r
+       Opt.LongName = LongName;\r
+       Opt.bDefault = Default;\r
+       Opt.Help = Help;\r
+       Opt.Type = OT_Tog;\r
+       AddOpt(Opt);\r
+       }\r
+\r
+static void DefineIntOpt(const string &LongName, int Default, int Min, int Max,\r
+  const string &Help, void *Value, bool *OptSet)\r
+       {\r
+       *(int *) Value = Default;\r
+\r
+       OptInfo Opt;\r
+       Opt.Value = Value;\r
+       Opt.OptSet = OptSet;\r
+       Opt.LongName = LongName;\r
+       Opt.iDefault = Default;\r
+       Opt.iMin = Min;\r
+       Opt.iMax = Max;\r
+       Opt.Help = Help;\r
+       Opt.Type = OT_Int;\r
+       AddOpt(Opt);\r
+       }\r
+\r
+static void DefineUnsOpt(const string &LongName, unsigned Default, unsigned Min,\r
+  unsigned Max, const string &Help, void *Value, bool *OptSet)\r
+       {\r
+       *(unsigned *) Value = Default;\r
+\r
+       OptInfo Opt;\r
+       Opt.Value = Value;\r
+       Opt.OptSet = OptSet;\r
+       Opt.LongName = LongName;\r
+       Opt.uDefault = Default;\r
+       Opt.uMin = Min;\r
+       Opt.uMax = Max;\r
+       Opt.Help = Help;\r
+       Opt.Type = OT_Uns;\r
+       AddOpt(Opt);\r
+       }\r
+\r
+static void DefineFloatOpt(const string &LongName, double Default, double Min,\r
+  double Max, const string &Help, void *Value, bool *OptSet)\r
+       {\r
+       *(double *) Value = Default;\r
+\r
+       OptInfo Opt;\r
+       Opt.Value = Value;\r
+       Opt.OptSet = OptSet;\r
+       Opt.LongName = LongName;\r
+       Opt.dDefault = Default;\r
+       Opt.dMin = Min;\r
+       Opt.dMax = Max;\r
+       Opt.Help = Help;\r
+       Opt.Type = OT_Float;\r
+       AddOpt(Opt);\r
+       }\r
+\r
+static void DefineStrOpt(const string &LongName, const char *Default,\r
+  const string &Help, void *Value, bool *OptSet)\r
+       {\r
+       *(string *) Value = (Default == 0 ? "" : string(Default));\r
+\r
+       OptInfo Opt;\r
+       Opt.Value = Value;\r
+       Opt.OptSet = OptSet;\r
+       Opt.LongName = LongName;\r
+       Opt.strDefault = (Default == 0 ? "" : string(Default));\r
+       Opt.Help = Help;\r
+       Opt.Type = OT_Str;\r
+       AddOpt(Opt);\r
+       }\r
+\r
+static void ParseEnumValues(const string &Values, map<string, unsigned> &EnumValues)\r
+       {\r
+       EnumValues.clear();\r
+       \r
+       string Name;\r
+       string Value;\r
+       bool Eq = false;\r
+       for (string::const_iterator p = Values.begin(); ; ++p)\r
+               {\r
+               char c = (p == Values.end() ? '|' : *p);\r
+               if (isspace(c))\r
+                       ;\r
+               else if (c == '|')\r
+                       {\r
+                       if (EnumValues.find(Name) != EnumValues.end())\r
+                               Die("Invalid enum values, '%s' defined twice: '%s'",\r
+                                 Name.c_str(), Values.c_str());\r
+                       if (Name.empty() || Value.empty())\r
+                               Die("Invalid enum values, empty name or value: '%s'",\r
+                                 Values.c_str());\r
+\r
+                       EnumValues[Name] = atoi(Value.c_str());\r
+                       Name.clear();\r
+                       Value.clear();\r
+                       Eq = false;\r
+                       }\r
+               else if (c == '=')\r
+                       Eq = true;\r
+               else if (Eq)\r
+                       Value.push_back(c);\r
+               else\r
+                       Name.push_back(c);\r
+               if (p == Values.end())\r
+                       return;\r
+               }\r
+       }\r
+\r
+static void DefineEnumOpt(const string &LongName, const string &ShortName,\r
+  int Default, const string &Values, const string &Help, void *Value)\r
+       {\r
+       *(int *) Value = Default;\r
+\r
+       OptInfo Opt;\r
+       Opt.Value = Value;\r
+       Opt.LongName = LongName;\r
+       Opt.iDefault = Default;\r
+       Opt.Help = Help;\r
+       Opt.Type = OT_Enum;\r
+       ParseEnumValues(Values, Opt.EnumValues);\r
+       AddOpt(Opt);\r
+       }\r
+// Definitions of the option variables. Each *_OPT macro below expands to the\r
+// option's value variable opt_<name> plus its optset_<name> flag; myopts.h is\r
+// presumably a list of *_OPT invocations, one per option (it is not visible\r
+// from this file).\r
+#undef FLAG_OPT\r
+#undef TOG_OPT\r
+#undef INT_OPT\r
+#undef UNS_OPT\r
+#undef FLT_OPT\r
+#undef STR_OPT\r
+#undef ENUM_OPT\r
+#define FLAG_OPT(LongName)                                                     bool opt_##LongName; bool optset_##LongName;\r
+#define TOG_OPT(LongName, Default)                                     bool opt_##LongName; bool optset_##LongName;\r
+#define INT_OPT(LongName, Default, Min, Max)           int opt_##LongName; bool optset_##LongName;\r
+#define UNS_OPT(LongName, Default, Min, Max)           unsigned opt_##LongName; bool optset_##LongName;\r
+#define FLT_OPT(LongName, Default, Min, Max)           double opt_##LongName; bool optset_##LongName;\r
+#define STR_OPT(LongName, Default)                                     string opt_##LongName; bool optset_##LongName;\r
+#define ENUM_OPT(LongName, Values, Default)                    int opt_##LongName; bool optset_##LongName;\r
+#include "myopts.h"\r
+\r
+static int EnumStrToInt(const OptInfo &Opt, const string &Value)\r
+       {\r
+       const map<string, unsigned> &e = Opt.EnumValues;\r
+       string s;\r
+       for (map<string, unsigned>::const_iterator p = e.begin(); p != e.end(); ++p)\r
+               {\r
+               if (Value == p->first)\r
+                       return p->second;\r
+               s += " " + p->first;\r
+               }\r
+       CmdLineErr("--%s %s not recognized, valid are: %s",\r
+         Opt.LongName.c_str(), Value.c_str(), s.c_str());\r
+       ureturn(-1);\r
+       }\r
+\r
+static void SetOpt(OptInfo &Opt, const string &Value)\r
+       {\r
+       *Opt.OptSet = true;\r
+       switch (Opt.Type)\r
+               {\r
+       case OT_Int:\r
+               {\r
+               *(int *) Opt.Value = atoi(Value.c_str());\r
+               break;\r
+               }\r
+       case OT_Uns:\r
+               {\r
+               unsigned uValue = 0;\r
+               int n = sscanf(Value.c_str(), "%u", &uValue);\r
+               if (n != 1)\r
+                       CmdLineErr("Invalid value '%s' for --%s",\r
+                         Value.c_str(), Opt.LongName.c_str());\r
+               *(unsigned *) Opt.Value = uValue;\r
+               break;\r
+               }\r
+       case OT_Float:\r
+               {\r
+               *(double *) Opt.Value = atof(Value.c_str());\r
+               break;\r
+               }\r
+       case OT_Str:\r
+               {\r
+               *(string *) Opt.Value = Value;\r
+               break;\r
+               }\r
+       case OT_Enum:\r
+               {\r
+               *(int *) Opt.Value = EnumStrToInt(Opt, Value);\r
+               break;\r
+               }\r
+       default:\r
+               asserta(false);\r
+               }\r
+       }\r
+\r
+void LogOpts()\r
+       {\r
+       for (set<OptInfo>::const_iterator p = g_Opts.begin(); p != g_Opts.end(); ++p)\r
+               {\r
+               const OptInfo &Opt = *p;\r
+               Log("%s = ", Opt.LongName.c_str());\r
+               switch (Opt.Type)\r
+                       {\r
+               case OT_Flag:\r
+                       Log("%s", (*(bool *) Opt.Value) ? "yes" : "no");\r
+                       break;\r
+               case OT_Tog:\r
+                       Log("%s", (*(bool *) Opt.Value) ? "on" : "off");\r
+                       break;\r
+               case OT_Int:\r
+                       Log("%d", *(int *) Opt.Value);\r
+                       break;\r
+               case OT_Uns:\r
+                       Log("%u", *(unsigned *) Opt.Value);\r
+                       break;\r
+               case OT_Float:\r
+                       {\r
+                       double Value = *(double *) Opt.Value;\r
+                       if (Value == FLT_MAX)\r
+                               Log("*");\r
+                       else\r
+                               Log("%g", Value);\r
+                       break;\r
+                       }\r
+               case OT_Str:\r
+                       Log("%s", (*(string *) Opt.Value).c_str());\r
+                       break;\r
+               case OT_Enum:\r
+                       Log("%d", *(int *) Opt.Value);\r
+                       break;\r
+               default:\r
+                       asserta(false);\r
+                       }\r
+               Log("\n");\r
+               }\r
+       }\r
+\r
+static void CompilerInfo()\r
+       {\r
+#ifdef _FILE_OFFSET_BITS\r
+    printf("_FILE_OFFSET_BITS=%d\n", _FILE_OFFSET_BITS);\r
+#else\r
+    printf("_FILE_OFFSET_BITS not defined\n");\r
+#endif\r
+\r
+#define x(t)   printf("sizeof(" #t ") = %d\n", (int) sizeof(t));\r
+       x(int)\r
+       x(long)\r
+       x(float)\r
+       x(double)\r
+       x(void *)\r
+       x(off_t)\r
+#undef x\r
+       exit(0);\r
+       }\r
+\r
+void Split(const string &Str, vector<string> &Fields, char Sep)\r
+       {\r
+       Fields.clear();\r
+       const unsigned Length = (unsigned) Str.size();\r
+       string s;\r
+       for (unsigned i = 0; i < Length; ++i)\r
+               {\r
+               char c = Str[i];\r
+               if ((Sep == 0 && isspace(c)) || c == Sep)\r
+                       {\r
+                       if (!s.empty() || Sep != 0)\r
+                               Fields.push_back(s);\r
+                       s.clear();\r
+                       }\r
+               else\r
+                       s.push_back(c);\r
+               }\r
+       if (!s.empty())\r
+               Fields.push_back(s);\r
+       }\r
+\r
+static void GetArgsFromFile(const string &FileName, vector<string> &Args)\r
+       {\r
+       Args.clear();\r
+\r
+       FILE *f = OpenStdioFile(FileName);\r
+       string Line;\r
+       while (ReadLineStdioFile(f, Line))\r
+               {\r
+               size_t n = Line.find('#');\r
+               if (n != string::npos)\r
+                       Line = Line.substr(0, n);\r
+               vector<string> Fields;\r
+               Split(Line, Fields);\r
+               Args.insert(Args.end(), Fields.begin(), Fields.end());\r
+               }\r
+       CloseStdioFile(f);\r
+       }\r
+\r
+void MyCmdLine(int argc, char **argv)\r
+       {\r
+       static unsigned RecurseDepth = 0;\r
+       ++RecurseDepth;\r
+\r
+       DefineFlagOpt("compilerinfo", "Write info about compiler types and #defines to stdout.",\r
+         (void *) &opt_compilerinfo, &optset_compilerinfo);\r
+       DefineFlagOpt("quiet", "Turn off progress messages.", (void *) &opt_quiet, &optset_quiet);\r
+       DefineFlagOpt("version", "Show version and exit.", (void *) &opt_version, &optset_version);\r
+       DefineFlagOpt("logopts", "Log options.", (void *) &opt_logopts, &optset_logopts);\r
+       DefineFlagOpt("help", "Display command-line options.", (void *) &opt_help, &optset_help);\r
+       DefineStrOpt("log", "", "Log file name.", (void *) &opt_log, &optset_log);\r
+\r
+#undef FLAG_OPT\r
+#undef TOG_OPT\r
+#undef INT_OPT\r
+#undef UNS_OPT\r
+#undef FLT_OPT\r
+#undef STR_OPT\r
+#undef ENUM_OPT\r
+#define FLAG_OPT(LongName)                                             DefineFlagOpt(#LongName, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define TOG_OPT(LongName, Default)                             DefineTogOpt(#LongName, Default, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define INT_OPT(LongName, Default, Min, Max)   DefineIntOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define UNS_OPT(LongName, Default, Min, Max)   DefineUnsOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define FLT_OPT(LongName, Default, Min, Max)   DefineFloatOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define STR_OPT(LongName, Default)                             DefineStrOpt(#LongName, Default, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define ENUM_OPT(LongName, Values, Default)            DefineEnumOpt(#LongName, Values, Default, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#include "myopts.h"\r
+\r
+       if (RecurseDepth == 0)\r
+               g_Argv.clear();\r
+\r
+       for (int i = 0; i < argc; ++i) {\r
+               g_Argv.push_back(string(argv[i]));\r
+       }\r
+\r
+       int i = 1;\r
+       for (;;)\r
+               {\r
+               if (i >= argc)\r
+                       break;\r
+               const string &Arg = g_Argv[i];\r
+                       \r
+               if (Arg.empty())\r
+                       continue;\r
+               else if (Arg == "file:" && i + 1 < argc)\r
+                       {\r
+                       const string &FileName = g_Argv[i+1];\r
+                       vector<string> Args;\r
+                       GetArgsFromFile(FileName, Args);\r
+                       for (vector<string>::const_iterator p = Args.begin();\r
+                         p != Args.end(); ++p)\r
+                               {\r
+                               g_Argv.push_back(*p);\r
+                               ++argc;\r
+                               }\r
+                       i += 2;\r
+                       continue;\r
+                       }\r
+               else if (Arg.size() > 1 && Arg[0] == '-')\r
+                       {\r
+                       string LongName = (Arg.size() > 2 && Arg[1] == '-' ? Arg.substr(2) : Arg.substr(1));\r
+                       OptInfo Opt = *GetOptInfo(LongName, true);\r
+                       *Opt.OptSet = true;\r
+                       if (Opt.Type == OT_Flag)\r
+                               {\r
+                               g_Opts.erase(Opt);\r
+                               *(bool *) Opt.Value = true;\r
+                               g_Opts.insert(Opt);\r
+                               ++i;\r
+                               continue;\r
+                               }\r
+                       else if (Opt.Type == OT_Tog)\r
+                               {\r
+                               g_Opts.erase(Opt);\r
+                               if (string("no") + Opt.LongName == LongName)\r
+                                       *(bool *) Opt.Value = false;\r
+                               else\r
+                                       {\r
+                                       asserta(Opt.LongName == LongName);\r
+                                       *(bool *) Opt.Value = true;\r
+                                       }\r
+                               g_Opts.insert(Opt);\r
+                               ++i;\r
+                               continue;\r
+                               }\r
+\r
+                       ++i;\r
+                       if (i >= argc)\r
+                               CmdLineErr("Missing value for option --%s", LongName.c_str());\r
+\r
+                       string Value = g_Argv[i];\r
+                       SetOpt(Opt, Value);\r
+\r
+                       ++i;\r
+                       continue;\r
+                       }\r
+               else\r
+                       CmdLineErr("Expected -option_name or --option_name, got '%s'", Arg.c_str());\r
+               }\r
+\r
+       --RecurseDepth;\r
+       if (RecurseDepth > 0)\r
+               return;\r
+\r
+       if (opt_help)\r
+               Help();\r
+\r
+       if (opt_compilerinfo)\r
+               CompilerInfo();\r
+\r
+       SetLogFileName(opt_log);\r
+\r
+       if (opt_log != "")\r
+               {\r
+               for (int i = 0; i < argc; ++i)\r
+                       Log("%s%s", i == 0 ? "" : " ", g_Argv[i].c_str());\r
+               Log("\n");\r
+               time_t Now = time(0);\r
+               struct tm *t = localtime(&Now);\r
+               const char *s = asctime(t);\r
+               Log("Started %s", s); // there is a newline in s\r
+               Log("Version " MY_VERSION ".%s\n", SVN_VERSION);\r
+               Log("\n");\r
+               }\r
+\r
+       if (opt_logopts)\r
+               LogOpts();\r
+       }\r
+\r
+// Returns x as a percentage of y, or 0 when y is zero.\r
+double Pct(double x, double y)\r
+       {\r
+       const bool ZeroDenominator = (y == 0.0f);\r
+       return ZeroDenominator ? 0.0f : (x*100.0f)/y;\r
+       }\r
+\r
+void GetCmdLine(string &s)\r
+       {\r
+       s.clear();\r
+       for (unsigned i = 0; i < SIZE(g_Argv); ++i)\r
+               {\r
+               if (i > 0)\r
+                       s += " ";\r
+               s += g_Argv[i];\r
+               }\r
+       }\r
+\r
+// Returns a copy of the NUL-terminated string s in memory obtained through\r
+// myalloc.\r
+char *mystrsave(const char *s)\r
+       {\r
+       const unsigned Bytes = unsigned(strlen(s)) + 1; // text plus terminating NUL\r
+       char *Copy = myalloc(char, Bytes);\r
+       memcpy(Copy, s, Bytes);\r
+       return Copy;\r
+       }\r
+\r
+void Logu(unsigned u, unsigned w, unsigned prefixspaces)\r
+       {\r
+       for (unsigned i = 0; i < prefixspaces; ++i)\r
+               Log(" ");\r
+       if (u == UINT_MAX)\r
+               Log("%*.*s", w, w, "*");\r
+       else\r
+               Log("%*u", w, u);\r
+       }\r
+\r
+void Logf(float x, unsigned w, unsigned prefixspaces)\r
+       {\r
+       for (unsigned i = 0; i < prefixspaces; ++i)\r
+               Log(" ");\r
+       if (x == FLT_MAX)\r
+               Log("%*.*s", w, w, "*");\r
+       else\r
+               Log("%*.2f", w, x);\r
+       }\r
+\r
+static uint32 g_SLCG_state = 1;\r
+\r
+// Numerical values used by Microsoft C, according to wikipedia:\r
+// http://en.wikipedia.org/wiki/Linear_congruential_generator\r
+static uint32 g_SLCG_a = 214013;\r
+static uint32 g_SLCG_c = 2531011;\r
+\r
+// Simple Linear Congruential Generator\r
+// Bad properties; used just to initialize the better generator.\r
+// Advances the linear congruential state by one step (state*a + c in uint32\r
+// arithmetic) and returns the new state.\r
+static uint32 SLCG_rand()\r
+       {\r
+       const uint32 Next = g_SLCG_state*g_SLCG_a + g_SLCG_c;\r
+       g_SLCG_state = Next;\r
+       return Next;\r
+       }\r
+\r
+// Seeds the linear congruential generator and discards its first ten outputs.\r
+static void SLCG_srand(uint32 Seed)\r
+       {\r
+       g_SLCG_state = Seed;\r
+       int Discard = 10;\r
+       while (Discard-- > 0)\r
+               SLCG_rand();\r
+       }\r
+\r
+/***\r
+A multiply-with-carry random number generator, see:\r
+http://en.wikipedia.org/wiki/Multiply-with-carry\r
+\r
+The particular multipliers used here were found on\r
+the web where they are attributed to George Marsaglia.\r
+***/\r
+\r
+static bool g_InitRandDone = false;\r
+static uint32 g_X[5];\r
+\r
+uint32 RandInt32()\r
+       {\r
+       InitRand();\r
+\r
+       uint64 Sum = 2111111111*(uint64) g_X[3] + 1492*(uint64) g_X[2] +\r
+         1776*(uint64) g_X[1] + 5115*(uint64) g_X[0] + g_X[4];\r
+       g_X[3] = g_X[2];\r
+       g_X[2] = g_X[1];\r
+       g_X[1] = g_X[0];\r
+       g_X[4] = (uint32) (Sum >> 32);\r
+       g_X[0] = (uint32) Sum;\r
+       return g_X[0];\r
+       }\r
+\r
+// Same generator as RandInt32(), with the result typed as unsigned.\r
+unsigned randu32()\r
+       {\r
+       const uint32 r = RandInt32();\r
+       return (unsigned) r;\r
+       }\r
+\r
+void InitRand()\r
+       {\r
+       if (g_InitRandDone)\r
+               return;\r
+// Do this first to avoid recursion\r
+       g_InitRandDone = true;\r
+\r
+       unsigned Seed = (optset_randseed ? opt_randseed : (unsigned) (time(0)*getpid()));\r
+       Log("RandSeed=%u\n", Seed);\r
+       SLCG_srand(Seed);\r
+\r
+       for (unsigned i = 0; i < 5; i++)\r
+               g_X[i] = SLCG_rand();\r
+\r
+       for (unsigned i = 0; i < 100; i++)\r
+               RandInt32();\r
+       }\r
+\r
+// MUST COME AT END BECAUSE OF #undef\r
+#if    RCE_MALLOC\r
+#undef mymalloc\r
+#undef myfree\r
+#undef myfree2\r
+// RCE_MALLOC build: forwards the allocation, with the caller's file name and\r
+// line number, to rce_malloc().\r
+void *mymalloc(unsigned bytes, const char *FileName, int Line)\r
+       {\r
+       void *rce_malloc(unsigned bytes, const char *FileName, int Line);\r
+       return rce_malloc(bytes, FileName, Line);\r
+       }\r
+\r
+// RCE_MALLOC build: forwards the release, with the caller's file name and line\r
+// number, to rce_free().\r
+void myfree(void *p, const char *FileName, int Line)\r
+       {\r
+       void rce_free(void *p, const char *FileName, int Line);\r
+       rce_free(p, FileName, Line);\r
+       }\r
+\r
+// RCE_MALLOC build: same as myfree(); the bytes argument is not used.\r
+void myfree2(void *p, unsigned bytes, const char *FileName, int Line)\r
+       {\r
+       void rce_free(void *p, const char *FileName, int Line);\r
+       rce_free(p, FileName, Line);\r
+       }\r
+\r
+#else // RCE_MALLOC\r
+void *mymalloc(unsigned bytes)\r
+       {\r
+       ++g_NewCalls;\r
+       if (g_InitialMemUseBytes == 0)\r
+               g_InitialMemUseBytes = GetMemUseBytes();\r
+\r
+       g_TotalAllocBytes += bytes;\r
+       g_NetBytes += bytes;\r
+       if (g_NetBytes > g_MaxNetBytes)\r
+               {\r
+               if (g_NetBytes > g_MaxNetBytes + 10000000)\r
+                       GetMemUseBytes();//to force update of peak\r
+               g_MaxNetBytes = g_NetBytes;\r
+               }\r
+       void *p = malloc(bytes);\r
+       //void *p = _malloc_dbg(bytes, _NORMAL_BLOCK, __FILE__, __LINE__);\r
+       if (0 == p)\r
+               {\r
+               double b = GetMemUseBytes();\r
+               fprintf(stderr, "\nOut of memory mymalloc(%u), curr %.3g bytes",\r
+                 (unsigned) bytes, b);\r
+               void LogAllocs();\r
+               LogAllocs();\r
+#if DEBUG && defined(_MSC_VER)\r
+               asserta(_CrtCheckMemory());\r
+#endif\r
+               Die("Out of memory, mymalloc(%u), curr %.3g bytes\n",\r
+                 (unsigned) bytes, b);\r
+               }\r
+       return p;\r
+       }\r
+\r
+// Releases p with free(); a null pointer is ignored. No counters are updated.\r
+void myfree(void *p)\r
+       {\r
+       if (p != 0)\r
+               free(p);\r
+       //_free_dbg(p, _NORMAL_BLOCK);\r
+       }\r
+\r
+// Releases p with free() and updates the free counters by bytes. The counters\r
+// are updated even when p is null, in which case nothing is freed.\r
+void myfree2(void *p, unsigned bytes)\r
+       {\r
+       ++g_FreeCalls;\r
+       g_TotalFreeBytes += bytes;\r
+       g_NetBytes -= bytes;\r
+\r
+       if (p != 0)\r
+               free(p);\r
+       }\r
+#endif\r
diff --git a/uchime_src/myutils.h b/uchime_src/myutils.h

new file mode 100644 (file)

index 0000000..b63ad3c
--- /dev/null
+++ b/uchime_src/myutils.h
@@ -0,0 +1,274 @@
+#ifndef myutils_h\r
+#define myutils_h\r
+\r
+#define RCE_MALLOC     0\r
+
+#include <stdio.h>\r
+#include <sys/types.h>\r
+#include <string>\r
+#include <string.h>\r
+#include <memory.h>\r
+#include <vector>\r
+#include <math.h>\r
+#include <stdarg.h>\r
+#include <cstdlib>\r
+#include <climits>\r
+\r
+#ifndef _MSC_VER\r
+#include <inttypes.h>\r
+#endif\r
+\r
+using namespace std;\r
+\r
+#ifdef _MSC_VER\r
+#include <crtdbg.h>\r
+#pragma warning(disable: 4996) // deprecated functions\r
+#define _CRT_SECURE_NO_DEPRECATE       1\r
+#endif\r
+\r
+#if defined(_DEBUG) && !defined(DEBUG)\r
+#define DEBUG  1\r
+#endif\r
+\r
+#if defined(DEBUG) && !defined(_DEBUG)\r
+#define _DEBUG 1\r
+#endif\r
+\r
+#ifndef NDEBUG\r
+#define        DEBUG   1\r
+#define        _DEBUG  1\r
+#endif\r
+\r
+typedef unsigned char byte;\r
+typedef unsigned short uint16;\r
+typedef unsigned uint32;\r
+typedef int int32;\r
+typedef double float32;\r
+typedef signed char int8;\r
+typedef unsigned char uint8;\r
+\r
+#ifdef _MSC_VER\r
+\r
+typedef __int64 int64;\r
+typedef unsigned __int64 uint64;\r
+\r
+#define INT64_PRINTF           "lld"\r
+#define UINT64_PRINTF          "llu"\r
+\r
+#define SIZE_T_PRINTF          "u"\r
+#define OFF64_T_PRINTF         "lld"\r
+\r
+#define INT64_PRINTFX          "llx"\r
+#define UINT64_PRINTFX         "llx"\r
+\r
+#define SIZE_T_PRINTFX         "x"\r
+#define OFF64_T_PRINTFX                "llx"\r
+\r
+#elif defined(__x86_64__)\r
+\r
+typedef long int64;\r
+typedef unsigned long uint64;\r
+\r
+#define INT64_PRINTF           "ld"\r
+#define UINT64_PRINTF          "lu"\r
+\r
+#define SIZE_T_PRINTF          "lu"\r
+#define OFF64_T_PRINTF         "ld"\r
+\r
+#define INT64_PRINTFX          "lx"\r
+#define UINT64_PRINTFX         "lx"\r
+\r
+#define SIZE_T_PRINTFX         "lx"\r
+#define OFF64_T_PRINTFX                "lx"\r
+\r
+#else\r
+\r
+typedef long long int64;\r
+typedef unsigned long long uint64;\r
+\r
+#define INT64_PRINTF           "lld"\r
+#define UINT64_PRINTF          "llu"\r
+\r
+#define SIZE_T_PRINTF          "u"\r
+#define OFF64_T_PRINTF         "lld"\r
+\r
+#define INT64_PRINTFX          "llx"\r
+#define UINT64_PRINTFX         "llx"\r
+\r
+#define SIZE_T_PRINTFX         "x"\r
+#define OFF64_T_PRINTFX                "llx"\r
+#endif\r
+\r
+#define d64            INT64_PRINTF\r
+#define        u64             UINT64_PRINTF\r
+#define        x64             UINT64_PRINTFX\r
+\r
+// const uint64 UINT64_MAX                     = (~((uint64) 0));\r
+\r
+void myassertfail(const char *Exp, const char *File, unsigned Line);\r
+#undef  assert\r
+#ifdef  NDEBUG\r
+#define assert(exp)     ((void)0)\r
+#define myassert(exp)     ((void)0)\r
+#else\r
+#define assert(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) )\r
+#define myassert(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) )\r
+#endif\r
+#define asserta(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) )\r
+\r
+#define ureturn(x)     return (x)\r
+\r
+#define NotUsed(v)     ((void *) &v)\r
+\r
+// pom=plus or minus, tof=true or false\r
+static inline char pom(bool Plus)      { return Plus ? '+' : '-'; }\r
+static inline char tof(bool x)         { return x ? 'T' : 'F'; }\r
+static inline char yon(bool x)         { return x ? 'Y' : 'N'; }\r
+unsigned GetElapsedSecs();\r
+\r
+#if    RCE_MALLOC\r
+\r
+void *rce_malloc(unsigned bytes, const char *FileName, int Line);\r
+void rce_free(void *p, const char *FileName, int LineNr);\r
+void rce_chkmem();\r
+\r
+void rce_dumpmem_(const char *FileName, int LineNr);\r
+#define rce_dumpmem()          rce_dumpmem_(__FILE__, __LINE__)\r
+\r
+void rce_assertvalidptr_(void *p, const char *FileName, int LineNr);\r
+#define rce_assertvalidptr(p)  rce_assertvalidptr_(p, __FILE__, __LINE__)\r
+\r
+void rce_dumpptr_(void *p, const char *FileName, int LineNr);\r
+#define rce_dumpptr(p) rce_dumpptr_(p, __FILE__, __LINE__)\r
+\r
+#define mymalloc(n)            rce_malloc((n), __FILE__, __LINE__)\r
+#define myfree(p)              rce_free(p, __FILE__, __LINE__)\r
+#define myfree2(p,n)   rce_free(p, __FILE__, __LINE__)\r
+#define myalloc(t, n)  (t *) rce_malloc((n)*sizeof(t), __FILE__, __LINE__)\r
+\r
+#else // RCE_MALLOC\r
+void *mymalloc(unsigned bytes);\r
+void myfree2(void *p, unsigned Bytes);\r
+void myfree(void *p);\r
+#define rce_chkmem()   /* empty */\r
+#define myalloc(t, n)  (t *) mymalloc((n)*sizeof(t))\r
+#endif // RCE_MALLOC\r
+\r
+#define SIZE(c)        unsigned((c).size())\r
+\r
+bool myisatty(int fd);\r
+\r
+#ifdef _MSC_VER\r
+#define off_t  __int64\r
+#endif\r
+\r
+FILE *OpenStdioFile(const string &FileName);\r
+FILE *CreateStdioFile(const string &FileName);\r
+bool CanSetStdioFilePos(FILE *f);\r
+void CloseStdioFile(FILE *f);\r
+void SetStdioFilePos(FILE *f, off_t Pos);\r
+void ReadStdioFile(FILE *f, off_t Pos, void *Buffer, unsigned Bytes);\r
+void ReadStdioFile(FILE *f, void *Buffer, unsigned Bytes);\r
+void WriteStdioFile(FILE *f, off_t Pos, const void *Buffer, unsigned Bytes);\r
+void WriteStdioFile(FILE *f, const void *Buffer, unsigned Bytes);\r
+bool ReadLineStdioFile(FILE *f, char *Line, unsigned Bytes);\r
+bool ReadLineStdioFile(FILE *f, string &Line);\r
+byte *ReadAllStdioFile(FILE *f, off_t &FileSize);\r
+byte *ReadAllStdioFile(const string &FileName, off_t &FileSize);\r
+void AppendStdioFileToFile(FILE *fFrom, FILE *fTo);\r
+void FlushStdioFile(FILE *f);\r
+bool StdioFileExists(const string &FileName);\r
+off_t GetStdioFilePos(FILE *f);\r
+off_t GetStdioFileSize(FILE *f);\r
+void LogStdioFileState(FILE *f);\r
+void RenameStdioFile(const string &FileNameFrom, const string &FileNameTo);\r
+void DeleteStdioFile(const string &FileName);\r
+\r
+void myvstrprintf(string &Str, const char *szFormat, va_list ArgList);\r
+void myvstrprintf(string &Str, const char *szFormat, ...);\r
+\r
+void SetLogFileName(const string &FileName);\r
+void Log(const char *szFormat, ...);\r
+\r
+void Die(const char *szFormat, ...);\r
+void Warning(const char *szFormat, ...);\r
+\r
+void ProgressStep(unsigned i, unsigned N, const char *Format, ...);\r
+void Progress(const char *szFormat, ...);\r
+void Progress(const string &Str);\r
+void ProgressLog(const char *szFormat, ...);\r
+void ProgressExit();\r
+\r
+char *mystrsave(const char *s);\r
+\r
+double GetPeakMemUseBytes();\r
+\r
+// Are two floats equal to within epsilon?\r
+const double epsilon = 0.01;\r
+inline bool feq(double x, double y, double epsilon)\r
+       {\r
+       if (fabs(x) > 10000)\r
+               epsilon = fabs(x)/10000;\r
+       if (fabs(x - y) > epsilon)\r
+               return false;\r
+       return true;\r
+       }\r
+\r
+inline bool feq(double x, double y)\r
+       {\r
+       if (x < -1e6 && y < -1e6)\r
+               return true;\r
+       double e = epsilon;\r
+       if (fabs(x) > 10000)\r
+               e = fabs(x)/10000;\r
+       if (fabs(x - y) > e)\r
+               return false;\r
+       return true;\r
+       }\r
+\r
+#define asserteq(x, y) assert(feq(x, y))\r
+#define assertaeq(x, y)        asserta(feq(x, y))\r
+\r
+// Zero the first n elements of array a. Both arguments are parenthesized so\r
+// that expressions such as zero(p, n + 1) size the fill correctly.\r
+#define        zero(a, n)      memset((a), 0, (n)*sizeof((a)[0]))\r
+\r
+void InitRand();\r
+unsigned randu32();\r
+void Split(const string &Str, vector<string> &Fields, char Sep = 0);\r
+double Pct(double x, double y);\r
+double GetMemUseBytes();\r
+const char *MemBytesToStr(double Bytes);\r
+const char *IntToStr(unsigned i);\r
+const char *FloatToStr(double d);\r
+const char *SecsToStr(double Secs);\r
+void Logu(unsigned u, unsigned w, unsigned prefixspaces = 2);\r
+void Logf(float x, unsigned w, unsigned prefixspaces = 2);\r
+const char *SecsToHHMMSS(int Secs);\r
+\r
+void MyCmdLine(int argc, char **argv);\r
+void CmdLineErr(const char *Format, ...);\r
+void Help();\r
+void GetCmdLine(string &s);\r
+\r
+#define FLAG_OPT(LongName)                                             extern bool opt_##LongName; extern bool optset_##LongName;\r
+#define TOG_OPT(LongName, Default)                             extern bool opt_##LongName; extern bool optset_##LongName;\r
+#define INT_OPT(LongName, Default, Min, Max)   extern int opt_##LongName; extern bool optset_##LongName;\r
+#define UNS_OPT(LongName, Default, Min, Max)   extern unsigned opt_##LongName; extern bool optset_##LongName;\r
+#define FLT_OPT(LongName, Default, Min, Max)   extern double opt_##LongName; extern bool optset_##LongName;\r
+#define STR_OPT(LongName, Default)                             extern string opt_##LongName; extern bool optset_##LongName;\r
+#define ENUM_OPT(LongName, Default, Values)            extern int opt_##LongName; extern bool optset_##LongName;\r
+#include "myopts.h"\r
+#undef FLAG_OPT\r
+#undef TOG_OPT\r
+#undef INT_OPT\r
+#undef UNS_OPT\r
+#undef FLT_OPT\r
+#undef STR_OPT\r
+#undef ENUM_OPT\r
+\r
+extern const char *SVN_VERSION;\r
+extern const char *SVN_MODS;\r
+extern bool opt_quiet;
+extern bool opt_version;
+extern FILE *g_fLog;
+\r
+#endif // myutils_h\r
diff --git a/uchime_src/orf.h b/uchime_src/orf.h

new file mode 100644 (file)

index 0000000..90b29d1
--- /dev/null
+++ b/uchime_src/orf.h
@@ -0,0 +1,37 @@
+#ifndef orf_h\r
+#define orf_h\r
+\r
+#include "alpha.h"\r
+\r
+// Record for one open reading frame: pointers to a nucleotide sequence and\r
+// an amino-acid sequence, their lengths, a frame number and a nucleotide\r
+// interval, plus a Next pointer so records can be chained.\r
+// NOTE(review): the member functions are only declared here; the coordinate\r
+// conventions (0- or 1-based, inclusive or exclusive NucHi) must be\r
+// confirmed in their implementation.\r
+struct ORFData\r
+       {\r
+       const byte *NucSeq;\r
+       const byte *AminoSeq;\r
+       int Frame;\r
+       unsigned NucL;\r
+       unsigned AminoL;\r
+       unsigned NucLo;\r
+       unsigned NucHi;\r
+       ORFData *Next;\r
+\r
+// Conversions between nucleotide and amino-acid positions (defined elsewhere).\r
+       unsigned GetNucPosFirstBase() const;\r
+       unsigned GetAAPos(unsigned NucPos) const;\r
+       unsigned GetCodex(unsigned NucPos) const;\r
+       unsigned GetNucLo(unsigned AALo, unsigned AAHi) const;\r
+       unsigned GetNucHi(unsigned AALo, unsigned AAHi) const;\r
+       unsigned GetAALo(unsigned NucLo, unsigned NucHi) const;\r
+       unsigned GetAAHi(unsigned NucLo, unsigned NucHi) const;\r
+       unsigned GetNucPosFirstBaseInCodon(unsigned AAPos) const;\r
+       unsigned GetNucPosLastBaseInCodon(unsigned AAPos) const;\r
+       unsigned RoundToCodonLo(unsigned NucPos) const;\r
+       unsigned RoundToCodonHi(unsigned NucPos) const;\r
+       void LogMe() const;\r
+       void LogMe2() const;\r
+       };\r
+\r
+const byte ORFEND = '.';\r
+\r
+void GetORFs(const byte *NucSeq, unsigned NucL, vector<ORFData> &ORFs,\r
+  unsigned ORFStyle, int FindFrame, int Sign);\r
+\r
+#endif // orf_h\r
diff --git a/uchime_src/out.h b/uchime_src/out.h

new file mode 100644 (file)

index 0000000..4ca50c7
--- /dev/null
+++ b/uchime_src/out.h
@@ -0,0 +1,134 @@
+#ifndef out_h\r
+#define out_h\r
+\r
+#include "seq.h"\r
+#include "hsp.h"\r
+#include "orf.h"\r
+#include "path.h"\r
+#include <float.h>\r
+\r
+// One alignment between two sequences (SA, SB) together with its HSP, its\r
+// path string, an identity description and its scores.\r
+struct AlnData\r
+       {\r
+/***\r
+SA.Seq and SB.Seq align.\r
+Reverse strand stuff for nucleotides is handled like this:\r
+       SA.RevComp must be false.\r
+       If SB.RevComp is true, then SA.Seq is r.c.'d relative to the sequence in\r
+       the input file (query or db). If so, coordinates in HSP refer to SB.Seq\r
+       so are also r.c.'d relative to the original sequence.\r
+NOTE(review): "SA.Seq is r.c.'d" probably should read "SB.Seq is r.c.'d",\r
+since SA.RevComp is required to be false -- confirm against the callers.\r
+***/\r
+       SeqData SA;\r
+       SeqData SB;\r
+       HSPData HSP;\r
+       const char *Path;\r
+       char IdDesc[256];\r
+\r
+       float FractId;\r
+       float RawScore;\r
+       float BitScore;\r
+       float Evalue;\r
+\r
+// Writes "AD: ", the HSP and the two sequence labels to the log.\r
+       void LogMe() const\r
+               {\r
+               Log("AD: ");\r
+               HSP.LogMe();\r
+               Log(" %s,%s\n", SA.Label, SB.Label);\r
+               }\r
+       };\r
+\r
+bool OnDerepHit(const SeqData &SA, const SeqData &SB);\r
+\r
+bool OnLocalUngappedHit(const SeqData &SA, const SeqData &SB,\r
+  const HSPData &HSP, float &Evalue, float &FractId);\r
+\r
+bool OnLocalGappedHit(const SeqData &SA, const SeqData &SB,\r
+  const HSPData &HSP, const PathData &PD, float &Evalue, float &FractId);\r
+\r
+bool OnGlobalHit(const SeqData &SA, const SeqData &SB, const PathData &PD,\r
+  float &FractId);\r
+\r
+void OnReject(const SeqData &SA, const SeqData &SB, double FractId,\r
+  const char *Path);\r
+\r
+void OnNotMatched(const char *Label, unsigned L);\r
+void OnNewCluster(unsigned ClusterIndex, const char *Label, unsigned L);\r
+void OnNewLibCluster(unsigned ClusterIndex, const char *Label, unsigned L);\r
+void OnLibCluster(unsigned ClusterIndex, unsigned Size, double AvgId,\r
+  const char *Label);\r
+void OnNewCluster(unsigned ClusterIndex, unsigned Size, double AvgId,\r
+  const char *Label);\r
+void OnChainCov(const SeqData &NucleoSD, const SeqData &TargetSD,\r
+  float Score, float ChainCov);\r
+\r
+void SetUserFieldIndexes(const string &s);\r
+\r
+void BlastOut(FILE *f, const AlnData &AD);\r
+void Blast6Out(FILE *f, const AlnData &AD);\r
+void FastaPairOut(FILE *f, const AlnData &AD);\r
+void UserOut(FILE *f, const AlnData &AD);\r
+\r
+void BlastOutORF(FILE *f, const AlnData &AD);\r
+\r
+void OpenOutputFiles();\r
+void CloseOutputFiles();\r
+void SetLibSeedCount(unsigned DBSeqCount);\r
+const char *UserFieldIndexToStr(unsigned i);\r
+\r
+extern float **g_SubstMx;\r
+\r
+static char g_IdChar = '|';\r
+static char g_DiffChar = ' ';\r
+\r
+// Annotation symbol for a nucleotide column: g_IdChar when the two letters\r
+// are equal ignoring case, g_DiffChar otherwise.\r
+static inline char GetSymN(byte Letter1, byte Letter2)\r
+       {\r
+       const byte a = (byte) toupper(Letter1);\r
+       const byte b = (byte) toupper(Letter2);\r
+       return (a == b) ? g_IdChar : g_DiffChar;\r
+       }\r
+\r
+static inline char GetSymA(byte Letter1, byte Letter2)\r
+       {\r
+       Letter1 = toupper(Letter1);\r
+       Letter2 = toupper(Letter2);\r
+       if (Letter1 == Letter2)\r
+               return '|';\r
+\r
+       float Score = g_SubstMx[Letter1][Letter2];\r
+       if (Score >= 2.0f)\r
+               return ':';\r
+       if (Score > 0.0f)\r
+               return '.';\r
+       return ' ';\r
+       }\r
+\r
+// Dispatches to the nucleotide (GetSymN) or amino-acid (GetSymA) rule.\r
+static inline char GetSym(byte Letter1, byte Letter2, bool Nucleo)\r
+       {\r
+       return Nucleo ? GetSymN(Letter1, Letter2) : GetSymA(Letter1, Letter2);\r
+       }\r
+\r
+static unsigned GetNDig(unsigned n)\r
+       {\r
+       if (n < 10)\r
+               return 1;\r
+       if (n < 100)\r
+               return 2;\r
+       if (n < 1000)\r
+               return 3;\r
+       if (n < 10000)\r
+               return 4;\r
+       if (n < 100000)\r
+               return 5;\r
+       if (n < 1000000)\r
+               return 6;\r
+       return 10;\r
+       }\r
+\r
+extern unsigned *g_UserFieldIndexes;\r
+extern unsigned g_UserFieldCount;\r
+\r
+#endif // out_h\r
diff --git a/uchime_src/path.cpp b/uchime_src/path.cpp

new file mode 100644 (file)

index 0000000..9340344
--- /dev/null
+++ b/uchime_src/path.cpp
@@ -0,0 +1,151 @@
+#include "myutils.h"\r
+#include "path.h"\r
+#include "timing.h"\r
+\r
+#define TRACE  0\r
+\r
+const unsigned PathMagic = 0x9A783A16;\r
+\r
+// One slot of the pooled buffer table g_PathBuffers.\r
+struct PathBuffer\r
+       {\r
+// Set to PathMagic when the slot is created; asserted by AllocBuffer.\r
+       unsigned Magic;\r
+// Character storage; 0 until the slot is first given a buffer.\r
+       char *Buffer;\r
+// Capacity of Buffer in bytes (0 while Buffer is 0).\r
+       unsigned Size;\r
+// True while the buffer is handed out; cleared by FreeBuffer.\r
+       bool InUse;\r
+       };\r
+\r
+static PathBuffer **g_PathBuffers;\r
+static unsigned g_PathBufferSize;\r
+\r
+static char *AllocBuffer(unsigned Size)\r
+       {\r
+       if (Size == 0)\r
+               return 0;\r
+\r
+// Is a free buffer that is big enough?\r
+       for (unsigned i = 0; i < g_PathBufferSize; ++i)\r
+               {\r
+               PathBuffer *PB = g_PathBuffers[i];\r
+               asserta(PB->Magic == PathMagic);\r
+               if (!PB->InUse)\r
+                       {\r
+                       if (PB->Size >= Size)\r
+                               {\r
+                               PB->InUse = true;\r
+                               return PB->Buffer;\r
+                               }\r
+                       if (PB->Buffer == 0)\r
+                               {\r
+                               unsigned Size2 = Size + 1024;\r
+                               PB->Buffer = MYALLOC(char, Size2, Path);\r
+                               PB->Size = Size2;\r
+                               PB->InUse = true;\r
+                               return PB->Buffer;\r
+                               }\r
+                       }\r
+               }\r
+\r
+// No available buffer, must expand g_PathBuffers[]\r
+       unsigned NewPathBufferSize = g_PathBufferSize + 1024;\r
+       PathBuffer **NewPathBuffers = MYALLOC(PathBuffer *, NewPathBufferSize, Path);\r
+       \r
+       for (unsigned i = 0; i < g_PathBufferSize; ++i)\r
+               NewPathBuffers[i] = g_PathBuffers[i];\r
+\r
+       for (unsigned i = g_PathBufferSize; i < NewPathBufferSize; ++i)\r
+               {\r
+               PathBuffer *PB = MYALLOC(PathBuffer, 1, Path);\r
+               PB->Magic = PathMagic;\r
+               PB->Buffer = 0;\r
+               PB->Size = 0;\r
+               PB->InUse = false;\r
+               NewPathBuffers[i] = PB;\r
+               }\r
+\r
+       PathBuffer *PB = NewPathBuffers[g_PathBufferSize];\r
+\r
+       MYFREE(g_PathBuffers, g_PathBufferSize, Path);\r
+       g_PathBuffers = NewPathBuffers;\r
+       g_PathBufferSize = NewPathBufferSize;\r
+\r
+       asserta(!PB->InUse && PB->Buffer == 0);\r
+\r
+       unsigned Size2 = Size + 1024;\r
+       PB->Buffer = MYALLOC(char, Size2, Path);\r
+       PB->Size = Size2;\r
+       PB->InUse = true;\r
+       return PB->Buffer;\r
+       }\r
+\r
+static void FreeBuffer(char *Buffer)\r
+       {\r
+       if (Buffer == 0)\r
+               return;\r
+\r
+       for (unsigned i = 0; i < g_PathBufferSize; ++i)\r
+               {\r
+               PathBuffer *PB = g_PathBuffers[i];\r
+               if (PB->Buffer == Buffer)\r
+                       {\r
+                       asserta(PB->InUse);\r
+                       PB->InUse = false;\r
+                       return;\r
+                       }\r
+               }\r
+\r
+       Die("FreeBuffer, not found");\r
+       }\r
+\r
+void PathData::Alloc(unsigned MaxLen)\r
+       {\r
+       if (MaxLen < Bytes)\r
+               return;\r
+\r
+       StartTimer(PathAlloc);\r
+       if (Bytes > 0)\r
+               {\r
+               FreeBuffer(Front);\r
+               }\r
+\r
+       Bytes = MaxLen + 1;\r
+       Front = AllocBuffer(Bytes);\r
+       Back = Front + Bytes - 1;\r
+       Start = 0;\r
+       EndTimer(PathAlloc);\r
+       }\r
+\r
+// Returns the buffer to the pool and resets every member, leaving an empty\r
+// path with no storage attached. Safe to call repeatedly.\r
+void PathData::Free()\r
+       {\r
+       FreeBuffer(Front);\r
+       Front = 0;\r
+       Start = 0;\r
+       Back = 0;\r
+// Capacity goes with the buffer, so that a later Alloc() never treats the\r
+// released capacity as still available.\r
+       Bytes = 0;\r
+       }\r
+\r
+// Makes this path a copy of rhs, preserving the offset of Start within the\r
+// buffer. Copying an empty path (no buffer, or Start == 0) leaves this path\r
+// empty; copying a path onto itself does nothing.\r
+void PathData::Copy(const PathData &rhs)\r
+       {\r
+       if (this == &rhs)\r
+               return;\r
+\r
+       if (rhs.Front == 0 || rhs.Start == 0)\r
+               {\r
+               Start = 0;\r
+               return;\r
+               }\r
+\r
+// No buffer held means no capacity, whatever Bytes currently says.\r
+       if (Front == 0)\r
+               Bytes = 0;\r
+       Alloc(rhs.Bytes);\r
+\r
+// Copy the whole buffer rather than a C string starting at Front: the path\r
+// text begins at Start, which need not coincide with Front.\r
+       memcpy(Front, rhs.Front, rhs.Bytes);\r
+       Start = Front + (rhs.Start - rhs.Front);\r
+       }\r
+\r
+// Loads a NUL-terminated path string into this object; afterwards Start\r
+// points at the beginning of the buffer. PathStr must not be null.\r
+void PathData::FromStr(const char *PathStr)\r
+       {\r
+       asserta(PathStr != 0);\r
+       const unsigned NeededBytes = unsigned(strlen(PathStr)) + 1;\r
+       Alloc(NeededBytes);\r
+       // strcpy returns its destination, i.e. Front.\r
+       Start = strcpy(Front, PathStr);\r
+       }\r
+\r
+void LogPathStats()\r
+       {\r
+       Log("\n");\r
+       unsigned Bytes = 0;\r
+       for (unsigned i = 0; i < g_PathBufferSize; ++i)\r
+               {\r
+               const PathBuffer *PB = g_PathBuffers[i];\r
+               Bytes += PB->Size;\r
+               }\r
+       Log("%u paths allocated, total memory %u bytes\n", g_PathBufferSize, Bytes);\r
+       }\r
diff --git a/uchime_src/path.h b/uchime_src/path.h

new file mode 100644 (file)

index 0000000..f63be7e
--- /dev/null
+++ b/uchime_src/path.h
@@ -0,0 +1,63 @@
+#ifndef path_h\r
+#define path_h\r
+\r
+struct PathData\r
+       {\r
+private:\r
+       PathData(PathData &);\r
+       PathData &operator=(PathData &);\r
+\r
+public:\r
+       char *Start;\r
+       char *Front;\r
+       char *Back;\r
+       unsigned Bytes;\r
+\r
+public:\r
+       PathData()\r
+               {\r
+               Clear(true);\r
+               }\r
+       ~PathData()\r
+               {\r
+               Free();\r
+               }\r
+       void Free();\r
+       void Alloc(unsigned MaxLen);\r
+       void Clear(bool ctor = false)\r
+               {\r
+               Start = 0;\r
+               if (ctor)\r
+                       {\r
+                       Front = 0;\r
+                       Back = 0;\r
+                       Bytes = 0;\r
+                       }\r
+               else\r
+                       Free();\r
+               }\r
+       void Copy(const PathData &rhs);\r
+       void FromStr(const char *PathStr);\r
+       void Reverse()\r
+               {\r
+               asserta(Start != 0);\r
+               unsigned L = (unsigned) strlen(Start);\r
+               for (unsigned k = 0; k < L/2; ++k)\r
+                       {\r
+                       char c = Start[k];\r
+                       Start[k] = Start[L-k-1];\r
+                       Start[L-k-1] = c;\r
+                       }\r
+               }\r
+       void SetEmpty()\r
+               {\r
+               Start = 0;\r
+               }\r
+\r
+       bool IsEmpty() const\r
+               {\r
+               return Start == 0;\r
+               }\r
+       };\r
+\r
+#endif // path_h\r
diff --git a/uchime_src/searchchime.cpp b/uchime_src/searchchime.cpp

new file mode 100644 (file)

index 0000000..c00a9c4
--- /dev/null
+++ b/uchime_src/searchchime.cpp
@@ -0,0 +1,304 @@
+#include "myutils.h"\r
+#include "ultra.h"\r
+#include "chime.h"\r
+#include "uc.h"\r
+#include "dp.h"\r
+#include <set>\r
+#include <algorithm>\r
+\r
+#define TRACE  0\r
+\r
+extern FILE *g_fUChime;\r
+\r
+void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
+  vector<unsigned> &Parents);\r
+\r
+void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
+  const string &PathQA, const string &PathQB, ChimeHit2 &Hit);\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);\r
+\r
+// Fills IdVec with one entry per query position (QSD.L entries) giving a\r
+// windowed count of query positions whose letter equals the aligned parent\r
+// letter, ignoring case.\r
+//     Path letters: 'M' consumes one letter of both sequences, 'D' one\r
+//     query letter only, 'I' one parent letter only. Only 'M' columns can\r
+//     count as identical.\r
+//     d is the smoothing window. When the query is no longer than d, IdVec\r
+//     is all zeros.\r
+static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,\r
+  vector<unsigned> &IdVec, unsigned d)\r
+       {\r
+       IdVec.clear();\r
+       const unsigned ColCount = SIZE(Path);\r
+\r
+       const byte *Q = QSD.Seq;\r
+       const byte *P = PSD.Seq;\r
+\r
+       const unsigned QL = QSD.L;\r
+// PL is not used below.\r
+       const unsigned PL = PSD.L;\r
+\r
+       if (QL <= d)\r
+               {\r
+               IdVec.resize(QSD.L, 0);\r
+               return;\r
+               }\r
+\r
+       unsigned QPos = 0;\r
+       unsigned PPos = 0;\r
+\r
+// SameVec[q] is true when query position q sits in an 'M' column with an\r
+// identical parent letter; one entry is appended per query letter consumed.\r
+       vector<bool> SameVec;\r
+       SameVec.reserve(QL);\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               char c = Path[Col];\r
+\r
+               bool Same = false;\r
+               if (c == 'M')\r
+                       {\r
+                       byte q = Q[QPos];\r
+                       byte p = P[PPos];\r
+                       Same = (toupper(q) == toupper(p));\r
+                       }\r
+\r
+               if (c == 'M' || c == 'D')\r
+                       {\r
+                       ++QPos;\r
+                       SameVec.push_back(Same);\r
+                       }\r
+\r
+               if (c == 'M' || c == 'I')\r
+                       ++PPos;\r
+               }\r
+\r
+// The path must consume exactly the whole query.\r
+       asserta(SIZE(SameVec) == QL);\r
+\r
+// First d positions: running count of identities from position 0.\r
+       unsigned n = 0;\r
+       for (unsigned QPos = 0; QPos < d; ++QPos)\r
+               {\r
+               if (SameVec[QPos])\r
+                       ++n;\r
+               IdVec.push_back(n);\r
+               }\r
+\r
+// Remaining positions: the value stored counts identities in positions\r
+// QPos-d .. QPos (d+1 positions); the oldest one is dropped after storing.\r
+       for (unsigned QPos = d; QPos < QL; ++QPos)\r
+               {\r
+               if (SameVec[QPos])\r
+                       ++n;\r
+               IdVec.push_back(n);\r
+               if (SameVec[QPos-d])\r
+                       --n;\r
+               }\r
+       asserta(SIZE(IdVec) == QL);\r
+\r
+#if    TRACE\r
+       {\r
+       Log("\n");\r
+       Log("GetSmoothedIdVec\n");\r
+       unsigned QPos = 0;\r
+       unsigned PPos = 0;\r
+       Log("Q P  Same       Id\n");\r
+       Log("- -  ----  -------\n");\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               char c = Path[Col];\r
+\r
+               bool Same = false;\r
+               if (c == 'M')\r
+                       {\r
+                       byte q = Q[QPos];\r
+                       byte p = P[PPos];\r
+                       Same = (toupper(q) == toupper(p));\r
+                       Log("%c %c  %4c  %7d\n", q, p, tof(Same), IdVec[QPos]);\r
+                       }\r
+\r
+               if (c == 'M' || c == 'D')\r
+                       ++QPos;\r
+               if (c == 'M' || c == 'I')\r
+                       ++PPos;\r
+               }\r
+       }\r
+#endif\r
+       }\r
+\r
+// Searches for a chimeric explanation of query QSD.\r
+//     1. Collects candidate parents with GetCandidateParents(); fewer than\r
+//        two candidates ends the search.\r
+//     2. Globally aligns the query to every candidate, remembering each\r
+//        path, the best percent identity (TopPctId) and, per query\r
+//        position, the largest smoothed identity count (MaxIdVec).\r
+//     3. Picks up to opt_maxp parents, each time the one matching MaxIdVec\r
+//        at the most positions, then masks the positions it covers.\r
+//     4. Runs AlignChime() on every pair of picked parents; Hit receives\r
+//        the highest-scoring result.\r
+// Returns true when at least one pair's hit passes Accept().\r
+bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
+  const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
+  float MinFractId, ChimeHit2 &Hit)\r
+       {\r
+       Hit.Clear();\r
+       Hit.QLabel = QSD.Label;\r
+\r
+       if (opt_verbose)\r
+               {\r
+               Log("\n");\r
+               Log("SearchChime()\n");\r
+               Log("Query>%s\n", QSD.Label);\r
+               }\r
+\r
+       vector<unsigned> Parents;\r
+       GetCandidateParents(U, QSD, QAb, Parents);\r
+\r
+       unsigned ParentCount = SIZE(Parents);\r
+       if (ParentCount <= 1)\r
+               {\r
+               if (opt_verbose)\r
+                       Log("%u candidate parents, done.\n", ParentCount);\r
+               return false;\r
+               }\r
+\r
+       if (opt_fastalign)\r
+               HF.SetA(QSD);\r
+// NOTE(review): ptrHF is null when opt_fastalign is off, yet *ptrHF is\r
+// passed to GlobalAlign() below. Binding a reference to a null pointer is\r
+// undefined behavior -- confirm that GlobalAlign never touches it then.\r
+       HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);\r
+\r
+// The chunk data computed here is not read again in this function.\r
+       unsigned ChunkLength;\r
+       vector<unsigned> ChunkLos;\r
+       GetChunkInfo(QSD.L, ChunkLength, ChunkLos);\r
+       const unsigned ChunkCount = SIZE(ChunkLos);\r
+\r
+       vector<unsigned> ChunkIndexToBestId(ChunkCount, 0);\r
+       vector<unsigned> ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX);\r
+\r
+// PSDs and Paths are indexed by ParentIndex; a parent that is skipped gets\r
+// an empty path so the two vectors stay aligned.\r
+       vector<SeqData> PSDs;\r
+       vector<string> Paths;\r
+       double TopPctId = 0.0;\r
+       unsigned TopParentIndex = UINT_MAX;\r
+       unsigned QL = QSD.L;\r
+       vector<unsigned> MaxIdVec(QL, 0);\r
+       for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
+               {\r
+               unsigned ParentSeqIndex = Parents[ParentIndex];\r
+\r
+               SeqData PSD;\r
+               //PSD.Label = U.GetSeedLabel(ParentSeqIndex);\r
+               //PSD.Seq = U.GetSeedSeq(ParentSeqIndex);\r
+               //PSD.L = U.GetSeedLength(ParentSeqIndex);\r
+               //PSD.Index = ParentSeqIndex;\r
+               U.GetSeqData(ParentSeqIndex, PSD);\r
+               PSDs.push_back(PSD);\r
+\r
+               if (opt_fastalign)\r
+                       HF.SetB(PSD);\r
+\r
+               PathData PD;\r
+\r
+               float HSPId;\r
+               bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);\r
+               if (!Found)\r
+                       {\r
+                       Paths.push_back("");                            \r
+                       continue;\r
+                       }\r
+\r
+               double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);\r
+// With opt_selfid set, a parent identical to the query is ignored.\r
+               if (opt_selfid && PctId == 100.0)\r
+                       {\r
+                       Paths.push_back("");                            \r
+                       continue;\r
+                       }\r
+\r
+               if (PctId > TopPctId)\r
+                       {\r
+                       TopParentIndex = ParentIndex;\r
+                       TopPctId = PctId;\r
+                       if (TopPctId >= 100.0 - opt_mindiv)\r
+                               {\r
+// NOTE(review): the early return sits inside the opt_verbose block, so the\r
+// search stops here only in verbose mode and the result depends on\r
+// logging. Looks unintended, but moving the return would change\r
+// non-verbose results -- confirm the intended behavior before changing.\r
+                               if (opt_verbose)\r
+                                       {\r
+                                       Log("  %.1f%%  >%s\n", TopPctId, PSD.Label);\r
+                                       Log("  Top hit exceeds ctl threshold, done.\n");\r
+                                       return false;\r
+                                       }\r
+                               }\r
+                       }\r
+\r
+               string Path = PD.Start;\r
+               Paths.push_back(Path);\r
+\r
+               vector<unsigned> IdVec;\r
+               GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+\r
+// Per-position maximum over all usable parents.\r
+               for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+                       if (IdVec[QPos] > MaxIdVec[QPos])\r
+                               MaxIdVec[QPos] = IdVec[QPos];\r
+               }\r
+\r
+// Greedy selection: each round takes the parent that equals MaxIdVec at the\r
+// most positions, then sets those positions to UINT_MAX so they cannot be\r
+// matched again in later rounds.\r
+       vector<unsigned> BestParents;\r
+       for (unsigned k = 0; k < opt_maxp; ++k)\r
+               {\r
+               unsigned BestParent = UINT_MAX;\r
+               unsigned BestCov = 0;\r
+               for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
+                       {\r
+                       const SeqData &PSD = PSDs[ParentIndex];\r
+                       const string &Path = Paths[ParentIndex];\r
+                       if (Path == "")\r
+                               continue;\r
+\r
+                       vector<unsigned> IdVec;\r
+                       GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+\r
+                       unsigned Cov = 0;\r
+                       for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+                               if (IdVec[QPos] == MaxIdVec[QPos])\r
+                                       ++Cov;\r
+\r
+                       if (Cov > BestCov)\r
+                               {\r
+                               BestParent = ParentIndex;\r
+                               BestCov = Cov;\r
+                               }\r
+                       }\r
+\r
+// No parent covers any remaining position.\r
+               if (BestParent == UINT_MAX)\r
+                       break;\r
+\r
+               BestParents.push_back(BestParent);\r
+               vector<unsigned> IdVec;\r
+\r
+               const SeqData &PSD = PSDs[BestParent];\r
+               const string &Path = Paths[BestParent];\r
+               GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+               for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+                       if (IdVec[QPos] == MaxIdVec[QPos])\r
+                               MaxIdVec[QPos] = UINT_MAX;\r
+               }\r
+\r
+       unsigned BestParentCount = SIZE(BestParents);\r
+\r
+       if (opt_verbose)\r
+               {\r
+               Log("%u/%u best parents\n", BestParentCount, ParentCount);\r
+               for (unsigned k = 0; k < BestParentCount; ++k)\r
+                       {\r
+                       unsigned i = BestParents[k];\r
+                       Log(" %s\n", PSDs[i].Label);\r
+                       }\r
+               }\r
+\r
+// Evaluate every unordered pair of selected parents.\r
+       bool Found = false;\r
+       for (unsigned k1 = 0; k1 < BestParentCount; ++k1)\r
+               {\r
+               unsigned i1 = BestParents[k1];\r
+               asserta(i1 < ParentCount);\r
+\r
+               const SeqData &PSD1 = PSDs[i1];\r
+               const string &Path1 = Paths[i1];\r
+\r
+               for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)\r
+                       {\r
+                       unsigned i2 = BestParents[k2];\r
+                       asserta(i2 < ParentCount);\r
+                       asserta(i2 != i1);\r
+\r
+                       const SeqData &PSD2 = PSDs[i2];\r
+                       const string &Path2 = Paths[i2];\r
+\r
+                       ChimeHit2 Hit2;\r
+                       AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);\r
+                       Hit2.PctIdQT = TopPctId;\r
+\r
+                       if (Hit2.Accept())\r
+                               Found = true;\r
+\r
+// Keep the highest-scoring hit, whether or not it was accepted.\r
+                       if (Hit2.Score > Hit.Score)\r
+                               Hit = Hit2;\r
+\r
+                       if (opt_verbose)\r
+                               Hit2.LogMe();\r
+                       }\r
+               }\r
+\r
+       return Found;\r
+       }\r
diff --git a/uchime_src/seq.h b/uchime_src/seq.h

new file mode 100644 (file)

index 0000000..9014641
--- /dev/null
+++ b/uchime_src/seq.h
@@ -0,0 +1,38 @@
+#ifndef seq_h\r
+#define seq_h\r
+\r
+struct ORFData;\r
+\r
+// Plain descriptor for one sequence. It stores raw pointers to the label\r
+// and letters and has no destructor, so it never frees what it points to.\r
+struct SeqData\r
+       {\r
+// Label text; zero after Clear().\r
+       const char *Label;\r
+// Sequence letters; zero after Clear().\r
+       const byte *Seq;\r
+// Number of letters in Seq.\r
+       unsigned L;\r
+// Index assigned by whoever filled the struct; UINT_MAX after Clear().\r
+       unsigned Index;\r
+\r
+// RevComp means that SeqData.Seq is reverse-complemented relative\r
+// to the sequence in the input file (query or db). Coordinates in\r
+// a hit (e.g., AlnData) will be relative to SeqData.Seq, so both\r
+// the sequence and the coordinates should be r.c.'d for output.\r
+       bool RevComp;\r
+// Nucleotide flag, copied from the reader/database that filled the struct.\r
+       bool Nucleo;\r
+// NOTE(review): presumably the ORF this sequence was derived from; the\r
+// fillers visible in this change always set it to zero -- confirm.\r
+       const ORFData *ORFParent;\r
+\r
+// Construct in the cleared state.\r
+       SeqData()\r
+               {\r
+               Clear();\r
+               }\r
+\r
+// Reset every field to its "empty" value; nothing is freed.\r
+       void Clear()\r
+               {\r
+               Label = 0;\r
+               Seq = 0;\r
+               L = 0;\r
+               Index = UINT_MAX;\r
+               RevComp = false;\r
+               Nucleo = false;\r
+               ORFParent = 0;\r
+               }\r
+       };\r
+\r
+#endif // seq_h\r
diff --git a/uchime_src/seqdb.cpp b/uchime_src/seqdb.cpp

new file mode 100644 (file)

index 0000000..03de189
--- /dev/null
+++ b/uchime_src/seqdb.cpp
@@ -0,0 +1,289 @@
+#include "myutils.h"\r
+#include "seqdb.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+#include "sfasta.h"\r
+#include "seq.h"\r
+\r
+void SeqToFasta(FILE *f, const char *Label, const byte *Seq, unsigned L)\r
+       {\r
+       const unsigned ROWLEN = 80;\r
+       if (Label != 0)\r
+               fprintf(f, ">%s\n", Label);\r
+       unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;\r
+       for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)\r
+               {\r
+               unsigned From = BlockIndex*ROWLEN;\r
+               unsigned To = From + ROWLEN;\r
+               if (To >= L)\r
+                       To = L;\r
+               for (unsigned Pos = From; Pos < To; ++Pos)\r
+                       fputc(Seq[Pos], f);\r
+               fputc('\n', f);\r
+               }\r
+       }\r
+\r
+// Release every stored label and sequence and the index arrays via Clear().\r
+SeqDB::~SeqDB()\r
+       {\r
+       Clear();\r
+       }\r
+\r
+// Construct an empty database. Clear(true) skips the free step because\r
+// the pointer members have not been initialized yet.\r
+SeqDB::SeqDB()\r
+       {\r
+       Clear(true);\r
+       }\r
+\r
+void SeqDB::Clear(bool ctor)\r
+       {\r
+       if (!ctor)\r
+               {\r
+               for (unsigned i = 0; i < m_SeqCount; ++i)\r
+                       {\r
+                       unsigned n = strlen(m_Labels[i]);\r
+                       MYFREE(m_Labels[i], n, SeqDB);\r
+                       MYFREE(m_Seqs[i], m_SeqLengths[i], SeqDB);\r
+                       }\r
+               MYFREE(m_Labels, m_Size, SeqDB);\r
+               MYFREE(m_Seqs, m_Size, SeqDB);\r
+               MYFREE(m_SeqLengths, m_Size, SeqDB);\r
+               }\r
+\r
+       m_FileName.clear();\r
+       m_SeqCount = 0;\r
+       m_Size = 0;\r
+\r
+       m_Labels = 0;\r
+       m_Seqs = 0;\r
+       m_SeqLengths = 0;\r
+\r
+       m_Aligned = false;\r
+       m_IsNucleo = false;\r
+       m_IsNucleoSet = false;\r
+       }\r
+\r
+// Empty the database and set the nucleotide flag explicitly to Nucleo,\r
+// marking it as set so IsNucleo() can be called without SetIsNucleo().\r
+void SeqDB::InitEmpty(bool Nucleo)\r
+       {\r
+       Clear();\r
+       m_IsNucleo = Nucleo;\r
+       m_IsNucleoSet = true;\r
+       }\r
+\r
+void SeqDB::FromFasta(const string &FileName, bool AllowGaps)\r
+       {\r
+       Clear();\r
+       m_FileName = FileName;\r
+       SFasta SF;\r
+\r
+       SF.Open(FileName);\r
+       SF.m_AllowGaps = AllowGaps;\r
+\r
+       ProgressStep(0, 1000, "Reading %s", FileName.c_str());\r
+       for (;;)\r
+               {\r
+               unsigned QueryPctDoneX10 = SF.GetPctDoneX10();\r
+               ProgressStep(QueryPctDoneX10, 1000, "Reading %s", FileName.c_str());\r
+               const byte *Seq = SF.GetNextSeq();\r
+               if (Seq == 0)\r
+                       break;\r
+\r
+               const char *Label = SF.GetLabel();\r
+               unsigned L = SF.GetSeqLength();\r
+               AddSeq(Label, Seq, L);\r
+               }\r
+       ProgressStep(999, 1000, "Reading %s", FileName.c_str());\r
+\r
+       SetIsNucleo();\r
+\r
+       Progress("%s sequences\n", IntToStr(GetSeqCount()));\r
+       }\r
+\r
+// Write every sequence, in index order, to the file that\r
+// CreateStdioFile(FileName) opens, then close it.\r
+void SeqDB::ToFasta(const string &FileName) const\r
+       {\r
+       FILE *Out = CreateStdioFile(FileName);\r
+       unsigned SeqIndex = 0;\r
+       while (SeqIndex < GetSeqCount())\r
+               {\r
+               ToFasta(Out, SeqIndex);\r
+               ++SeqIndex;\r
+               }\r
+       CloseStdioFile(Out);\r
+       }\r
+\r
+void SeqDB::SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel) const\r
+       {\r
+       if (WithLabel)\r
+               fprintf(f, ">%s\n", GetLabel(SeqIndex));\r
+\r
+       const unsigned ROWLEN = 80;\r
+\r
+       unsigned L = GetSeqLength(SeqIndex);\r
+       const byte *Seq = GetSeq(SeqIndex);\r
+       unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;\r
+       for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)\r
+               {\r
+               unsigned From = BlockIndex*ROWLEN;\r
+               unsigned To = From + ROWLEN;\r
+               if (To >= L)\r
+                       To = L;\r
+               for (unsigned Pos = From; Pos < To; ++Pos)\r
+                       fputc(Seq[Pos], f);\r
+               fputc('\n', f);\r
+               }\r
+       }\r
+\r
+// Write sequence SeqIndex to f as a complete FASTA record: the header\r
+// line here, then the wrapped letters via SeqToFasta (which is called\r
+// with its default WithLabel=false, so the header is not repeated).\r
+void SeqDB::ToFasta(FILE *f, unsigned SeqIndex) const\r
+       {\r
+       asserta(SeqIndex < m_SeqCount);\r
+       fprintf(f, ">%s\n", GetLabel(SeqIndex));\r
+       SeqToFasta(f, SeqIndex);\r
+       }\r
+\r
+unsigned SeqDB::GetMaxLabelLength() const\r
+       {\r
+       const unsigned SeqCount = GetSeqCount();\r
+       unsigned MaxL = 0;\r
+       for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+               {\r
+               unsigned L = (unsigned) strlen(m_Labels[Index]);\r
+               if (L > MaxL)\r
+                       MaxL = L;\r
+               }\r
+       return MaxL;\r
+       }\r
+\r
+unsigned SeqDB::GetMaxSeqLength() const\r
+       {\r
+       const unsigned SeqCount = GetSeqCount();\r
+       unsigned MaxL = 0;\r
+       for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+               {\r
+               unsigned L = m_SeqLengths[Index];\r
+               if (L > MaxL)\r
+                       MaxL = L;\r
+               }\r
+       return MaxL;\r
+       }\r
+\r
+// Dump the database to the log: a summary line, then one row per\r
+// sequence with index, label (truncated/padded to 16 characters),\r
+// length and the full sequence.\r
+void SeqDB::LogMe() const\r
+       {\r
+       Log("\n");\r
+       const unsigned SeqCount = GetSeqCount();\r
+       Log("SeqDB %u seqs, aligned=%c\n", SeqCount, tof(m_Aligned));\r
+       if (SeqCount == 0)\r
+               return;\r
+\r
+       Log("Index             Label  Length  Seq\n");\r
+       Log("-----  ----------------  ------  ---\n");\r
+       for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+               {\r
+               Log("%5u", Index);\r
+               Log("  %16.16s", m_Labels[Index]);\r
+               unsigned L = m_SeqLengths[Index];\r
+               Log("  %6u", L);\r
+       // Stored sequences are not NUL-terminated (AddSeq copies exactly L\r
+       // bytes), so the precision is what bounds the read.\r
+               Log("  %*.*s", L, L, m_Seqs[Index]);\r
+               Log("\n");\r
+               }\r
+       }\r
+\r
+// Fill Buffer with a description of sequence Id. The Seq and Label\r
+// pointers refer to storage owned by this database; nothing is copied.\r
+// IsNucleo() asserts that the nucleotide flag has been set.\r
+void SeqDB::GetSeqData(unsigned Id, SeqData &Buffer) const\r
+       {\r
+       asserta(Id < m_SeqCount);\r
+       Buffer.Seq = m_Seqs[Id];\r
+       Buffer.Label = m_Labels[Id];\r
+       Buffer.L = m_SeqLengths[Id];\r
+       Buffer.Index = Id;\r
+       Buffer.ORFParent = 0;\r
+       Buffer.RevComp = false;\r
+       Buffer.Nucleo = IsNucleo();\r
+       }\r
+\r
+void SeqDB::SetIsNucleo()\r
+       {\r
+       const unsigned SeqCount = GetSeqCount();\r
+       unsigned N = 0;\r
+       for (unsigned i = 0; i < 100; ++i)\r
+               {\r
+               unsigned SeqIndex = unsigned(rand()%SeqCount);\r
+               const byte *Seq = GetSeq(SeqIndex);\r
+               unsigned L = GetSeqLength(SeqIndex);\r
+               const unsigned Pos = unsigned(rand()%L);\r
+               byte c = Seq[Pos];\r
+\r
+               if (g_IsNucleoChar[c])\r
+                       ++N;\r
+               }\r
+       m_IsNucleo = (N > 80);\r
+       m_IsNucleoSet = true;\r
+       }\r
+\r
+unsigned SeqDB::GetTotalLength() const\r
+       {\r
+       const unsigned SeqCount = GetSeqCount();\r
+       unsigned TotalLength = 0;\r
+       for (unsigned Id = 0; Id < SeqCount; ++Id)\r
+               TotalLength += GetSeqLength(Id);\r
+       return TotalLength;\r
+       }\r
+\r
+// Append a copy of (Label, Seq[0..L-1]) to the database and return its\r
+// index. The index arrays grow by a factor of 1.5 plus 1024 slots when\r
+// full. m_Aligned stays true only while every added sequence has the\r
+// same length as the first one.\r
+unsigned SeqDB::AddSeq(const char *Label, const byte *Seq, unsigned L)\r
+       {\r
+       StartTimer(AddSeq);\r
+       if (m_SeqCount >= m_Size)\r
+               {\r
+       // Grow: allocate larger arrays and move the existing pointers\r
+       // and lengths across (the strings themselves are not copied).\r
+               unsigned NewSize = unsigned(m_Size*1.5) + 1024;\r
+               char **NewLabels = MYALLOC(char *, NewSize, SeqDB);\r
+               byte **NewSeqs = MYALLOC(byte *, NewSize, SeqDB);\r
+               unsigned *NewSeqLengths = MYALLOC(unsigned, NewSize, SeqDB);\r
+\r
+               for (unsigned i = 0; i < m_SeqCount; ++i)\r
+                       {\r
+                       NewLabels[i] = m_Labels[i];\r
+                       NewSeqs[i] = m_Seqs[i];\r
+                       NewSeqLengths[i] = m_SeqLengths[i];\r
+                       }\r
+\r
+       // m_SeqCount equals m_Size whenever this branch runs, so the size\r
+       // passed here matches the size the arrays were allocated with.\r
+               MYFREE(m_Labels, m_SeqCount, SeqDB);\r
+               MYFREE(m_Seqs, m_SeqCount, SeqDB);\r
+               MYFREE(m_SeqLengths, m_SeqCount, SeqDB);\r
+\r
+               m_Labels = NewLabels;\r
+               m_Seqs = NewSeqs;\r
+               m_SeqLengths = NewSeqLengths;\r
+               m_Size = NewSize;\r
+               }\r
+\r
+// Sequence is stored as exactly L bytes, without a terminating NUL.\r
+       unsigned Index = m_SeqCount++;\r
+       m_Seqs[Index] = MYALLOC(byte, L, SeqDB);\r
+       memcpy(m_Seqs[Index], Seq, L);\r
+\r
+// Label is stored with its terminating NUL (n = strlen + 1).\r
+       unsigned n = strlen(Label) + 1;\r
+       m_Labels[Index] = MYALLOC(char, n, SeqDB);\r
+       memcpy(m_Labels[Index], Label, n);\r
+\r
+       if (Index == 0)\r
+               m_Aligned = true;\r
+       else\r
+               m_Aligned = (m_Aligned && L == m_SeqLengths[0]);\r
+\r
+       m_SeqLengths[Index] = L;\r
+\r
+       EndTimer(AddSeq);\r
+       return Index;\r
+       }\r
+\r
+unsigned SeqDB::GetIndex(const char *Label) const\r
+       {\r
+       for (unsigned i = 0; i < m_SeqCount; ++i)\r
+               if (strcmp(Label, m_Labels[i]) == 0)\r
+                       return i;\r
+       Die("SeqDB::GetIndex(%s), not found", Label);\r
+       return UINT_MAX;\r
+       }\r
+\r
+void SeqDB::MakeLabelToIndex(map<string, unsigned> &LabelToIndex)\r
+       {\r
+       LabelToIndex.clear();\r
+       for (unsigned i = 0; i < m_SeqCount; ++i)\r
+               {\r
+               const string &Label = string(GetLabel(i));\r
+               if (LabelToIndex.find(Label) != LabelToIndex.end())\r
+                       Die("Duplicate label: %s", Label.c_str());\r
+               LabelToIndex[Label] = i;\r
+               }\r
+       }\r
diff --git a/uchime_src/seqdb.h b/uchime_src/seqdb.h

new file mode 100644 (file)

index 0000000..e4af984
--- /dev/null
+++ b/uchime_src/seqdb.h
@@ -0,0 +1,108 @@
+#ifndef seqdb_h\r
+#define seqdb_h\r
+\r
+#include <vector>\r
+#include <map>\r
+\r
+struct SeqData;\r
+\r
+using namespace std;\r
+\r
+// In-memory collection of labelled sequences. The database owns a copy\r
+// of every label and sequence added through AddSeq().\r
+struct SeqDB\r
+       {\r
+private:\r
+// Declared private and not defined in this header; presumably to forbid\r
+// copying, since the destructor frees the raw pointer members.\r
+       SeqDB(const SeqDB &rhs);\r
+       SeqDB &operator=(const SeqDB &rhs);\r
+\r
+public:\r
+// File the database was read from (set by FromFasta).\r
+       string m_FileName;\r
+// Parallel arrays with m_Size allocated slots, of which the first\r
+// m_SeqCount are in use.\r
+       char **m_Labels;\r
+       byte **m_Seqs;\r
+       unsigned *m_SeqLengths;\r
+       unsigned m_SeqCount;\r
+       unsigned m_Size;\r
+\r
+// True while all stored sequences have the same length.\r
+       bool m_Aligned;\r
+// Nucleotide flag; only meaningful once m_IsNucleoSet is true.\r
+       bool m_IsNucleo;\r
+       bool m_IsNucleoSet;\r
+\r
+public:\r
+       SeqDB();\r
+       ~SeqDB();\r
+       void Clear(bool ctor = false);\r
+       void InitEmpty(bool Nucleo);\r
+\r
+// Copies Label and Seq into the database; returns the new index.\r
+       unsigned AddSeq(const char *Label, const byte *Seq, unsigned L);\r
+\r
+// Letters of sequence SeqIndex (not NUL-terminated); asserts on range.\r
+       byte *GetSeq(unsigned SeqIndex) const\r
+               {\r
+               asserta(SeqIndex < m_SeqCount);\r
+               return m_Seqs[SeqIndex];\r
+               }\r
+\r
+// Label of sequence SeqIndex; asserts on range.\r
+       const char *GetLabel(unsigned SeqIndex) const\r
+               {\r
+               asserta(SeqIndex < m_SeqCount);\r
+               return m_Labels[SeqIndex];\r
+               }\r
+\r
+// Length of sequence SeqIndex; asserts on range.\r
+       unsigned GetSeqLength(unsigned SeqIndex) const\r
+               {\r
+               asserta(SeqIndex < m_SeqCount);\r
+               return m_SeqLengths[SeqIndex];\r
+               }\r
+\r
+       unsigned GetSeqCount() const\r
+               {\r
+               return m_SeqCount;\r
+               }\r
+\r
+// Number of unordered pairs of distinct sequences, n*(n-1)/2.\r
+       unsigned GetPairCount() const\r
+               {\r
+               unsigned SeqCount = GetSeqCount();\r
+               return (SeqCount*(SeqCount - 1))/2;\r
+               }\r
+\r
+// Index of the unordered pair in a lower-triangular layout: with\r
+// hi = larger index and lo = smaller, the result is hi*(hi-1)/2 + lo.\r
+       unsigned GetPairIndex(unsigned SeqIndex1, unsigned SeqIndex2) const\r
+               {\r
+               if (SeqIndex1 > SeqIndex2)\r
+                       return (SeqIndex1*(SeqIndex1 - 1))/2 + SeqIndex2;\r
+               return (SeqIndex2*(SeqIndex2 - 1))/2 + SeqIndex1;\r
+               }\r
+\r
+// Common sequence length; calls Die() when the database is not aligned\r
+// or is empty.\r
+       unsigned GetColCount() const\r
+               {\r
+               if (!m_Aligned)\r
+                       Die("SeqDB::GetColCount, not aligned");\r
+               if (m_SeqCount == 0)\r
+                       Die("SeqDB::GetColCount, empty");\r
+               return m_SeqLengths[0];\r
+               }\r
+\r
+// Nucleotide flag; asserts that it has been set.\r
+       bool IsNucleo() const\r
+               {\r
+               asserta(m_IsNucleoSet);\r
+               return m_IsNucleo;\r
+               }\r
+\r
+       void GetSeqData(unsigned Id, SeqData &Buffer) const;\r
+\r
+       unsigned GetMaxLabelLength() const;\r
+       unsigned GetMaxSeqLength() const;\r
+       void SetIsNucleo();\r
+       unsigned GetIndex(const char *Label) const;\r
+       void MakeLabelToIndex(map<string, unsigned> &LabelToIndex);\r
+\r
+       void LogMe() const;\r
+       void FromFasta(const string &FileName, bool AllowGaps = false);\r
+\r
+       void ToFasta(const string &FileName) const;\r
+       void ToFasta(FILE *f, unsigned SeqIndex) const;\r
+       void SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel = false) const;\r
+\r
+       unsigned GetTotalLength() const;\r
+       };\r
+\r
+bool isgap(byte c);\r
+\r
+#endif\r
diff --git a/uchime_src/setnucmx.cpp b/uchime_src/setnucmx.cpp

new file mode 100644 (file)

index 0000000..030ff5a
--- /dev/null
+++ b/uchime_src/setnucmx.cpp
@@ -0,0 +1,77 @@
+#include "myutils.h"
+#include "mx.h"
+
+Mx<float> g_SubstMxf;
+float **g_SubstMx;
+
+static const char Alphabet[] = "ACGTU";
+
+void SetNucSubstMx(double Match, double Mismatch)\r
+       {\r
+       static bool Done = false;\r
+       if (Done)\r
+               return;\r
+       Done = true;\r
+\r
+       if (Match <= 0.0)\r
+               Die("Match score should be +ve");\r
+       if (Mismatch >= 0.0)\r
+               Die("Mismatch score should be -ve");\r
+\r
+       unsigned N = unsigned(strlen(Alphabet));\r
+\r
+       g_SubstMxf.Alloc("NUCMX", 256, 256);\r
+       strcpy(g_SubstMxf.m_Alpha, "ACGT");\r
+       g_SubstMxf.Init(0);\r
+       g_SubstMx = g_SubstMxf.GetData();\r
+       for (unsigned i = 0; i < N; ++i)\r
+               {\r
+               for (unsigned j = 0; j < N; ++j)\r
+                       {\r
+                       float v = float(i == j ? Match : Mismatch);\r
+\r
+                       byte ui = (byte) toupper(Alphabet[i]);\r
+                       byte uj = (byte) toupper(Alphabet[j]);\r
+                       byte li = (byte) tolower(ui);\r
+                       byte lj = (byte) tolower(uj);\r
+                       ui = (byte) toupper(ui);\r
+                       uj = (byte) toupper(uj);\r
+\r
+                       g_SubstMx[ui][uj] = v;\r
+                       g_SubstMx[uj][ui] = v;\r
+\r
+                       g_SubstMx[ui][lj] = v;\r
+                       g_SubstMx[uj][li] = v;\r
+\r
+                       g_SubstMx[li][uj] = v;\r
+                       g_SubstMx[lj][ui] = v;\r
+\r
+                       g_SubstMx[li][lj] = v;\r
+                       g_SubstMx[lj][li] = v;\r
+                       }\r
+               }\r
+\r
+       for (unsigned j = 0; j < N; ++j)\r
+               {\r
+               float v = 0.0f;\r
+\r
+               byte ui = (byte) 'N';\r
+               byte uj = (byte) toupper(Alphabet[j]);\r
+               byte li = (byte) 'n';\r
+               byte lj = (byte) tolower(uj);\r
+               ui = (byte) toupper(ui);\r
+               uj = (byte) toupper(uj);\r
+\r
+               g_SubstMx[ui][uj] = v;\r
+               g_SubstMx[uj][ui] = v;\r
+\r
+               g_SubstMx[ui][lj] = v;\r
+               g_SubstMx[uj][li] = v;\r
+\r
+               g_SubstMx[li][uj] = v;\r
+               g_SubstMx[lj][ui] = v;\r
+\r
+               g_SubstMx[li][lj] = v;\r
+               g_SubstMx[lj][li] = v;\r
+               }\r
+       }\r
diff --git a/uchime_src/sfasta.cpp b/uchime_src/sfasta.cpp

new file mode 100644 (file)

index 0000000..918d4f8
--- /dev/null
+++ b/uchime_src/sfasta.cpp
@@ -0,0 +1,467 @@
+#include "sfasta.h"\r
+#include "orf.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+\r
+// True when c is one of the two gap symbols, '-' or '.'.\r
+static inline bool isgap(byte c)\r
+       {\r
+       return c == '-' || c == '.';\r
+       }\r
+\r
+const unsigned BufferSize = 16*1024*1024;\r
+\r
+static unsigned GetMaxPoly(const byte *Seq, unsigned L)\r
+       {\r
+       byte CurrChar = Seq[0];\r
+       unsigned Start = 0;\r
+       unsigned MaxLen = 1;\r
+       for (unsigned i = 1; i < L; ++i)\r
+               {\r
+               char c = Seq[i];\r
+               if (c != CurrChar || i+1 == L)\r
+                       {\r
+                       unsigned Len = i - Start;\r
+                       if (Len > MaxLen)\r
+                               MaxLen = Len;\r
+                       CurrChar = c;\r
+                       Start = i;\r
+                       }\r
+               }\r
+       return MaxLen;\r
+       }\r
+\r
+SFasta::SFasta()\r
+       {\r
+       m_FileName = "";\r
+       m_File = 0;\r
+       m_Buffer = 0;\r
+       m_BufferSize = 0;\r
+       m_BufferOffset = 0;\r
+       m_BufferBytes = 0;\r
+       m_FilePos = 0;\r
+       m_FileSize = 0;\r
+       m_Label = 0;\r
+       m_SeqLength = 0;\r
+       m_TooShortCount = 0;\r
+       m_TooLongCount = 0;\r
+       m_ShortestLength = 0;\r
+       m_LongestLength = 0;\r
+       m_IsNucleo = false;\r
+       m_IsNucleoSet = false;\r
+       }\r
+\r
+// Free the cache buffer and close the file, if open, via Clear().\r
+SFasta::~SFasta()\r
+       {\r
+       Clear();\r
+       }\r
+\r
+// Return the reader to the closed state: free the cache buffer, close\r
+// the file if one is open, and reset all positions, counters and flags.\r
+void SFasta::Clear()\r
+       {\r
+       MYFREE(m_Buffer, m_BufferSize, SFasta);\r
+       if (m_File != 0)\r
+               CloseStdioFile(m_File);\r
+\r
+       m_FileName = "";\r
+       m_File = 0;\r
+       m_Buffer = 0;\r
+       m_BufferSize = 0;\r
+       m_BufferOffset = 0;\r
+       m_BufferBytes = 0;\r
+       m_FilePos = 0;\r
+       m_FileSize = 0;\r
+       m_Label = 0;\r
+       m_SeqLength = 0;\r
+// UINT_MAX means "no sequence served yet"; GetNextSeqLo turns it into 0.\r
+       m_SeqIndex = UINT_MAX;\r
+       m_AllowGaps = false;\r
+       m_IsNucleo = false;\r
+       m_IsNucleoSet = false;\r
+       m_TooShortCount = 0;\r
+       m_TooLongCount = 0;\r
+       m_ShortestLength = 0;\r
+       m_LongestLength = 0;\r
+       m_TooPolyCount = 0;\r
+       }\r
+\r
+void SFasta::LogMe() const\r
+       {\r
+       Log("\n");\r
+       Log("SFasta::LogMe()\n");\r
+       Log("FileName=%s\n", m_FileName.c_str());\r
+       Log("FileSize=%u\n", (unsigned) m_FileSize);\r
+       Log("FilePos=%u\n", (unsigned) m_FilePos);\r
+       Log("BufferSize=%u\n", m_BufferSize);\r
+       Log("BufferPos=%u\n", m_BufferOffset);\r
+       Log("BufferBytes=%u\n", m_BufferBytes);\r
+       if (m_Label == 0)\r
+               Log("Label=NULL\n");\r
+       else\r
+               Log("Label=%s\n", m_Label);\r
+       Log("SeqLength=%u\n", m_SeqLength);\r
+       }\r
+\r
+const byte *SFasta::GetNextSeq()\r
+       {\r
+       for (;;)\r
+               {\r
+               const byte *Seq = GetNextSeqLo();\r
+               if (Seq == 0)\r
+                       {\r
+                       if (m_TooShortCount > 0)\r
+                               Warning("%u short sequences (--minlen %u, shortest %u) discarded from %s",\r
+                                 m_TooShortCount, opt_minlen, m_ShortestLength, m_FileName.c_str());\r
+                       if (m_TooLongCount > 0)\r
+                               Warning("%u long sequences (--maxlen %u, longest %u) discarded from %s",\r
+                                 m_TooLongCount, opt_maxlen, m_LongestLength, m_FileName.c_str());\r
+                       if (m_TooPolyCount > 0)\r
+                               Warning("%u sequences with long homopolymers discarded (--maxpoly %u)",\r
+                                 m_TooPolyCount, opt_maxpoly);\r
+                       return 0;\r
+                       }\r
+               if (m_SeqLength < opt_minlen)\r
+                       {\r
+                       ++m_TooShortCount;\r
+                       if (m_ShortestLength == 0 || m_SeqLength < m_ShortestLength)\r
+                               m_ShortestLength = m_SeqLength;\r
+                       continue;\r
+                       }\r
+               if (m_SeqLength > opt_maxlen && opt_maxlen != 0)\r
+                       {\r
+                       if (m_LongestLength == 0 || m_SeqLength > m_LongestLength)\r
+                               m_LongestLength = m_SeqLength;\r
+                       ++m_TooLongCount;\r
+                       continue;\r
+                       }\r
+               return Seq;\r
+               }\r
+       }\r
+\r
+// Parse the next record from the cache, refilling the cache from the\r
+// file when it is exhausted. Returns a pointer to the sequence letters,\r
+// compacted in place inside m_Buffer and not NUL-terminated, or zero at\r
+// end-of-file. Sets m_Label (NUL-terminated in place), m_SeqLength and\r
+// m_SeqIndex. No length filtering is done here; see GetNextSeq().\r
+// Asserts if the label line has no end-of-line within the cached data.\r
+const byte *SFasta::GetNextSeqLo()\r
+       {\r
+// End of cache?\r
+       if (m_BufferOffset == m_BufferBytes)\r
+               {\r
+       // End of file?\r
+               if (m_FilePos == m_FileSize)\r
+                       return 0;\r
+               FillCache();\r
+               }\r
+\r
+       StartTimer(SF_GetNextSeq);\r
+       asserta(m_Buffer[m_BufferOffset] == '>');\r
+       m_Label = (char *) (m_Buffer + m_BufferOffset + 1);\r
+\r
+// Scan to the end of the label line. Only the first m_BufferBytes bytes\r
+// of the buffer hold data for this cache fill; anything beyond is stale,\r
+// so the scan must not look there for a line terminator.\r
+       byte *ptr = 0;\r
+       for (unsigned i = m_BufferOffset; i < m_BufferBytes; ++i)\r
+               {\r
+               char c = m_Buffer[i];\r
+               if (c == '\n' || c == '\r')\r
+                       {\r
+                       ptr = m_Buffer + i;\r
+                       break;\r
+                       }\r
+               }\r
+       asserta(ptr != 0);\r
+\r
+// Terminate the label in place. Both loops are guaranteed to stop at or\r
+// before the end-of-line found above.\r
+       if (opt_trunclabels)\r
+               {\r
+       // Cut the label at the first white space. The cast keeps\r
+       // isspace() defined for bytes >= 0x80 when char is signed.\r
+               for (char *p = m_Label; *p; ++p)\r
+                       if (isspace((unsigned char) *p))\r
+                               {\r
+                               *p = 0;\r
+                               break;\r
+                               }\r
+               }\r
+       else\r
+               {\r
+       // Keep the whole line, turning tabs into spaces.\r
+               for (char *p = m_Label; *p; ++p)\r
+                       {\r
+                       if (*p == '\t')\r
+                               *p = ' ';\r
+                       else if (*p == '\r' || *p == '\n')\r
+                               {\r
+                               *p = 0;\r
+                               break;\r
+                               }\r
+                       }\r
+               }\r
+\r
+// ptr points to end-of-line.\r
+// Move to start of sequence data.\r
+       byte *Seq = ++ptr;\r
+\r
+// Delete white space in-place\r
+       byte *To = ptr;\r
+       m_BufferOffset = (unsigned) (ptr - m_Buffer);\r
+       while (m_BufferOffset < m_BufferBytes)\r
+               {\r
+               byte c = m_Buffer[m_BufferOffset];\r
+       // A '>' at the start of a line begins the next record.\r
+               if (c == '>')\r
+                       {\r
+                       char prevc = '\n';\r
+                       if (m_BufferOffset > 0)\r
+                               prevc = m_Buffer[m_BufferOffset-1];\r
+                       if (prevc == '\n' || prevc == '\r')\r
+                               break;\r
+                       }\r
+               ++m_BufferOffset;\r
+               if (isalpha(c) || (isgap(c) && m_AllowGaps))\r
+                       *To++ = c;\r
+               else if (c == '\n' || c == '\r')\r
+                       continue;\r
+               else\r
+                       {\r
+               // Anything else is dropped; warn once per process.\r
+                       const char *Label = (m_Label == 0 ? "" : m_Label);\r
+                       static bool WarningDone = false;\r
+                       if (!WarningDone)\r
+                               {\r
+                               if (isgap(c))\r
+                                       Warning("Ignoring gaps in FASTA file '%s'",\r
+                                         m_FileName.c_str());\r
+                               else if (isprint(c))\r
+                                       Warning("Invalid FASTA file '%s', non-letter '%c' in sequence >%s",\r
+                                         m_FileName.c_str(), c, Label);\r
+                               else\r
+                                       Warning("Invalid FASTA file '%s', non-printing byte (hex %02x) in sequence >%s",\r
+                                         m_FileName.c_str(), c, Label);\r
+                               WarningDone = true;\r
+                               }\r
+                       continue;\r
+                       }\r
+               }\r
+       m_SeqLength = unsigned(To - Seq);\r
+\r
+// First record served gets index 0.\r
+       if (m_SeqIndex == UINT_MAX)\r
+               m_SeqIndex = 0;\r
+       else\r
+               ++m_SeqIndex;\r
+\r
+       EndTimer(SF_GetNextSeq);\r
+       return Seq;\r
+       }\r
+\r
+// Open FileName for reading: reset the reader, open the file, allocate\r
+// the BufferSize-byte cache and record the file size. No data is read\r
+// until the first GetNextSeq() call.\r
+void SFasta::Open(const string &FileName)\r
+       {\r
+       Clear();\r
+       m_FileName = FileName;\r
+       m_File = OpenStdioFile(FileName);\r
+       m_BufferSize = BufferSize;\r
+       //m_Buffer = myalloc<byte>(m_BufferSize);\r
+       m_Buffer = MYALLOC(byte, m_BufferSize, SFasta);\r
+       m_FileSize = GetStdioFileSize(m_File);\r
+       }\r
+\r
+// Discard the cached data and restart reading at the beginning of the\r
+// file. m_SeqIndex and the discard counters are left as they are.\r
+void SFasta::Rewind()\r
+       {\r
+       m_BufferOffset = 0;\r
+       m_BufferBytes = 0;\r
+       m_FilePos = 0;\r
+       }\r
+\r
+bool SFasta::SetIsNucleo()\r
+       {\r
+       if (m_FilePos != 0)\r
+               Die("SFasta::IsNucleo, not at BOF");\r
+\r
+       unsigned LetterCount = 0;\r
+       unsigned NucleoLetterCount = 0;\r
+       for (;;)\r
+               {\r
+               const byte *Seq = GetNextSeq();\r
+               if (Seq == 0)\r
+                       break;\r
+               unsigned L = GetSeqLength();\r
+               for (unsigned i = 0; i < L; ++i)\r
+                       if (g_IsNucleoChar[Seq[i]])\r
+                               ++NucleoLetterCount;\r
+               LetterCount += L;\r
+               if (LetterCount > 256)\r
+                       break;\r
+               }\r
+       Rewind();\r
+       if (LetterCount == 0)\r
+               {\r
+               m_IsNucleoSet = true;\r
+               m_IsNucleo = true;\r
+               return true;\r
+               }\r
+\r
+// Nucleo if more than 90% nucleo letters AGCTUN\r
+       m_IsNucleo = double(NucleoLetterCount)/LetterCount > 0.9;\r
+       m_IsNucleoSet = true;\r
+       return m_IsNucleo;\r
+       }\r
+\r
+// Load the next chunk of the file, starting at m_FilePos, into m_Buffer.\r
+// The chunk always starts with '>' (dies otherwise). Unless it reaches\r
+// end-of-file, the chunk is cut back to the last '>' that starts a line,\r
+// so the cache only ever holds complete records; the cut-off part is\r
+// read again by the next call. Dies if a single record does not fit in\r
+// the buffer.\r
+void SFasta::FillCache()\r
+       {\r
+       StartTimer(SF_FillCache);\r
+       asserta(m_FilePos < m_FileSize);\r
+\r
+// off_t may be larger type than unsigned, e.g. 64- vs. 32-bit.\r
+       off_t otBytesToRead = m_FileSize - m_FilePos;\r
+\r
+       bool FinalBuffer = true;\r
+       if (otBytesToRead > (off_t) m_BufferSize)\r
+               {\r
+               FinalBuffer = false;\r
+               otBytesToRead = m_BufferSize;\r
+               }\r
+\r
+       unsigned BytesToRead = unsigned(otBytesToRead);\r
+       asserta(BytesToRead > 0);\r
+       asserta(BytesToRead <= m_BufferSize);\r
+\r
+       SetStdioFilePos(m_File, m_FilePos);\r
+       ReadStdioFile(m_File, m_Buffer, BytesToRead);\r
+       if (m_Buffer[0] != '>')\r
+               {\r
+               if (m_FilePos == 0)\r
+                       Die("Input is not FASTA file");\r
+               else\r
+                       Die("SFasta::FillCache() failed, expected '>'");\r
+               }\r
+\r
+       m_BufferOffset = 0;\r
+\r
+// If last buffer in file, done\r
+       if (FinalBuffer)\r
+               {\r
+               m_BufferBytes = BytesToRead;\r
+               m_FilePos += BytesToRead;\r
+               EndTimer(SF_FillCache);\r
+               return;\r
+               }\r
+\r
+// If not last buffer, truncate any partial sequence\r
+// at end of buffer. Search backwards to find last '>'.\r
+       byte *ptr = m_Buffer + BytesToRead - 1;\r
+       while (ptr > m_Buffer)\r
+               {\r
+               if (ptr[0] == '>' && (ptr[-1] == '\n' || ptr[-1] == '\r'))\r
+                       break;\r
+               --ptr;\r
+               }\r
+\r
+// Reaching the start of the buffer means no later record start exists.\r
+       if (ptr == m_Buffer)\r
+               {\r
+               LogMe();\r
+               if (*ptr != '>')\r
+                       {\r
+       // No '>' found.\r
+       // This might technically be legal FASTA if the entire\r
+       // buffer is white space, but strange if not the last buffer\r
+       // in the file, so quit anyway.\r
+                       Die("Failed to find '>' (pos=%u, bytes=%u)",\r
+                         (unsigned) m_FilePos, BytesToRead);\r
+                       }\r
+               else\r
+                       {\r
+       // Entire buffer is one sequence which may be truncated.\r
+                       Die("Sequence too long (pos=%u, bytes=%u)",\r
+                         (unsigned) m_FilePos, BytesToRead);\r
+                       }\r
+               }\r
+\r
+       asserta(*ptr == '>');\r
+\r
+// Keep only the complete records; the next fill starts at this '>'.\r
+       m_BufferBytes = unsigned(ptr - m_Buffer);\r
+       m_FilePos += m_BufferBytes;\r
+\r
+       EndTimer(SF_FillCache);\r
+       }\r
+\r
+unsigned SFasta::GetPctDoneX10() const\r
+       {\r
+       if (m_FilePos == 0 || m_FileSize == 0)\r
+               return 0;\r
+\r
+       assert(m_FilePos >= (off_t) m_BufferBytes);\r
+       off_t BufferStart = m_FilePos - m_BufferBytes;\r
+       off_t BufferPos = BufferStart + m_BufferOffset;\r
+\r
+       unsigned iPctX10 = unsigned(10.0*double(BufferPos)*100.0/double(m_FileSize));\r
+       if (iPctX10 == 0)\r
+               return 1;\r
+       if (iPctX10 >= 999)\r
+               return 998;\r
+       return iPctX10;\r
+       }\r
+\r
+double SFasta::GetPctDone() const\r
+       {\r
+       if (m_FilePos == 0 || m_FileSize == 0)\r
+               return 0;\r
+\r
+       assert(m_FilePos >= (off_t) m_BufferBytes);\r
+       off_t BufferStart = m_FilePos - m_BufferBytes;\r
+       off_t BufferPos = BufferStart + m_BufferOffset;\r
+\r
+       return double(BufferPos)*100.0/double(m_FileSize);\r
+       }\r
+\r
+// Fetch the next sequence into SD; returns false at end-of-file, in\r
+// which case only SD.Seq (set to zero) has been modified. SD.Seq and\r
+// SD.Label point into the reader's cache rather than at copies.\r
+// GetIsNucleo() asserts that the nucleotide flag has been set, so\r
+// SetIsNucleo() must have been called first.\r
+bool SFasta::GetNextSD(SeqData &SD)\r
+       {\r
+       SD.Seq = GetNextSeq();\r
+       if (SD.Seq == 0)\r
+               return false;\r
+\r
+       SD.Label = GetLabel();\r
+       SD.L = GetSeqLength();\r
+       SD.Index = GetSeqIndex();\r
+       SD.ORFParent = 0;\r
+       SD.Nucleo = GetIsNucleo();\r
+       SD.RevComp = false;\r
+\r
+       return true;\r
+       }\r
+\r
+#if    TEST\r
+// Test driver (compiled only when TEST is set): read every sequence from\r
+// opt_input, optionally log each record, and report the number of\r
+// sequences and letters read.\r
+void TestSFasta()\r
+       {\r
+       SFasta SF;\r
+       SF.Open(opt_input);\r
+\r
+       if (opt_verbose)\r
+               {\r
+               Log("  Index   Length  Label\n");\r
+               Log("-------  -------  -----\n");\r
+               }\r
+\r
+       unsigned Index = 0;\r
+       unsigned SeqCount = 0;\r
+// Accumulated as double, so the total is not limited to unsigned range.\r
+       double LetterCount = 0.0;\r
+       ProgressStep(0, 1000, "Reading");\r
+       for (;;)\r
+               {\r
+               const byte *Seq = SF.GetNextSeq();\r
+               if (Seq == 0)\r
+                       break;\r
+               ProgressStep(SF.GetPctDoneX10(), 1000, "Reading");\r
+               const char *Label = SF.GetLabel();\r
+               unsigned L = SF.GetSeqLength();\r
+               ++SeqCount;\r
+               LetterCount += L;\r
+\r
+               if (opt_verbose)\r
+                       {\r
+                       Log(">%7u  %7u  '%s'\n", Index, L, Label);\r
+       // Seq is not NUL-terminated; the precision bounds the read.\r
+                       Log("+%7.7s  %7.7s  \"%*.*s\"\n", "", "", L, L, Seq);\r
+                       }\r
+\r
+               ++Index;\r
+               }\r
+       ProgressStep(999, 1000, "Reading");\r
+\r
+       Progress("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));\r
+       Log("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));\r
+       }\r
+#endif // TEST\r
diff --git a/uchime_src/sfasta.h b/uchime_src/sfasta.h

new file mode 100644 (file)

index 0000000..ed2f2ff
--- /dev/null
+++ b/uchime_src/sfasta.h
@@ -0,0 +1,93 @@
+#ifndef sfasta_h\r
+#define sfasta_h\r
+\r
+#include "myutils.h"\r
+#include "seq.h"\r
+\r
+// Callback signatures: each receives a read-only SeqData and returns nothing.\r
+// NOTE(review): presumably invoked at the start / end of processing one\r
+// sequence; no caller is visible in this header -- confirm.\r
+typedef void (*ON_START_XSEQ)(const SeqData &SD);\r
+typedef void (*ON_END_XSEQ)(const SeqData &SD);\r
+\r
+// Sequential reader for FASTA file format.\r
+// Serves sequences in file order to save memory.\r
+// Caches biggish chunks to compromise memory vs. speed.\r
+class SFasta\r
+       {\r
+public:\r
+// NOTE(review): presumably the name passed to Open() and its stdio handle;\r
+// the assignments live in sfasta.cpp -- confirm.\r
+       string m_FileName;\r
+       FILE *m_File;\r
+       bool m_AllowGaps;\r
+\r
+// Total size of the file; denominator of the percentage in GetPctDone().\r
+       off_t m_FileSize;\r
+\r
+// Position to start next read\r
+       off_t m_FilePos;\r
+\r
+// Cached data.\r
+       byte *m_Buffer;\r
+\r
+// Bytes allocated to m_Buffer\r
+       unsigned m_BufferSize;\r
+\r
+// Current position in buffer, normally points to '>'\r
+       unsigned m_BufferOffset;\r
+\r
+// File data in buffer <= m_BufferSize\r
+       unsigned m_BufferBytes;\r
+\r
+// Current label\r
+// Points into m_Buffer, not a separate buffer.\r
+       char *m_Label;\r
+\r
+// Current sequence length\r
+       unsigned m_SeqLength;\r
+\r
+// Current seq index\r
+       unsigned m_SeqIndex;\r
+\r
+// NOTE(review): these look like length limits and rejection tallies\r
+// maintained while reading; their updates are not visible here -- confirm.\r
+       unsigned m_ShortestLength;\r
+       unsigned m_LongestLength;\r
+       unsigned m_TooShortCount;\r
+       unsigned m_TooLongCount;\r
+       unsigned m_TooPolyCount;\r
+\r
+private:\r
+// GetIsNucleo() asserts m_IsNucleoSet before returning m_IsNucleo.\r
+       bool m_IsNucleoSet;\r
+       bool m_IsNucleo;\r
+\r
+public:\r
+       SFasta();\r
+       ~SFasta();\r
+\r
+       void Clear();\r
+       void Open(const string &FileName);\r
+       void Rewind();\r
+       bool SetIsNucleo();\r
+       bool GetIsNucleo() const { asserta(m_IsNucleoSet); return m_IsNucleo; };\r
+\r
+// Get next sequence.\r
+// Returns zero on end-of-file\r
+       const byte *GetNextSeq();\r
+\r
+// Get next sequence as SeqData object, return false on end-of-file.\r
+       bool GetNextSD(SeqData &SD);\r
+\r
+// Length of most recent sequence returned by GetNextSeq().\r
+       unsigned GetSeqLength() const { return m_SeqLength; }\r
+\r
+// Label of most recent sequence returned by GetNextSeq().\r
+       const char *GetLabel() const { return m_Label; }\r
+\r
+// Index of most recent sequence returned by GetNextSeq().\r
+       unsigned GetSeqIndex() const { return m_SeqIndex; }\r
+\r
+// Progress through the file; GetPctDone() is a percentage of m_FileSize.\r
+// NOTE(review): GetPctDoneX10 is used with a 0..1000 scale in TestSFasta --\r
+// presumably ten times the percentage; confirm in sfasta.cpp.\r
+       unsigned GetPctDoneX10() const;\r
+       double GetPctDone() const;\r
+\r
+       void LogMe() const;\r
+\r
+private:\r
+// Internal helpers; implementations are in sfasta.cpp.\r
+       void FillCache();\r
+       const byte *GetNextSeqLo();\r
+       };\r
+\r
+#endif // sfasta_h\r
diff --git a/uchime_src/svnmods.h b/uchime_src/svnmods.h

new file mode 100644 (file)

index 0000000..c68513e
--- /dev/null
+++ b/uchime_src/svnmods.h
@@ -0,0 +1,15 @@
+"Path: .\n"
+"URL: file:///public/svn/usearch\n"
+"Repository Root: file:///public/svn/usearch\n"
+"Repository UUID: 58640331-1837-4c17-bc3e-636dc59aced1\n"
+"Revision: 34\n"
+"Node Kind: directory\n"
+"Schedule: normal\n"
+"Last Changed Author: bob\n"
+"Last Changed Rev: 34\n"
+"Last Changed Date: 2011-05-01 08:29:04 -0700 (Sun, 01 May 2011)\n"
+"\n"
+"?       mk\n"
+"!       svnmods.h\n"
+"M       ungappedblastid.cpp\n"
+"M       chaindisjointhits.cpp\n"
diff --git a/uchime_src/svnversion.h b/uchime_src/svnversion.h

new file mode 100644 (file)

index 0000000..2a64d50
--- /dev/null
+++ b/uchime_src/svnversion.h
@@ -0,0 +1 @@
+"40"
diff --git a/uchime_src/timers.h b/uchime_src/timers.h

new file mode 100644 (file)

index 0000000..81cf7d1
--- /dev/null
+++ b/uchime_src/timers.h
@@ -0,0 +1,173 @@
+T(MxBase_Alloc)\r
+T(MxBase_FreeData)\r
+T(MxBase_AllocData)\r
+T(SortSeqIndexes)\r
+T(Alloc_Vectors)\r
+T(MainLoop_NotNW)\r
+T(WriteOutput)\r
+T(NWB)\r
+T(ReadAllStdioFile)\r
+T(Windex_Init)\r
+T(Windex_SetSeqIndex)\r
+T(SeqToWords)\r
+T(SeqToWordsStep)\r
+T(SeqToShortWords)\r
+T(SeqToShortWordsA)\r
+T(SeqToShortWordsB)\r
+T(GetFractIdB)\r
+T(Windex_UniqueWordsAlloc)\r
+T(Windex_UniqueWords)\r
+T(GetPctId)\r
+T(Windex_Reset)\r
+T(GetSig)\r
+T(NWEditDist)\r
+T(EditDist_Myers)\r
+T(EditDist_BlockTarget)\r
+T(NWBand)\r
+T(WordCounting)\r
+T(NWAff)\r
+T(NWAffBand)\r
+T(NWSimple)\r
+T(NWSimpleB)\r
+T(BandWrap)\r
+T(IncIdCounts)\r
+T(GetBestDiagB)\r
+T(GetBestDiagB1)\r
+T(GetBestDiagB2)\r
+T(ClusterInit)\r
+T(ClusterPrep)\r
+T(HotSort1)\r
+T(HotSort2)\r
+T(SortA)\r
+T(SortB)\r
+T(CountSort)\r
+T(AddWords)\r
+T(ClusterWindex)\r
+T(MainInit)\r
+T(Output)\r
+T(WindexTail)\r
+T(WindexExit)\r
+T(Sort)\r
+T(U_AllocSeqLength)\r
+T(U_AllocSeedCount)\r
+T(U_AddSeed)\r
+T(AddSeq)\r
+T(U_SetWordCounts)\r
+T(U_SetWordCountsHash)\r
+T(U_SetWordScores)\r
+T(U_SetHotHits)\r
+T(U_SetHotHitsHash)\r
+T(U_SetHotHitsScores)\r
+T(U_Search)\r
+T(U_SearchExact)\r
+T(WF_SeqToWords)\r
+T(WF_SeqToWordsA)\r
+T(WF_SeqToWordsB)\r
+T(WF_AllocLA)\r
+T(WF_AllocLB)\r
+T(WF_AllocDiags)\r
+T(WF_SetA)\r
+T(WF_SetA_Nb)\r
+T(WF_SetAZero)\r
+T(WF_SetA2)\r
+T(WF_SetB)\r
+T(WF_GetCommonWordCount)\r
+T(WF_GetBestDiag)\r
+T(GetFractIdGivenPath)\r
+T(WX_GetUniqueWords)\r
+T(CompressPath)\r
+T(GetHSPs1)\r
+T(GetHSPs2)\r
+T(AlignHSPs)\r
+T(WF_ResolveHSPs)\r
+T(WX_SetExcludes)\r
+T(ViterbiFast)\r
+T(ViterbiFastBand)\r
+T(ViterbiFastBand0)\r
+T(ViterbiFastBand1)\r
+T(ViterbiFastBand2)\r
+T(ViterbiFastBand3)\r
+T(ViterbiFastBand4)\r
+T(TraceBackBit)\r
+T(TraceBackBitSW)\r
+T(SF_GetNextSeq)\r
+T(SF_FillCache)\r
+T(OnGlobalAccept)\r
+T(UngappedBlast)\r
+T(UngappedBlastId)\r
+T(UngappedBlast2Hit)\r
+T(LogHSPs)\r
+T(BlastOutput)\r
+T(BlastLeft)\r
+T(BlastRight)\r
+T(Blast1)\r
+T(Blast2)\r
+T(Blast3)\r
+T(Blast4)\r
+T(GetBestSeg)\r
+T(SWLinearDP)\r
+T(SWLinearTB)\r
+T(SWLinearDP2)\r
+T(SWLinearTB2)\r
+T(Chain)\r
+T(XlatSeq)\r
+T(XlatSeqToLetters)\r
+T(XDropFwdSimple)\r
+T(XDropFwdFast)\r
+T(XDropFwdFastTB)\r
+T(XDropBwd)\r
+T(SWSimple)\r
+T(PathAlloc)\r
+T(SubPath)\r
+T(SWUngapped)\r
+T(SWFast)\r
+T(SWFastNTB)\r
+T(SWAT_CacheQuery)\r
+T(SWAT_AlignTarget)\r
+T(SWAT_CacheQueryNW)\r
+T(SWAT_AlignTargetNW)\r
+T(SeqDB_FromFasta)\r
+T(LocalUngappedHitToAD)\r
+T(LocalGappedHitToAD)\r
+T(GlobalHitToAD)\r
+T(ResolveOverlaps)\r
+T(GetORFs)\r
+T(ChainCov_AddHit)\r
+T(ChainCov_EndQuery)\r
+T(ChainCov_DoTarget)\r
+T(BuildNb)\r
+T(MakeIntSubstMx)\r
+T(UngappedExtendLeft)\r
+T(UngappedExtendRight)\r
+T(AlignSP)\r
+T(AlignHSP)\r
+\r
+// Background\r
+T(Bg_SearchLoop)\r
+T(Bg_MainInit)\r
+T(Bg_MainTerm)\r
+T(Bg_Other)\r
+T(Bg_1)\r
+T(Bg_2)\r
+T(Bg_3)\r
+T(Bg_4)\r
+T(Bg_5)\r
+T(Bg_6)\r
+T(Bg_7)\r
+T(Bg_8)\r
+T(Bg_9)\r
+T(Bg_XFrame2)\r
+T(Bg_Usearch1)\r
+T(Bg_Usearch2)\r
+T(Bg_Usearch3)\r
+T(Bg_Usearch4)\r
+T(Bg_Hot)\r
+\r
+// For Timer2\r
+T(Search_2)\r
+T(Search_Loop_2)\r
+T(Search_InnerLoop_2)\r
+T(OnHit_2)\r
+T(UngappedBlast_2)\r
+T(MainInit_2)\r
+T(MainTerm_2)\r
diff --git a/uchime_src/timing.h b/uchime_src/timing.h

new file mode 100644 (file)

index 0000000..0a80aee
--- /dev/null
+++ b/uchime_src/timing.h
@@ -0,0 +1,238 @@
+// Master switch: a non-zero value compiles in the timer / counter /
+// allocation instrumentation declared in this header.
+// NOTE(review): this define sits outside the include guard, so it is
+// re-evaluated on every inclusion -- confirm that is intended.
+#define TIMING 0
+#ifndef timing_h
+#define timing_h
+
+// Background-timer support; forced to 0 just below whenever TIMING is off.
+#define BG_TIMING      0
+
+#if !TIMING
+#undef BG_TIMING
+#define BG_TIMING      0
+#endif
+
+// UCHIMES builds always run with timing disabled.
+#if    UCHIMES
+#undef TIMING
+#define TIMING 0
+#endif
+
+#if TIMING
+
+// Timer ids: TIMER_None followed by one TIMER_<name> enumerator for each
+// T(name) line in timers.h.
+enum TIMER
+       {
+       TIMER_None,
+#define T(x)   TIMER_##x,
+#include "timers.h"
+#undef T
+       };
+
+// Number of TIMER values: every T() entry contributes "+1" on top of the
+// leading 1 for TIMER_None.
+const unsigned TimerCount =
+       1       // TIMER_None
+#define T(x)   +1
+#include "timers.h"
+#undef T
+       ;
+
+// Counter ids: one COUNTER_<name> enumerator per C(name) line in counters.h.
+enum COUNTER
+       {
+#define C(x)   COUNTER_##x,
+#include "counters.h"
+#undef C
+       };
+
+// Allocation-site ids: one ALLOCER_<name> enumerator per A(name) line in allocs.h.
+enum ALLOCER
+       {
+#define A(x)   ALLOCER_##x,
+#include "allocs.h"
+#undef A
+       };
+
+// Number of COUNTER values ("+1" per C() entry).
+const unsigned CounterCount =
+#define C(x)   +1
+#include "counters.h"
+#undef C
+       ;
+
+// Number of ALLOCER values ("+1" per A() entry).
+const unsigned AllocerCount =
+#define A(x)   +1
+#include "allocs.h"
+#undef A
+       ;
+
+#ifdef _MSC_VER
+
+// 64-bit tick count type for the MSVC build.
+typedef unsigned __int64 TICKS;
+
+// The function below has no return statement: the two emitted bytes
+// (0x0f 0x31, the x86 RDTSC opcode) leave the result in registers.
+// NOTE(review): warning 4035 is presumably MSVC's "no return value"
+// diagnostic, silenced for that reason -- confirm.
+#pragma warning(disable:4035)
+inline TICKS GetClockTicks()
+       {
+       _asm
+               {
+               _emit   0x0f
+               _emit   0x31
+               }
+       }
+
+#else  // ifdef _MSC_VER
+
+// 64-bit tick count type for non-MSVC builds.
+typedef uint64_t TICKS;
+// Read the CPU time-stamp counter with the x86 rdtsc instruction.
+// The asm constraints bind lo to the "a" register and hi to the "d"
+// register; the two 32-bit halves are then combined into one 64-bit value.
+__inline__ uint64_t GetClockTicks()
+       {
+       uint32_t lo, hi;
+       /* We cannot use "=A", since this would use %rax on x86_64 */
+       __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+       return (uint64_t)hi << 32 | lo;
+       }
+
+#endif // ifdef _MSC_VER
+
+//void AddTicks(const string &Name, TICKS Ticks1, TICKS Ticks2);
+//void AddBytes(const string &Name, double Bytes);
+//#define SubBytes(Name, Bytes)        AddBytes(Name, -double(Bytes))
+
+// Name of a timer; used in the Die() messages of the timer functions below.
+const char *TimerToStr(TIMER t);
+
+// Per-timer state, indexed by TIMER: tick stamp written when the timer is
+// started, accumulated ticks, and the number of completed intervals.
+extern TICKS g_BeginTicks[TimerCount];
+extern double g_TotalTicks[TimerCount];
+extern double g_TotalCounts[TimerCount];
+// Event counters, indexed by COUNTER (updated by AddCounter / IncCounter).
+extern double g_Counters[CounterCount];
+// Allocation statistics, indexed by ALLOCER (updated by MyAlloc_ / MyFree_).
+extern unsigned g_AllocNewCount[AllocerCount];
+extern unsigned g_AllocFreeCount[AllocerCount];
+extern double g_AllocNewBytes[AllocerCount];
+extern double g_AllocFreeBytes[AllocerCount];
+extern double g_AllocNetBytes[AllocerCount];
+extern double g_AllocPeakBytes[AllocerCount];
+// Set to true for a timer when StartTimer2_ is called on it.
+extern bool g_Timer2[TimerCount];
+// Timer currently running via StartTimer_, or TIMER_None when none is.
+extern TIMER g_CurrTimer;
+#if    BG_TIMING
+// Timer that is charged for the time spent while no StartTimer_ timer runs.
+extern TIMER g_BackgroundTimer;
+#endif
+
+// Instrumented allocation wrappers: each request is tagged with its
+// ALLOCER_<Name> id and the call site, then forwarded to MyAlloc_ / MyFree_.
+// N is an element count, not a byte count.  Every macro argument is
+// parenthesized so that expression arguments such as MYFREE(p, n+3, X)
+// produce the intended byte count.
+#define MYALLOC(Type, N, Name)         ((Type *) MyAlloc_((N)*sizeof(Type), ALLOCER_##Name, __FILE__, __LINE__))
+#define MYFREE(Array, N, Name)         MyFree_((Array), (N)*sizeof((Array)[0]), ALLOCER_##Name, __FILE__, __LINE__)
+
+inline void *MyAlloc_(unsigned Bytes, unsigned a, const char *FileName, int Line)
+       {
+       ++g_AllocNewCount[a];
+       g_AllocNewBytes[a] += Bytes;
+       g_AllocNetBytes[a] += Bytes;
+       if (g_AllocNetBytes[a] > g_AllocPeakBytes[a])
+               g_AllocPeakBytes[a] = g_AllocNetBytes[a];
+       return mymalloc(Bytes);
+       }
+
+// Record the release of Bytes bytes against allocation site a, then hand
+// the block to myfree2.  FileName and Line are accepted but not used here.
+inline void MyFree_(void *p, unsigned Bytes, unsigned a, const char *FileName, int Line)
+       {
+       const double Released = Bytes;
+       g_AllocNetBytes[a] -= Released;
+       g_AllocFreeBytes[a] += Released;
+       g_AllocFreeCount[a] += 1;
+       myfree2(p, Bytes);
+       }
+
+#if    BG_TIMING
+inline void SetBackgroundTimer_(TIMER Timer)
+       {
+       TICKS Now = GetClockTicks();
+       if (g_BeginTicks[g_BackgroundTimer] != 0)
+               {
+               ++g_TotalCounts[g_BackgroundTimer];
+               g_TotalTicks[g_BackgroundTimer] += double(Now - g_BeginTicks[g_BackgroundTimer]);
+               }
+       g_BackgroundTimer = Timer;
+       g_BeginTicks[Timer] = Now;
+       }
+#else
+#define SetBackgroundTimer_(Timer)     /* empty */
+#endif
+
+inline void StartTimer_(TIMER Timer)
+       {
+       if (g_CurrTimer != TIMER_None)
+               Die("StartTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(g_CurrTimer));
+
+       TICKS Now = GetClockTicks();
+#if    BG_TIMING
+       if (g_BeginTicks[g_BackgroundTimer] != 0)
+               {
+               ++g_TotalCounts[g_BackgroundTimer];
+               g_TotalTicks[g_BackgroundTimer] += double(Now - g_BeginTicks[g_BackgroundTimer]);
+               }
+#endif
+       g_BeginTicks[Timer] = Now;
+       g_CurrTimer = Timer;
+       }
+
+inline void PauseTimer_(TIMER Timer)
+       {
+       if (Timer != g_CurrTimer)
+               Die("PauseTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(g_CurrTimer));
+
+       TICKS Now = GetClockTicks();
+       g_TotalTicks[Timer] += double(Now - g_BeginTicks[Timer]);
+       g_BeginTicks[Timer] = Now;
+       g_CurrTimer = TIMER_None;
+       }
+
+inline void EndTimer_(TIMER Timer)
+       {
+       if (Timer != g_CurrTimer)
+               Die("EndTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(g_CurrTimer));
+
+       TICKS Now = GetClockTicks();
+#if    BG_TIMING
+       g_BeginTicks[g_BackgroundTimer] = Now;
+#endif
+       g_TotalTicks[Timer] += double(Now - g_BeginTicks[Timer]);
+       ++g_TotalCounts[Timer];
+       g_CurrTimer = TIMER_None;
+       }
+
+// Start a "Timer2" interval: stamp the start tick and flag the timer in
+// g_Timer2.  Unlike StartTimer_, g_CurrTimer is neither checked nor changed.
+inline void StartTimer2_(TIMER Timer)
+       {
+       const TICKS Now = GetClockTicks();
+       g_BeginTicks[Timer] = Now;
+       g_Timer2[Timer] = true;
+       }
+
+// End a "Timer2" interval: accumulate the ticks since its start stamp and
+// count one completed interval.
+inline void EndTimer2_(TIMER Timer)
+       {
+       const TICKS Now = GetClockTicks();
+       const TICKS Elapsed = Now - g_BeginTicks[Timer];
+       g_TotalTicks[Timer] += double(Elapsed);
+       g_TotalCounts[Timer] += 1;
+       }
+
+// Public instrumentation macros (TIMING build): callers pass the bare
+// timer / counter name and the TIMER_ / COUNTER_ prefix is pasted on here.
+// NOTE(review): PauseTimer2 is defined only in the !TIMING branch below, so
+// code using it would presumably not compile with TIMING enabled -- confirm.
+#define AddCounter(x, N)       g_Counters[COUNTER_##x] += N
+#define IncCounter(x)          ++(g_Counters[COUNTER_##x])
+#define StartTimer(x)          StartTimer_(TIMER_##x)
+#define PauseTimer(x)          PauseTimer_(TIMER_##x)
+#define EndTimer(x)                    EndTimer_(TIMER_##x)
+#define StartTimer2(x)         StartTimer2_(TIMER_##x)
+#define EndTimer2(x)           EndTimer2_(TIMER_##x)
+
+#if    BG_TIMING
+#define SetBackgroundTimer(x)  SetBackgroundTimer_(TIMER_##x)
+#else
+#define SetBackgroundTimer(x)  /* empty */
+#endif
+
+#else  // if TIMING
+
+// TIMING disabled: the instrumentation macros expand to nothing, and
+// MYALLOC / MYFREE forward to myalloc / myfree (N and Name are dropped by MYFREE).
+#define AddCounter(x, N)       /* empty */
+#define IncCounter(x)          /* empty */
+#define StartTimer(x)          /* empty */
+#define PauseTimer(x)          /* empty */
+#define EndTimer(x)                    /* empty */
+#define StartTimer2(x)         /* empty */
+#define PauseTimer2(x)         /* empty */
+#define EndTimer2(x)           /* empty */
+#define SetBackgroundTimer(x)  /* empty */
+#define MYALLOC(Type, N, Name)         myalloc(Type, N)
+#define MYFREE(Array, N, Name)         myfree(Array)
+
+#endif // if TIMING
+
+// Reporting entry points, declared whether or not TIMING is enabled;
+// their definitions are in other source files.
+void LogMemStats();
+void LogTickStats();
+void LogStats();
+void LogAllocs();
+
+// Byte accounting hooks; both expand to nothing.
+#define AddBytes(x, n) /* empty */
+#define SubBytes(x, n) /* empty */
+
+#endif // if timing_h
diff --git a/uchime_src/tracebackbit.cpp b/uchime_src/tracebackbit.cpp

new file mode 100644 (file)

index 0000000..94159cd
--- /dev/null
+++ b/uchime_src/tracebackbit.cpp
@@ -0,0 +1,180 @@
+#include "dp.h"
+
+#define TRACE  0
+
+// Traceback bit matrix and the row-pointer view of it (set in AllocBit).
+Mx<byte> g_Mx_TBBit;
+byte **g_TBBit;
+// DP row pointers; AllocBit points each one at element 1 of its buffer so
+// that index -1 is addressable.
+float *g_DPRow1;
+float *g_DPRow2;
+// Backing storage for g_DPRow1 / g_DPRow2.
+static float *g_DPBuffer1;
+static float *g_DPBuffer2;
+
+// LB capacity the DP buffers were last sized for (0 until the first AllocBit).
+static unsigned g_CacheLB;
+
+void AllocBit(unsigned LA, unsigned LB)
+       {
+       g_Mx_TBBit.Alloc("TBBit", LA+1, LB+1);
+       g_TBBit = g_Mx_TBBit.GetData();
+       if (LB > g_CacheLB)
+               {
+               MYFREE(g_DPBuffer1, g_CacheLB, AllocBit);
+               MYFREE(g_DPBuffer2, g_CacheLB, AllocBit);
+
+               g_CacheLB = LB + 128;
+
+       // Allow use of [-1]
+               //g_DPBuffer1 = myalloc<float>(g_CacheLB+3);
+               //g_DPBuffer2 = myalloc<float>(g_CacheLB+3);
+               g_DPBuffer1 = MYALLOC(float, g_CacheLB+3, AllocBit);
+               g_DPBuffer2 = MYALLOC(float, g_CacheLB+3, AllocBit);
+               g_DPRow1 = g_DPBuffer1 + 1;
+               g_DPRow2 = g_DPBuffer2 + 1;
+               }
+       }
+
+void TraceBackBit(unsigned LA, unsigned LB, char State, PathData &PD)
+       {
+       PD.Alloc(LA+LB);
+
+       StartTimer(TraceBackBit);
+       char *PathPtr = PD.Back;
+       *PathPtr = 0;
+
+       byte **TB = g_TBBit;
+
+#if    TRACE
+       Log("\n");
+       Log("TraceBackBit\n");
+#endif
+
+       size_t i = LA;
+       size_t j = LB;
+       for (;;)
+               {
+#if    TRACE
+               Log("i=%3d  j=%3d  state=%c\n", (int) i, (int) j, State);
+#endif
+               if (i == 0 && j == 0)
+                       break;
+
+               --PathPtr;
+               *PathPtr = State;
+
+               byte t;
+               switch (State)
+                       {
+               case 'M':
+                       asserta(i > 0 && j > 0);
+                       t = TB[i-1][j-1];
+                       if (t & TRACEBITS_DM)
+                               State = 'D';
+                       else if (t & TRACEBITS_IM)
+                               State = 'I';
+                       else
+                               State = 'M';
+                       --i;
+                       --j;
+                       break;
+               case 'D':
+                       asserta(i > 0);
+                       t = TB[i-1][j];
+                       if (t & TRACEBITS_MD)
+                               State = 'M';
+                       else
+                               State = 'D';
+                       --i;
+                       break;
+
+               case 'I':
+                       asserta(j > 0);
+                       t = TB[i][j-1];
+                       if (t & TRACEBITS_MI)
+                               State = 'M';
+                       else
+                               State = 'I';
+                       --j;
+                       break;
+
+               default:
+                       Die("TraceBackBit, invalid state %c", State);
+                       }
+               }
+       PD.Start = PathPtr;
+       EndTimer(TraceBackBit);
+       }
+
+void TraceBackBitSW(unsigned LA, unsigned LB, unsigned Besti, unsigned Bestj,
+  unsigned &Leni, unsigned &Lenj, PathData &PD)
+       {
+       PD.Alloc(LA+LB);
+
+       StartTimer(TraceBackBitSW);
+       char *PathPtr = PD.Back;
+       *PathPtr = 0;
+
+       byte **TB = g_TBBit;
+
+#if    TRACE
+       Log("\n");
+       Log("TraceBackBitSW\n");
+#endif
+
+       unsigned i = Besti;
+       unsigned j = Bestj;
+       char State = 'M';
+       for (;;)
+               {
+#if    TRACE
+               Log("i=%3d  j=%3d  state=%c\n", (int) i, (int) j, State);
+#endif
+               --PathPtr;
+               *PathPtr = State;
+
+               byte t;
+               switch (State)
+                       {
+               case 'M':
+                       asserta(i > 0 && j > 0);
+                       t = TB[i-1][j-1];
+                       if (t & TRACEBITS_DM)
+                               State = 'D';
+                       else if (t & TRACEBITS_IM)
+                               State = 'I';
+                       else if (t & TRACEBITS_SM)
+                               {
+                               Leni = Besti - i + 1;
+                               Lenj = Bestj - j + 1;
+                               PD.Start = PathPtr;
+                               EndTimer(TraceBackBitSW);
+                               return;
+                               }
+                       else
+                               State = 'M';
+                       --i;
+                       --j;
+                       break;
+               case 'D':
+                       asserta(i > 0);
+                       t = TB[i-1][j];
+                       if (t & TRACEBITS_MD)
+                               State = 'M';
+                       else
+                               State = 'D';
+                       --i;
+                       break;
+
+               case 'I':
+                       asserta(j > 0);
+                       t = TB[i][j-1];
+                       if (t & TRACEBITS_MI)
+                               State = 'M';
+                       else
+                               State = 'I';
+                       --j;
+                       break;
+
+               default:
+                       Die("TraceBackBitSW, invalid state %c", State);
+                       }
+               }
+       }
diff --git a/uchime_src/uc.h b/uchime_src/uc.h

new file mode 100644 (file)

index 0000000..93ef892
--- /dev/null
+++ b/uchime_src/uc.h
@@ -0,0 +1,60 @@
+#ifndef uc_h\r
+#define uc_h\r
+\r
+#include "seqdb.h"\r
+#include "seq.h"\r
+#include "path.h"\r
+\r
+struct AlnData;\r
+\r
+// Reader / writer for a .uc file.\r
+// NOTE(review): the vectors below look like parallel per-record columns\r
+// (one entry per record, see GetRecordCount) with the const char * entries\r
+// pointing into m_Data; none of that is visible in this header -- confirm\r
+// against the implementation.\r
+class UCFile\r
+       {\r
+public:\r
+       FILE *m_File;\r
+       byte *m_Data;\r
+       vector<char> m_RecTypes;\r
+       vector<float> m_PctIds;\r
+       vector<const char *> m_Labels;\r
+       vector<const char *> m_SeedLabels;\r
+       vector<unsigned> m_SeedIndexes;\r
+       vector<const char *> m_CompressedPaths;\r
+       vector<unsigned> m_SeqLengths;\r
+       vector<unsigned> m_SortOrder;\r
+       vector<char> m_Strands;\r
+       vector<unsigned> m_Los;\r
+       vector<unsigned> m_SeedLos;\r
+\r
+public:\r
+       UCFile();\r
+       void Clear(bool ctor = false);\r
+       void Close();\r
+\r
+// Whole-file operations: load from / save to the named file, convert to\r
+// other formats, sort.\r
+       void FromFile(const string &FileName);\r
+       void FromClstr(const string &FileName);\r
+       void ToFile(const string &FileName);\r
+       unsigned GetRecordCount() const;\r
+       void LogMe() const;\r
+       void ToClstr(const string &FileName);\r
+       void ToFasta(const string &FileName, const SeqDB &Input, bool Reformat);\r
+       void Create(const string &FileName);\r
+       void Sort();\r
+       void Flush() const;\r
+\r
+// Record writers, one per kind of record; WriteHit is overloaded for the\r
+// different ways a hit can be described.\r
+       void WriteNotMatched(unsigned L, const char *Label) const;\r
+       void WriteLibSeed(unsigned SeedIndex, unsigned L, const char *Label) const;\r
+       void WriteNewSeed(unsigned SeedIndex, unsigned L, const char *Label) const;\r
+       void WriteHit(const SeqData &SA, const SeqData &SB, double FractId,\r
+         const PathData &PD) const;\r
+       void WriteReject(const SeqData &SA, const SeqData &SB, double FractId,\r
+         const char *Path) const;\r
+       void WriteHit(unsigned SeedIndex, unsigned L, double PctId,\r
+         const char *CompressedPath, char Strand, unsigned Lo, unsigned SeedLo,\r
+         const char *Label, const char *SeedLabel) const;\r
+       void WriteHit(const AlnData &AD);\r
+       void WriteLibCluster(unsigned SeedIndex, unsigned Size, double AvgId,\r
+         const char *Label) const;\r
+       void WriteNewCluster(unsigned SeedIndex, unsigned Size, double AvgId,\r
+         const char *Label) const;\r
+       void WriteSeqX(FILE *f, const byte *Seq, unsigned L, const char *CompressedPath) const;\r
+       };\r
+\r
+#endif // uc_h\r
diff --git a/uchime_src/uchime_main.cpp b/uchime_src/uchime_main.cpp

new file mode 100644 (file)

index 0000000..b9d69ad
--- /dev/null
+++ b/uchime_src/uchime_main.cpp
@@ -0,0 +1,212 @@
+#include "myutils.h"\r
+#include "chime.h"\r
+#include "seqdb.h"\r
+#include "dp.h"\r
+#include "ultra.h"\r
+#include "hspfinder.h"\r
+#include <algorithm>\r
+#include <set>\r
+\r
+// Chimera search for one query; defined in another source file.\r
+// main() counts a query as a hit when this returns true.\r
+bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
+  const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
+  float MinFractId, ChimeHit2 &Hit);\r
+\r
+// Output streams, opened in main() only when the matching option is set.\r
+FILE *g_fUChime;\r
+FILE *g_fUChimeAlns;\r
+// Values consulted by CmpDescVecFloat; set by SortDescending before sorting.\r
+const vector<float> *g_SortVecFloat;\r
+// True when no reference database was given (set in main()).\r
+bool g_UchimeDeNovo = false;\r
+\r
+void Usage()\r
+       {\r
+       printf("\n");\r
+       printf("UCHIME %s by Robert C. Edgar\n", MY_VERSION);\r
+       printf("http://www.drive5.com/uchime\n");\r
+       printf("\n");\r
+       printf("This software is donated to the public domain\n");\r
+       printf("\n");\r
+\r
+       printf(\r
+#include "help.h"\r
+               );\r
+       }\r
+\r
+// Stub: this build has no BLOSUM62 support; calling it is a fatal error.\r
+void SetBLOSUM62()\r
+       {\r
+       Die("SetBLOSUM62 not implemented");\r
+       }\r
+\r
+// Stub: reading a substitution matrix is not supported in this build;\r
+// both arguments are ignored and calling it is a fatal error.\r
+void ReadSubstMx(const string &/*FileName*/, Mx<float> &/*Mxf*/)\r
+       {\r
+       Die("ReadSubstMx not implemented");\r
+       }\r
+\r
+// Stub: intentionally does nothing.\r
+void LogAllocs()\r
+       {\r
+       /*empty*/\r
+       }\r
+\r
+// Sort predicate for index vectors: true when index i refers to a strictly\r
+// larger value than index j in the vector published through g_SortVecFloat.\r
+static bool CmpDescVecFloat(unsigned i, unsigned j)\r
+       {\r
+       const vector<float> &Keys = *g_SortVecFloat;\r
+       return Keys[j] < Keys[i];\r
+       }\r
+\r
+// Replace the contents of v with 0, 1, ..., N-1.\r
+void Range(vector<unsigned> &v, unsigned N)\r
+       {\r
+       v.resize(N);\r
+       for (unsigned Pos = 0; Pos < N; ++Pos)\r
+               v[Pos] = Pos;\r
+       }\r
+\r
+void SortDescending(const vector<float> &Values, vector<unsigned> &Order)\r
+       {\r
+       StartTimer(Sort);\r
+       const unsigned N = SIZE(Values);\r
+       Range(Order, N);\r
+       g_SortVecFloat = &Values;\r
+       sort(Order.begin(), Order.end(), CmpDescVecFloat);\r
+       EndTimer(Sort);\r
+       }\r
+\r
+float GetAbFromLabel(const string &Label)\r
+       {\r
+       vector<string> Fields;\r
+       Split(Label, Fields, '/');\r
+       const unsigned N = SIZE(Fields);\r
+       for (unsigned i = 0; i < N; ++i)\r
+               {\r
+               const string &Field = Fields[i];\r
+               if (Field.substr(0, 3) == "ab=")\r
+                       {\r
+                       string a = Field.substr(3, string::npos);\r
+                       return (float) atof(a.c_str());\r
+                       }\r
+               }\r
+       if (g_UchimeDeNovo)\r
+               Die("Missing abundance /ab=xx/ in label >%s", Label.c_str());\r
+       return 0.0;\r
+       }\r
+\r
+int main(int argc, char *argv[])\r
+       {\r
+               \r
+       MyCmdLine(argc, argv);\r
+\r
+       if (argc < 2)\r
+               {\r
+               Usage();\r
+               return 0;\r
+               }\r
+\r
+       if (opt_version)\r
+               {\r
+               printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION);\r
+               return 0;\r
+               }\r
+\r
+       printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION);\r
+       printf("by Robert C. Edgar\n");\r
+       printf("http://drive5.com/uchime\n");\r
+       printf("This code is donated to the public domain.\n");\r
+       printf("\n");\r
+       if (!optset_w)\r
+               opt_w = 8;\r
+       \r
+       float MinFractId = 0.95f;\r
+       if (optset_id)\r
+               MinFractId = (float) opt_id;\r
+\r
+       Log("%8.2f  minh\n", opt_minh);\r
+       Log("%8.2f  xn\n", opt_xn);\r
+       Log("%8.2f  dn\n", opt_dn);\r
+       Log("%8.2f  xa\n", opt_xa);\r
+       Log("%8.2f  mindiv\n", opt_mindiv);\r
+       Log("%8u  maxp\n", opt_maxp);\r
+\r
+       if (opt_input == "" && opt_uchime != "")\r
+               opt_input = opt_uchime;\r
+\r
+       if (opt_input == "")\r
+               Die("Missing --input");\r
+\r
+       g_UchimeDeNovo = (opt_db == "");\r
+\r
+       if (opt_uchimeout != "")\r
+               g_fUChime = CreateStdioFile(opt_uchimeout);\r
+\r
+       if (opt_uchimealns != "")\r
+               g_fUChimeAlns = CreateStdioFile(opt_uchimealns);\r
+\r
+       SeqDB Input;\r
+       SeqDB DB;\r
+\r
+       Input.FromFasta(opt_input);\r
+       if (!Input.IsNucleo())\r
+               Die("Input contains amino acid sequences");\r
+\r
+       const unsigned QuerySeqCount = Input.GetSeqCount();\r
+       vector<unsigned> Order;\r
+       for (unsigned i = 0; i < QuerySeqCount; ++i)\r
+               Order.push_back(i);\r
+\r
+       if (g_UchimeDeNovo)\r
+               {\r
+               vector<float> Abs;\r
+               for (unsigned i = 0; i < QuerySeqCount; ++i)\r
+                       {\r
+                       const char *Label = Input.GetLabel(i);\r
+                       float Ab = GetAbFromLabel(Label);\r
+                       Abs.push_back(Ab);\r
+                       }\r
+               SortDescending(Abs, Order);\r
+               DB.m_IsNucleoSet = true;\r
+               DB.m_IsNucleo = true;\r
+               }\r
+       else\r
+               {\r
+               DB.FromFasta(opt_db);\r
+               if (!DB.IsNucleo())\r
+                       Die("Database contains amino acid sequences");\r
+               }\r
+\r
+       vector<ChimeHit2> Hits;\r
+       unsigned HitCount = 0;\r
+       for (unsigned i = 0; i < QuerySeqCount; ++i)\r
+               {\r
+               unsigned QuerySeqIndex = Order[i];\r
+\r
+               SeqData QSD;\r
+               Input.GetSeqData(QuerySeqIndex, QSD);\r
+\r
+               float QAb = -1.0;\r
+               if (g_UchimeDeNovo)\r
+                       QAb = GetAbFromLabel(QSD.Label);\r
+\r
+               ChimeHit2 Hit;\r
+               AlnParams &AP = *(AlnParams *) 0;\r
+               AlnHeuristics &AH = *(AlnHeuristics *) 0;\r
+               HSPFinder &HF = *(HSPFinder *) 0;\r
+               bool Found = SearchChime(DB, QSD, QAb, AP, AH, HF, MinFractId, Hit);\r
+               if (Found)\r
+                       ++HitCount;\r
+               else\r
+                       {\r
+                       if (g_UchimeDeNovo)\r
+                               DB.AddSeq(QSD.Label, QSD.Seq, QSD.L);\r
+                       }\r
+\r
+               WriteChimeHit(g_fUChime, Hit);\r
+\r
+               ProgressStep(i, QuerySeqCount, "%u/%u chimeras found (%.1f%%)", HitCount, i, Pct(HitCount, i+1));\r
+               }\r
+\r
+       Log("\n");\r
+       Log("%s: %u/%u chimeras found (%.1f%%)\n",\r
+         opt_input.c_str(), HitCount, QuerySeqCount, Pct(HitCount, QuerySeqCount));\r
+\r
+       CloseStdioFile(g_fUChime);\r
+       CloseStdioFile(g_fUChimeAlns);\r
+\r
+       ProgressExit();\r
+       return 0;\r
+       }\r
diff --git a/uchime_src/ultra.h b/uchime_src/ultra.h

new file mode 100644 (file)

index 0000000..e0a432f
--- /dev/null
+++ b/uchime_src/ultra.h
@@ -0,0 +1,8 @@
+#ifndef ultra_h
+#define ultra_h
+
+#include "seqdb.h"
+// Compatibility aliases: in this build the name Ultra is textually replaced
+// by SeqDB, and GetSeedLabel by GetLabel.
+#define Ultra SeqDB
+#define GetSeedLabel GetLabel
+
+#endif // ultra_h
diff --git a/uchime_src/usort.cpp b/uchime_src/usort.cpp

new file mode 100644 (file)

index 0000000..922dcb4
--- /dev/null
+++ b/uchime_src/usort.cpp
@@ -0,0 +1,86 @@
+#if    UCHIMES\r
+\r
+#include "myutils.h"\r
+#include "seqdb.h"\r
+#include "seq.h"\r
+#include "alpha.h"\r
+\r
+// Defined in uchime_main.cpp.\r
+void SortDescending(const vector<float> &Values, vector<unsigned> &Order);\r
+\r
+// Presence table for the current query: one byte per possible word, set to\r
+// 1 for every word found in the query.  Allocated on first use by SetQuery.\r
+static byte *g_QueryHasWord;\r
+// Number of entries in g_QueryHasWord (computed once, from opt_w).\r
+static unsigned g_WordCount;\r
+\r
+unsigned GetWord(const byte *Seq)\r
+       {\r
+       unsigned Word = 0;\r
+       const byte *Front = Seq;\r
+       for (unsigned i = 0; i < opt_w; ++i)\r
+               {\r
+               unsigned Letter = g_CharToLetterNucleo[*Front++];\r
+               Word = (Word*4) + Letter;\r
+               }\r
+       return Word;\r
+       }\r
+\r
+static void SetQuery(const SeqData &Query)\r
+       {\r
+       if (g_QueryHasWord == 0)\r
+               {\r
+               g_WordCount = 4;\r
+               for (unsigned i = 1; i < opt_w; ++i)\r
+                       g_WordCount *= 4;\r
+\r
+               g_QueryHasWord = myalloc(byte, g_WordCount);\r
+               }\r
+\r
+       memset(g_QueryHasWord, 0, g_WordCount);\r
+\r
+       if (Query.L <= opt_w)\r
+               return;\r
+\r
+       const unsigned L = Query.L - opt_w + 1;\r
+       const byte *Seq = Query.Seq;\r
+       for (unsigned i = 0; i < L; ++i)\r
+               {\r
+               unsigned Word = GetWord(Seq++);\r
+               g_QueryHasWord[Word] = 1;\r
+               }\r
+       }\r
+\r
+static unsigned GetUniqueWordsInCommon(const SeqData &Target)\r
+       {\r
+       if (Target.L <= opt_w)\r
+               return 0;\r
+\r
+       unsigned Count = 0;\r
+       const unsigned L = Target.L - opt_w + 1;\r
+       const byte *Seq = Target.Seq;\r
+       for (unsigned i = 0; i < L; ++i)\r
+               {\r
+               unsigned Word = GetWord(Seq++);\r
+               if (g_QueryHasWord[Word])\r
+                       ++Count;\r
+               }\r
+       return Count;\r
+       }\r
+\r
+void USort(const SeqData &Query, const SeqDB &DB, vector<float> &WordCounts, \r
+  vector<unsigned> &Order)\r
+       {\r
+       WordCounts.clear();\r
+       Order.clear();\r
+\r
+       SetQuery(Query);\r
+\r
+       const unsigned SeqCount = DB.GetSeqCount();\r
+       for (unsigned SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex)\r
+               {\r
+               SeqData Target;\r
+               DB.GetSeqData(SeqIndex, Target);\r
+               float WordCount = (float) GetUniqueWordsInCommon(Target);\r
+               WordCounts.push_back(WordCount);\r
+               }\r
+       SortDescending(WordCounts, Order);\r
+       }\r
+\r
+#endif // UCHIMES\r
diff --git a/uchime_src/viterbifast.cpp b/uchime_src/viterbifast.cpp

new file mode 100644 (file)

index 0000000..2b20174
--- /dev/null
+++ b/uchime_src/viterbifast.cpp
@@ -0,0 +1,378 @@
+#include "dp.h"
+#include "out.h"
+#include "evalue.h"
+
+#define CMP_SIMPLE     0
+\r
+#if    SAVE_FAST
+static Mx<float> g_MxDPM;
+static Mx<float> g_MxDPD;
+static Mx<float> g_MxDPI;
+
+static Mx<char> g_MxTBM;
+static Mx<char> g_MxTBD;
+static Mx<char> g_MxTBI;
+
+static float **g_DPM;
+static float **g_DPD;
+static float **g_DPI;
+
+static char **g_TBM;
+static char **g_TBD;
+static char **g_TBI;
+
+#if    CMP_SIMPLE
+static Mx<float> *g_DPMSimpleMx;
+static Mx<float> *g_DPDSimpleMx;
+static Mx<float> *g_DPISimpleMx;
+static float **g_DPMSimple;
+static float **g_DPDSimple;
+static float **g_DPISimple;
+
+#define cmpm(i, j, x)  { if (!feq(x, g_DPMSimple[i][j])) \
+                                                       { \
+                                                       Die("%s:%d %.1f != DPMSimple[%u][%u] = %.1f", \
+                                                         __FILE__, __LINE__, x, i, j, g_DPMSimple[i][j]); \
+                                                       } \
+                                               }
+
+#define cmpd(i, j, x)  { if (!feq(x, g_DPDSimple[i][j])) \
+                                                       { \
+                                                       Die("%s:%d %.1f != DPMSimple[%u][%u] = %.1f", \
+                                                         __FILE__, __LINE__, x, i, j, g_DPDSimple[i][j]); \
+                                                       } \
+                                               }
+
+#define cmpi(i, j, x)  { if (!feq(x, g_DPISimple[i][j])) \
+                                                       { \
+                                                       Die("%s:%d %.1f != DPMSimple[%u][%u] = %.1f", \
+                                                         __FILE__, __LINE__, x, i, j, g_DPISimple[i][j]); \
+                                                       } \
+                                               }
+
+#else
+
+#define cmpm(i, j, x)  /* empty */
+#define cmpd(i, j, x)  /* empty */
+#define cmpi(i, j, x)  /* empty */
+
+#endif
+
+static void AllocSave(unsigned LA, unsigned LB)
+       {
+#if    CMP_SIMPLE
+       GetSimpleDPMxs(&g_DPMSimpleMx, &g_DPDSimpleMx, &g_DPISimpleMx);
+       g_DPMSimple = g_DPMSimpleMx->GetData();
+       g_DPDSimple = g_DPDSimpleMx->GetData();
+       g_DPISimple = g_DPISimpleMx->GetData();
+#endif
+       g_MxDPM.Alloc("FastM", LA+1, LB+1);\r
+       g_MxDPD.Alloc("FastD", LA+1, LB+1);\r
+       g_MxDPI.Alloc("FastI", LA+1, LB+1);\r
+\r
+       g_MxTBM.Alloc("FastTBM", LA+1, LB+1);\r
+       g_MxTBD.Alloc("FastTBD", LA+1, LB+1);\r
+       g_MxTBI.Alloc("FastTBI", LA+1, LB+1);\r
+\r
+       g_DPM = g_MxDPM.GetData();\r
+       g_DPD = g_MxDPD.GetData();\r
+       g_DPI = g_MxDPI.GetData();\r
+\r
+       g_TBM = g_MxTBM.GetData();\r
+       g_TBD = g_MxTBD.GetData();\r
+       g_TBI = g_MxTBI.GetData();\r
+       }
+
+// Store x at [i][j] of the saved M-state score matrix. In CMP_SIMPLE builds
+// interior cells (i > 0 and j > 0) are also asserted equal to g_DPMSimple.
+static void SAVE_DPM(unsigned i, unsigned j, float x)
+       {
+       float *Row = g_DPM[i];
+       Row[j] = x;
+#if    CMP_SIMPLE
+       if (i > 0 && j > 0)
+               asserta(feq(x, g_DPMSimple[i][j]));
+#endif
+       }
+
+// Store x at [i][j] of the saved D-state score matrix. In CMP_SIMPLE builds
+// interior cells (i > 0 and j > 0) are also asserted equal to g_DPDSimple.
+static void SAVE_DPD(unsigned i, unsigned j, float x)
+       {
+       float *Row = g_DPD[i];
+       Row[j] = x;
+#if    CMP_SIMPLE
+       if (i > 0 && j > 0)
+               asserta(feq(x, g_DPDSimple[i][j]));
+#endif
+       }
+
+// Store x at [i][j] of the saved I-state score matrix. In CMP_SIMPLE builds
+// interior cells (i > 0 and j > 0) are also asserted equal to g_DPISimple.
+static void SAVE_DPI(unsigned i, unsigned j, float x)
+       {
+       float *Row = g_DPI[i];
+       Row[j] = x;
+#if    CMP_SIMPLE
+       if (i > 0 && j > 0)
+               asserta(feq(x, g_DPISimple[i][j]));
+#endif
+       }
+
+// Store the traceback character x at [i][j] of the saved M-state matrix.
+static void SAVE_TBM(unsigned i, unsigned j, char x)
+       {
+       char *Row = g_TBM[i];
+       Row[j] = x;
+       }
+
+// Store the traceback character x at [i][j] of the saved D-state matrix.
+static void SAVE_TBD(unsigned i, unsigned j, char x)
+       {
+       char *Row = g_TBD[i];
+       Row[j] = x;
+       }
+
+// Store the traceback character x at [i][j] of the saved I-state matrix.
+static void SAVE_TBI(unsigned i, unsigned j, char x)
+       {
+       char *Row = g_TBI[i];
+       Row[j] = x;
+       }
+
+// Hand out the saved score matrices: on return *M, *D and *I point at the
+// file-static g_MxDPM, g_MxDPD and g_MxDPI objects respectively.
+void GetFastMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I)
+       {
+       Mx<float> *const MatchMx = &g_MxDPM;
+       Mx<float> *const DeleteMx = &g_MxDPD;
+       Mx<float> *const InsertMx = &g_MxDPI;
+
+       *M = MatchMx;
+       *D = DeleteMx;
+       *I = InsertMx;
+       }
+
+#else  // SAVE_FAST
+
+#define        SAVE_DPM(i, j, x)       /* empty */
+#define        SAVE_DPD(i, j, x)       /* empty */
+#define        SAVE_DPI(i, j, x)       /* empty */
+
+#define        SAVE_TBM(i, j, x)       /* empty */
+#define        SAVE_TBD(i, j, x)       /* empty */
+#define        SAVE_TBI(i, j, x)       /* empty */
+
+#define AllocSave(LA, LB)      /* empty */
+
+#define cmpm(i, j, x)  /* empty */
+#define cmpd(i, j, x)  /* empty */
+#define cmpi(i, j, x)  /* empty */
+
+#endif // SAVE_FAST
+
+float ViterbiFast(const byte *A, unsigned LA, const byte *B, unsigned LB,
+  const AlnParams &AP, PathData &PD)
+       {
+       if (LA*LB > 100*1000*1000)
+               Die("ViterbiFast, too long LA=%u, LB=%u", LA, LB);
+
+       AllocBit(LA, LB);
+       AllocSave(LA, LB);
+       
+       StartTimer(ViterbiFast);
+
+       const float * const *Mx = AP.SubstMx;
+       float OpenA = AP.LOpenA;
+       float ExtA = AP.LExtA;
+
+       byte **TB = g_TBBit;
+       float *Mrow = g_DPRow1;
+       float *Drow = g_DPRow2;
+
+// Use Mrow[-1], so...
+       Mrow[-1] = MINUS_INFINITY;
+       for (unsigned j = 0; j <= LB; ++j)
+               {
+               Mrow[j] = MINUS_INFINITY;
+               SAVE_DPM(0, j, MINUS_INFINITY);
+               SAVE_TBM(0, j, '?');
+
+               Drow[j] = MINUS_INFINITY;
+               SAVE_DPD(0, j, MINUS_INFINITY);
+               SAVE_TBD(0, j, '?');
+               }
+       
+// Main loop
+       float M0 = float (0);
+       SAVE_DPM(0, 0, 0);
+       for (unsigned i = 0; i < LA; ++i)
+               {
+               byte a = A[i];
+               const float *MxRow = Mx[a];
+               float OpenB = AP.LOpenB;
+               float ExtB = AP.LExtB;
+               float I0 = MINUS_INFINITY;
+
+               SAVE_TBM(i, 0, '?');
+
+               SAVE_DPI(i, 0, MINUS_INFINITY);
+               SAVE_DPI(i, 1, MINUS_INFINITY);
+
+               SAVE_TBI(i, 0, '?');
+               SAVE_TBI(i, 1, '?');
+               
+               byte *TBrow = TB[i];
+               for (unsigned j = 0; j < LB; ++j)
+                       {
+                       byte b = B[j];
+                       byte TraceBits = 0;
+                       float SavedM0 = M0;
+
+               // MATCH
+                       {
+               // M0 = DPM[i][j]
+               // I0 = DPI[i][j]
+               // Drow[j] = DPD[i][j]
+                       cmpm(i, j, M0);
+                       cmpd(i, j, Drow[j]);
+                       cmpi(i, j, I0);
+
+                       float xM = M0;
+                       SAVE_TBM(i+1, j+1, 'M');
+                       if (Drow[j] > xM)
+                               {
+                               xM = Drow[j];
+                               TraceBits = TRACEBITS_DM;
+                               SAVE_TBM(i+1, j+1, 'D');
+                               }
+                       if (I0 > xM)
+                               {
+                               xM = I0;
+                               TraceBits = TRACEBITS_IM;
+                               SAVE_TBM(i+1, j+1, 'I');
+                               }
+                       M0 = Mrow[j];
+                       cmpm(i, j+1, M0);
+
+                       Mrow[j] = xM + MxRow[b];
+               // Mrow[j] = DPM[i+1][j+1])
+                       SAVE_DPM(i+1, j+1, Mrow[j]);
+                       }
+                       
+               // DELETE
+                       {
+               // SavedM0 = DPM[i][j]
+               // Drow[j] = DPD[i][j]
+                       cmpm(i, j, SavedM0);
+                       cmpd(i, j, Drow[j]);
+
+                       float md = SavedM0 + OpenB;
+                       Drow[j] += ExtB;
+                       SAVE_TBD(i+1, j, 'D');
+                       if (md >= Drow[j])
+                               {
+                               Drow[j] = md;
+                               TraceBits |= TRACEBITS_MD;
+                               SAVE_TBD(i+1, j, 'M');
+                               }
+               // Drow[j] = DPD[i+1][j]
+                       SAVE_DPD(i+1, j, Drow[j]);
+                       }
+                       
+               // INSERT
+                       {
+               // SavedM0 = DPM[i][j]
+               // I0 = DPI[i][j]
+                       cmpm(i, j, SavedM0);
+                       cmpi(i, j, I0);
+                       
+                       float mi = SavedM0 + OpenA;
+                       I0 += ExtA;
+                       SAVE_TBI(i, j+1, 'I');
+                       if (mi >= I0)
+                               {
+                               I0 = mi;
+                               TraceBits |= TRACEBITS_MI;
+                               SAVE_TBI(i, j+1, 'M');
+                               }
+               // I0 = DPI[i][j+1]
+                       SAVE_DPI(i, j+1, I0);
+                       }
+                       
+                       OpenB = AP.OpenB;
+                       ExtB = AP.ExtB;
+                       
+                       TBrow[j] = TraceBits;
+                       }
+               
+       // Special case for end of Drow[]
+               {
+       // M0 = DPM[i][LB]
+       // Drow[LB] = DPD[i][LB]
+               
+               TBrow[LB] = 0;
+               float md = M0 + AP.ROpenB;
+               Drow[LB] += AP.RExtB;
+               SAVE_TBD(i+1, LB, 'D');
+               if (md >= Drow[LB])
+                       {
+                       Drow[LB] = md;
+                       TBrow[LB] = TRACEBITS_MD;
+                       SAVE_TBD(i+1, LB, 'M');
+                       }
+       // Drow[LB] = DPD[i+1][LB]
+               SAVE_DPD(i+1, LB, Drow[LB]);
+               }
+               
+               SAVE_DPM(i+1, 0, MINUS_INFINITY);
+               M0 = MINUS_INFINITY;
+
+               OpenA = AP.OpenA;
+               ExtA = AP.ExtA;
+               }
+       
+       SAVE_TBM(LA, 0, '?');
+
+// Special case for last row of DPI
+       byte *TBrow = TB[LA];
+       float I1 = MINUS_INFINITY;
+
+       SAVE_DPI(LA, 0, MINUS_INFINITY);
+       SAVE_TBI(LA, 0, '?');
+
+       SAVE_DPI(LA, 1, MINUS_INFINITY);
+       SAVE_TBI(LA, 1, '?');
+
+       for (unsigned j = 1; j < LB; ++j)
+               {
+       // Mrow[j-1] = DPM[LA][j]
+       // I1 = DPI[LA][j]
+               
+               TBrow[j] = 0;
+               float mi = Mrow[int(j)-1] + AP.ROpenA;
+               I1 += AP.RExtA;
+               SAVE_TBI(LA, j+1, 'I');
+               if (mi > I1)
+                       {
+                       I1 = mi;
+                       TBrow[j] = TRACEBITS_MI;
+                       SAVE_TBI(LA, j+1, 'M');
+                       }
+               SAVE_DPI(LA, j+1, I1);
+               }
+       
+       float FinalM = Mrow[LB-1];
+       float FinalD = Drow[LB];
+       float FinalI = I1;
+// FinalM = DPM[LA][LB]
+// FinalD = DPD[LA][LB]
+// FinalI = DPI[LA][LB]
+       
+       float Score = FinalM;
+       byte State = 'M';
+       if (FinalD > Score)
+               {
+               Score = FinalD;
+               State = 'D';
+               }
+       if (FinalI > Score)
+               {
+               Score = FinalI;
+               State = 'I';
+               }
+
+       EndTimer(ViterbiFast);
+       TraceBackBit(LA, LB, State, PD);
+
+#if    SAVE_FAST
+       g_MxDPM.LogMe();
+       g_MxDPD.LogMe();
+       g_MxDPI.LogMe();
+
+       g_MxTBM.LogMe();
+       g_MxTBD.LogMe();
+       g_MxTBI.LogMe();
+#endif
+
+       return Score;
+       }
diff --git a/uchime_src/windex.h b/uchime_src/windex.h

new file mode 100644 (file)

index 0000000..0b324ca
--- /dev/null
+++ b/uchime_src/windex.h
@@ -0,0 +1,71 @@
+#ifndef windex_h\r
+#define windex_h\r
+\r
+class SFasta;\r
+struct SeqDB;\r
+\r
+typedef uint32 word_t;\r
+typedef uint16 wordcount_t;\r
+typedef uint32 arrsize_t;\r
+typedef uint16 seqcountperword_t;\r
+typedef uint32 seqindex_t;\r
+typedef uint16 commonwordcount_t;\r
+\r
+const uint32 WindexFileHdr_Magic1 = 0x312DE41;\r
+const uint32 WindexFileHdr_Magic2 = 0x312DE42;\r
+const uint32 WindexFileHdr_Magic3 = 0x312DE43;\r
+const uint32 WindexFileHdr_Magic4 = 0x312DE44;\r
+\r
+struct WindexFileHdr /* record of four uint32 fields; presumably the file header used by Windex::ToFile/FromFile -- TODO confirm */\r
+       {\r
+       uint32 Magic1; /* presumably holds WindexFileHdr_Magic1 -- confirm against the writer */\r
+       uint32 IsNucleo; /* NOTE(review): looks like a 0/1 flag mirroring Windex::m_Nucleo -- confirm */\r
+       uint32 WordLength; /* NOTE(review): looks like it mirrors Windex::m_WordLength -- confirm */\r
+       uint32 Magic2; /* presumably holds WindexFileHdr_Magic2 -- confirm against the writer */\r
+       };\r
+\r
+class Windex /* word index built from a sequence set (see FromSFasta/FromSeqDB); member roles noted below are inferred from names and types -- confirm in the implementation */\r
+       {\r
+public:\r
+       bool m_Nucleo; /* presumably set from the Nucleo argument of Init()/Init2() -- TODO confirm */\r
+       bool m_RedAlpha; /* NOTE(review): looks like a reduced-alphabet flag (cf. InitRed) -- confirm */\r
+       unsigned m_WordLength;\r
+       unsigned m_AlphaSize;\r
+       unsigned m_WordCount;\r
+       unsigned m_Hi;\r
+       unsigned m_CapacityInc;\r
+       arrsize_t *m_Capacities; /* NOTE(review): m_Capacities, m_Sizes and m_SeedIndexes look like parallel per-word arrays -- confirm */\r
+       arrsize_t *m_Sizes;\r
+       float *m_WordScores; /* presumably filled by InitWordScores() -- TODO confirm */\r
+       seqindex_t **m_SeedIndexes;\r
+       byte *m_UniqueCounts;\r
+       unsigned m_CharToLetter[256]; /* one entry per possible byte value */\r
+\r
+public:\r
+       Windex();\r
+       void ToFile(const string &FileName) const;\r
+       void FromFile(const string &FileName);\r
+       void FromSFasta(SFasta &SF);\r
+       void FromSeqDB(const SeqDB &DB);\r
+       void Clear(bool ctor = false);\r
+       void AddWords(unsigned SeqIndex, const word_t *Words, unsigned N);\r
+       void Init(bool Nucleo, unsigned WordLength);\r
+       void Init2(bool Nucleo, unsigned TableSize);\r
+       void InitRed(unsigned WordLength);\r
+       void InitWordScores(const float *const *SubstMx);\r
+       void Reset();\r
+       void LogMe() const;\r
+       unsigned LogMemSize() const;\r
+       void LogWordStats(unsigned TopWords = 10) const;\r
+       const char *WordToStr(word_t Word) const;\r
+       word_t SeqToWord(const byte *Seq) const;\r
+       unsigned SeqToWords(const byte *Seq, unsigned L, word_t *Words) const;\r
+       unsigned SeqToWordsStep(unsigned Step, const byte *Seq, unsigned L, word_t *Words) const;\r
+       unsigned WordsToCounts(const word_t *Words, unsigned N,\r
+         word_t *UniqueWords, seqcountperword_t *Counts) const;\r
+       unsigned GetUniqueWords(const word_t *Words, unsigned N,\r
+         word_t *UniqueWords) const;\r
+       void LogSizeHisto() const;\r
+       };\r
+\r
+#endif // windex_h\r
diff --git a/uchime_src/writechhit.cpp b/uchime_src/writechhit.cpp

new file mode 100644 (file)

index 0000000..ea67061
--- /dev/null
+++ b/uchime_src/writechhit.cpp
@@ -0,0 +1,329 @@
+#include "myutils.h"\r
+#include "chime.h"\r
+\r
+void WriteChimeFileHdr(FILE *f)\r
+       {\r
+       if (f == 0)\r
+               return;\r
+\r
+       fprintf(f,\r
+               "\tQuery"               // 1\r
+               "\tA"                   // 2\r
+               "\tB"                   // 3\r
+               "\tIdQM"                // 4\r
+               "\tIdQA"                // 5\r
+               "\tIdQB"                // 6\r
+               "\tIdAB"                // 7\r
+               "\tIdQT"                // 8\r
+               "\tLY"                  // 9\r
+               "\tLN"                  // 10\r
+               "\tLA"                  // 11\r
+               "\tRY"                  // 12\r
+               "\tRN"                  // 13\r
+               "\tRA"                  // 14\r
+               "\tDiv"                 // 15\r
+               "\tY"                   // 16\r
+               "\n"\r
+               );\r
+       }\r
+\r
+void WriteChimeHit(FILE *f, const ChimeHit2 &Hit)\r
+       {\r
+       if (f == 0)\r
+               return;\r
+\r
+       if (Hit.Div <= 0.0)\r
+               {\r
+               fprintf(f, "0.0000");           // 0\r
+\r
+               fprintf(f,\r
+                 "\t%s", Hit.QLabel.c_str());  // 1\r
+\r
+               fprintf(f,\r
+                 "\t*"                                         // 2\r
+                 "\t*"                                         // 3\r
+                 "\t*"                                         // 4\r
+                 "\t*"                                         // 5\r
+                 "\t*"                                         // 6\r
+                 "\t*"                                         // 7\r
+                 "\t*"                                         // 8\r
+                 "\t*"                                         // 9\r
+                 "\t*"                                         // 10\r
+                 "\t*"                                         // 11\r
+                 "\t*"                                         // 12\r
+                 "\t*"                                         // 13\r
+                 "\t*"                                         // 14\r
+                 "\t*"                                         // 15\r
+                 "\tN"                                         // 16\r
+                 "\n"\r
+                 );\r
+               return;\r
+               }\r
+\r
+       fprintf(f, "%.4f", Hit.Score);          // 0\r
+\r
+       fputc('\t', f);\r
+       fputs(Hit.QLabel.c_str(), f);           // 1\r
+\r
+       fputc('\t', f);\r
+       fputs(Hit.ALabel.c_str(), f);           // 2\r
+\r
+       fputc('\t', f);\r
+       fputs(Hit.BLabel.c_str(), f);           // 3\r
+\r
+       fprintf(f, "\t%.1f", Hit.PctIdQM);      // 4\r
+       fprintf(f, "\t%.1f", Hit.PctIdQA);      // 5\r
+       fprintf(f, "\t%.1f", Hit.PctIdQB);      // 6\r
+       fprintf(f, "\t%.1f", Hit.PctIdAB);      // 7\r
+       fprintf(f, "\t%.1f", Hit.PctIdQT);      // 8\r
+\r
+       fprintf(f, "\t%u", Hit.CS_LY);          // 9\r
+       fprintf(f, "\t%u", Hit.CS_LN);          // 10\r
+       fprintf(f, "\t%u", Hit.CS_LA);          // 11\r
+\r
+       fprintf(f, "\t%u", Hit.CS_RY);          // 12\r
+       fprintf(f, "\t%u", Hit.CS_RN);          // 13\r
+       fprintf(f, "\t%u", Hit.CS_RA);          // 14\r
+\r
+       fprintf(f, "\t%.2f", Hit.Div);          // 15\r
+\r
+       fprintf(f, "\t%c", yon(Hit.Accept())); // 16\r
+       fputc('\n', f);\r
+       }\r
+\r
+unsigned GetUngappedLength(const byte *Seq, unsigned L)\r
+       {\r
+       unsigned UL = 0;\r
+       for (unsigned i = 0; i < L; ++i)\r
+               if (!isgap(Seq[i]))\r
+                       ++UL;\r
+       return UL;\r
+       }\r
+\r
+// Write a human-readable report for a chimeric hit to f: the three-way\r
+// alignment of query (Q3) and parents (A3, B3) in rows of up to 80 columns,\r
+// with Diffs/Votes/Model annotation rows under each, followed by two summary\r
+// lines (identities and vote counts).\r
+//\r
+// Nothing is written when f is null or Hit.Div <= 0.0. Columns before the\r
+// first and after the last ungapped query column are not shown. Columns in\r
+// [Hit.ColXLo, Hit.ColXHi] are the crossover region ('x' in the Model row);\r
+// columns to its left are labelled A and columns to its right B.\r
+//\r
+// If the query row contains no ungapped column, no alignment rows are\r
+// printed (previously FromCol/ToCol stayed UINT_MAX and the loops below read\r
+// past the end of the sequences).\r
+void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit)\r
+       {\r
+       if (f == 0)\r
+               return;\r
+\r
+       if (Hit.Div <= 0.0)\r
+               return;\r
+\r
+       const string &Q3 = Hit.Q3;\r
+       const string &A3 = Hit.A3;\r
+       const string &B3 = Hit.B3;\r
+\r
+       const byte *Q3Seq = (const byte *) Q3.c_str();\r
+       const byte *A3Seq = (const byte *) A3.c_str();\r
+       const byte *B3Seq = (const byte *) B3.c_str();\r
+\r
+// Aligned\r
+       unsigned ColCount = SIZE(Q3);\r
+       asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);\r
+\r
+       unsigned LQ = GetUngappedLength(Q3Seq, ColCount);\r
+       unsigned LA = GetUngappedLength(A3Seq, ColCount);\r
+       unsigned LB = GetUngappedLength(B3Seq, ColCount);\r
+\r
+       fprintf(f, "\n");\r
+       fprintf(f, "------------------------------------------------------------------------\n");\r
+       fprintf(f, "Query   (%5u nt) %s\n", LQ, Hit.QLabel.c_str());\r
+       fprintf(f, "ParentA (%5u nt) %s\n", LA, Hit.ALabel.c_str());\r
+       fprintf(f, "ParentB (%5u nt) %s\n", LB, Hit.BLabel.c_str());\r
+\r
+// Strip terminal gaps in query\r
+       unsigned FromCol = UINT_MAX;\r
+       unsigned ToCol = UINT_MAX;\r
+       for (unsigned Col = 0; Col < ColCount; ++Col)\r
+               {\r
+               if (!isgap(Q3Seq[Col]))\r
+                       {\r
+                       if (FromCol == UINT_MAX)\r
+                               FromCol = Col;\r
+                       ToCol = Col;\r
+                       }\r
+               }\r
+\r
+// True when the query row has no ungapped column (or is empty)\r
+       const bool QueryEmpty = (FromCol == UINT_MAX);\r
+\r
+// Advance the parent positions past the columns that are not displayed\r
+       unsigned QPos = 0;\r
+       unsigned APos = 0;\r
+       unsigned BPos = 0;\r
+       for (unsigned Col = 0; !QueryEmpty && Col < FromCol; ++Col)\r
+               {\r
+               if (!isgap(A3Seq[Col]))\r
+                       ++APos;\r
+               if (!isgap(B3Seq[Col]))\r
+                       ++BPos;\r
+               }\r
+\r
+       unsigned Range = QueryEmpty ? 0 : ToCol - FromCol + 1;\r
+       unsigned RowCount = (Range + 79)/80;\r
+       unsigned RowFromCol = FromCol;\r
+       for (unsigned RowIndex = 0; RowIndex < RowCount; ++RowIndex)\r
+               {\r
+               fprintf(f, "\n");\r
+               unsigned RowToCol = RowFromCol + 79;\r
+               if (RowToCol > ToCol)\r
+                       RowToCol = ToCol;\r
+\r
+       // A row (letters that differ from the query are shown in lower case)\r
+               fprintf(f, "A %5u ", APos + 1);\r
+               for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+                       {\r
+                       char q = Q3Seq[Col];\r
+                       char a = A3Seq[Col];\r
+                       if (a != q)\r
+                               a = tolower(a);\r
+                       fprintf(f, "%c", a);\r
+                       if (!isgap(a))\r
+                               ++APos;\r
+                       }\r
+               fprintf(f, " %u\n", APos);\r
+\r
+       // Q row\r
+               fprintf(f, "Q %5u ", QPos + 1);\r
+               for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+                       {\r
+                       char q = Q3Seq[Col];\r
+                       fprintf(f, "%c", q);\r
+                       if (!isgap(q))\r
+                               ++QPos;\r
+                       }\r
+               fprintf(f, " %u\n", QPos);\r
+\r
+       // B row (letters that differ from the query are shown in lower case)\r
+               fprintf(f, "B %5u ", BPos + 1);\r
+               for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+                       {\r
+                       char q = Q3Seq[Col];\r
+                       char b = B3Seq[Col];\r
+                       if (b != q)\r
+                               b = tolower(b);\r
+                       fprintf(f, "%c", b);\r
+                       if (!isgap(b))\r
+                               ++BPos;\r
+                       }\r
+               fprintf(f, " %u\n", BPos);\r
+\r
+       // Diffs row\r
+               fprintf(f, "Diffs   ");\r
+               for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+                       {\r
+                       char q = Q3Seq[Col];\r
+                       char a = A3Seq[Col];\r
+                       char b = B3Seq[Col];\r
+\r
+                       char c = ' ';\r
+                       if (isgap(q) || isgap(a) || isgap(b))\r
+                               c = ' ';\r
+                       else if (Col < Hit.ColXLo)\r
+                               {\r
+                               if (q == a && q == b)\r
+                                       c = ' ';\r
+                               else if (q == a && q != b)\r
+                                       c = 'A';\r
+                               else if (q == b && q != a)\r
+                                       c = 'b';\r
+                               else if (a == b && q != a)\r
+                                       c = 'N';\r
+                               else\r
+                                       c = '?';\r
+                               }\r
+                       else if (Col > Hit.ColXHi)\r
+                               {\r
+                               if (q == a && q == b)\r
+                                       c = ' ';\r
+                               else if (q == b && q != a)\r
+                                       c = 'B';\r
+                               else if (q == a && q != b)\r
+                                       c = 'a';\r
+                               else if (a == b && q != a)\r
+                                       c = 'N';\r
+                               else\r
+                                       c = '?';\r
+                               }\r
+\r
+                       fprintf(f, "%c", c);\r
+                       }\r
+               fprintf(f, "\n");\r
+\r
+       // Votes row (columns next to a gap in any sequence are left blank)\r
+               fprintf(f, "Votes   ");\r
+               for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+                       {\r
+                       char q = Q3Seq[Col];\r
+                       char a = A3Seq[Col];\r
+                       char b = B3Seq[Col];\r
+\r
+                       bool PrevGap = Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1]));\r
+                       bool NextGap = Col+1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1]));\r
+\r
+                       char c = ' ';\r
+                       if (isgap(q) || isgap(a) || isgap(b) || PrevGap || NextGap)\r
+                               c = ' ';\r
+                       else if (Col < Hit.ColXLo)\r
+                               {\r
+                               if (q == a && q == b)\r
+                                       c = ' ';\r
+                               else if (q == a && q != b)\r
+                                       c = '+';\r
+                               else if (q == b && q != a)\r
+                                       c = '!';\r
+                               else\r
+                                       c = '0';\r
+                               }\r
+                       else if (Col > Hit.ColXHi)\r
+                               {\r
+                               if (q == a && q == b)\r
+                                       c = ' ';\r
+                               else if (q == b && q != a)\r
+                                       c = '+';\r
+                               else if (q == a && q != b)\r
+                                       c = '!';\r
+                               else\r
+                                       c = '0';\r
+                               }\r
+\r
+                       fprintf(f, "%c", c);\r
+                       }\r
+               fprintf(f, "\n");\r
+\r
+       // Model row: A left of the crossover, x inside it, B to the right\r
+               fprintf(f, "Model   ");\r
+               for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+                       {\r
+                       if (Col < Hit.ColXLo)\r
+                               fprintf(f, "A");\r
+                       else if (Col >= Hit.ColXLo && Col <= Hit.ColXHi)\r
+                               fprintf(f, "x");\r
+                       else\r
+                               fprintf(f, "B");\r
+                       }\r
+\r
+               fprintf(f, "\n");\r
+\r
+               RowFromCol += 80;\r
+               }\r
+       fprintf(f, "\n");\r
+\r
+// Summary: relative gain of the model over the closer parent, in percent\r
+       double PctIdBestP = max(Hit.PctIdQA, Hit.PctIdQB);\r
+       double Div = (Hit.PctIdQM - PctIdBestP)*100.0/PctIdBestP;\r
+\r
+       unsigned LTot = Hit.CS_LY + Hit.CS_LN + Hit.CS_LA;\r
+       unsigned RTot = Hit.CS_RY + Hit.CS_RN + Hit.CS_RA;\r
+\r
+       double PctL = Pct(Hit.CS_LY, LTot);\r
+       double PctR = Pct(Hit.CS_RY, RTot);\r
+\r
+       fprintf(f,\r
+         "Ids.  QA %.1f%%, QB %.1f%%, AB %.1f%%, QModel %.1f%%, Div. %+.1f%%\n",\r
+         Hit.PctIdQA,\r
+         Hit.PctIdQB,\r
+         Hit.PctIdAB,\r
+         Hit.PctIdQM,\r
+         Div);\r
+\r
+       fprintf(f,\r
+         "Diffs Left %u: N %u, A %u, Y %u (%.1f%%); Right %u: N %u, A %u, Y %u (%.1f%%), Score %.4f\n",\r
+         LTot, Hit.CS_LN, Hit.CS_LA, Hit.CS_LY, PctL,\r
+         RTot, Hit.CS_RN, Hit.CS_RA, Hit.CS_RY, PctR,\r
+         Hit.Score);\r
+       }\r
diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp

index a404f79222ac83cf41d42c04ceb614df1e01dceb..dbdee2ad942db51860cb0a7f403662a895def1bf 100644 (file)
--- a/unifracunweightedcommand.cpp
+++ b/unifracunweightedcommand.cpp
@@ -8,6 +8,9 @@
   */
  
  #include "unifracunweightedcommand.h"
+#include "treereader.h"
+#include "subsample.h"
+#include "consensus.h"
  
  //**********************************************************************************************************************
  vector<string> UnifracUnweightedCommand::setParameters(){      
@@ -20,7 +23,9 @@ vector<string> UnifracUnweightedCommand::setParameters(){
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
                 CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
-               CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
+        CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
+        CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pconsensus);
+        CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
@@ -45,6 +50,8 @@ string UnifracUnweightedCommand::getHelpString(){
                 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
                 helpString += "The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n";
+        helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group. The subsample parameter may only be used with a group file.\n";
+        helpString += "The consensus parameter allows you to indicate you would like trees built from distance matrices created with the results of the subsampling, as well as a consensus tree built from these trees. Default=F.\n";
                 helpString += "Example unifrac.unweighted(groups=A-B-C, iters=500).\n";
                 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
                 helpString += "The unifrac.unweighted command output two files: .unweighted and .uwsummary their descriptions are in the manual.\n";
@@ -66,6 +73,7 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(){
                 outputTypes["uwsummary"] = tempOutNames;
                 outputTypes["phylip"] = tempOutNames;
                 outputTypes["column"] = tempOutNames;
+        outputTypes["tree"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
@@ -102,6 +110,7 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
                         outputTypes["uwsummary"] = tempOutNames;
                         outputTypes["phylip"] = tempOutNames;
                         outputTypes["column"] = tempOutNames;
+            outputTypes["tree"] = tempOutNames;
                         
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -133,13 +142,7 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
                                 }
                         }
                         
-                       m->runParse = true;
-                       m->clearGroups();
-                       m->clearAllGroups();
-                       m->Treenames.clear();
-                       m->names.clear();
-                       
-                       //check for required parameters
+            //check for required parameters
                         treefile = validParameter.validFile(parameters, "tree", true);
                         if (treefile == "not open") { abort = true; }
                         else if (treefile == "not found") {                             //if there is a current design file, use it
@@ -159,7 +162,7 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
                         
-                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
                         
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
@@ -190,6 +193,24 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
                         m->setProcessors(temp);
                         m->mothurConvert(temp, processors); 
                         
+            temp = validParameter.validFile(parameters, "subsample", false);           if (temp == "not found") { temp = "F"; }
+                       if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+            else {  
+                if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; }  //we will set it to smallest group later 
+                else { subsample = false; }
+            }
+                       
+            if (!subsample) { subsampleIters = 0;   }
+            else { subsampleIters = iters;          }
+            
+            temp = validParameter.validFile(parameters, "consensus", false);                                   if (temp == "not found") { temp = "F"; }
+                       consensus = m->isTrue(temp);
+            
+                       if (subsample && random) {  m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true;  } 
+                       if (subsample && (groupfile == "")) {  m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true;  } 
+            if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
+            if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
+
                         if (!random) {  iters = 0;  } //turn off random calcs
                         
                         //if user selects distance = true and no groups it won't calc the pairwise
@@ -220,91 +241,60 @@ int UnifracUnweightedCommand::execute() {
                 
                 m->setTreeFile(treefile);
                 
-               if (groupfile != "") {
-                       //read in group map info.
-                       tmap = new TreeMap(groupfile);
-                       tmap->readMap();
-               }else{ //fake out by putting everyone in one group
-                       Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
-                       tmap = new TreeMap();
-                       
-                       for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
-               }
-               
-               if (namefile != "") { readNamesFile(); }
-               
-               read = new ReadNewickTree(treefile);
-               int readOk = read->read(tmap); 
-               
-               if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-               
-               read->AssembleTrees();
-               T = read->getTrees();
-               delete read;
-               
-               //make sure all files match
-               //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
-               int numNamesInTree;
-               if (namefile != "")  {  
-                       if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
-                       else {   numNamesInTree = m->Treenames.size();  }
-               }else {  numNamesInTree = m->Treenames.size();  }
-               
-               
-               //output any names that are in group file but not in tree
-               if (numNamesInTree < tmap->getNumSeqs()) {
-                       for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
-                               //is that name in the tree?
-                               int count = 0;
-                               for (int j = 0; j < m->Treenames.size(); j++) {
-                                       if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
-                                       count++;
-                               }
-                               
-                               if (m->control_pressed) { 
-                                       delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
-                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                       m->clearGroups();
-                                       return 0;
-                               }
-                               
-                               //then you did not find it so report it 
-                               if (count == m->Treenames.size()) { 
-                                       //if it is in your namefile then don't remove
-                                       map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-                                       
-                                       if (it == nameMap.end()) {
-                                               m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
-                                               tmap->removeSeq(tmap->namesOfSeqs[i]);
-                                               i--; //need this because removeSeq removes name from namesOfSeqs
-                                       }
-                               }
-                       }
-               }
-       
+               TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+        T = reader->getTrees();
+        tmap = T[0]->getTreeMap();
+        map<string, string> nameMap = reader->getNames();
+        delete reader; 
+        
                 sumFile = outputDir + m->getSimpleName(treefile) + ".uwsummary";
                 outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile);
                 m->openOutputFile(sumFile, outSum);
                 
-               util = new SharedUtil();
+               SharedUtil util;
                 Groups = m->getGroups();
                 vector<string> namesGroups = tmap->getNamesOfGroups();
-               util->setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted");       //sets the groups the user wants to analyze
-               util->getCombos(groupComb, Groups, numComp);
-               m->setGroups(Groups);
-               delete util;
-       
-               if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
+               util.setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted");        //sets the groups the user wants to analyze
                 
-               unweighted = new Unweighted(tmap, includeRoot);
+               Unweighted unweighted(includeRoot);
                 
                 int start = time(NULL);
-               
-               userData.resize(numComp,0);  //data[0] = unweightedscore 
-               randomData.resize(numComp,0); //data[0] = unweightedscore
-               //create new tree with same num nodes and leaves as users
-               
+        
+        //set or check size
+        if (subsample) {
+            //user has not set size, set size = smallest samples size
+            if (subsampleSize == -1) { 
+                vector<string> temp; temp.push_back(Groups[0]);
+                subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group
+                for (int i = 1; i < Groups.size(); i++) {
+                    temp.clear(); temp.push_back(Groups[i]);
+                    int thisSize = (tmap->getNamesSeqs(temp)).size();
+                    if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
+                }
+                m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
+            }else { //eliminate any too small groups
+                vector<string> newGroups = Groups;
+                Groups.clear();
+                for (int i = 0; i < newGroups.size(); i++) {
+                    vector<string> thisGroup; thisGroup.push_back(newGroups[i]);
+                    vector<string> thisGroupsSeqs = tmap->getNamesSeqs(thisGroup);
+                    int thisSize = thisGroupsSeqs.size();
+                    
+                    if (thisSize >= subsampleSize) {    Groups.push_back(newGroups[i]);        }
+                    else {  m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
+                } 
+                m->setGroups(Groups);
+            }
+        }
+               
+        util.getCombos(groupComb, Groups, numComp);
+               m->setGroups(Groups);
+        
+               if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
+        
                 if (numComp < processors) { processors = numComp;  }
+        
+        if (consensus && (numComp < 2)) { m->mothurOut("consensus can only be used with numComparisions greater than 1, setting consensus=f.\n"); consensus=false; }
                 
                 outSum << "Tree#" << '\t' << "Groups" << '\t'  <<  "UWScore" <<'\t';
                 m->mothurOut("Tree#\tGroups\tUWScore\t");
@@ -313,13 +303,7 @@ int UnifracUnweightedCommand::execute() {
          
                 //get pscores for users trees
                 for (int i = 0; i < T.size(); i++) {
-                       if (m->control_pressed) { 
-                               delete tmap; delete unweighted;
-                               for (int i = 0; i < T.size(); i++) { delete T[i]; }
-                               outSum.close();
-                               for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  }
-                               return 0; 
-                       }
+                       if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0; }
                         
                         counter = 0;
                         
@@ -335,11 +319,12 @@ int UnifracUnweightedCommand::execute() {
                         rCumul.resize(numComp);  
                         utreeScores.resize(numComp);  
                         UWScoreSig.resize(numComp); 
+            
+            vector<double> userData; userData.resize(numComp,0);  //weighted score info for user tree. data[0] = weightedscore AB, data[1] = weightedscore AC...
  
-                       userData = unweighted->getValues(T[i], processors, outputDir);  //userData[0] = unweightedscore
+                       userData = unweighted.getValues(T[i], processors, outputDir);  //userData[0] = unweightedscore
                 
-                       if (m->control_pressed) { delete tmap; delete unweighted;
-                               for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close();  for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  }return 0; }
+                       if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close();  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  }return 0; }
                         
                         //output scores for each combination
                         for(int k = 0; k < numComp; k++) {
@@ -348,56 +333,47 @@ int UnifracUnweightedCommand::execute() {
                                 
                                 //add users score to validscores
                                 validScores[userData[k]] = userData[k];
+                
+                if (!random) { UWScoreSig[k].push_back(0.0);   }
                         }
-               
-                       //get unweighted scores for random trees - if random is false iters = 0
-                       for (int j = 0; j < iters; j++) {
-               
-                               //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
-                               randomData = unweighted->getValues(T[i], "", "", processors, outputDir);
-                               
-                               if (m->control_pressed) { delete tmap; delete unweighted;
-                                       for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  } return 0; }
-                       
-                               for(int k = 0; k < numComp; k++) {      
-                                       //add trees unweighted score to map of scores
-                                       map<float,float>::iterator it = rscoreFreq[k].find(randomData[k]);
-                                       if (it != rscoreFreq[k].end()) {//already have that score
-                                               rscoreFreq[k][randomData[k]]++;
-                                       }else{//first time we have seen this score
-                                               rscoreFreq[k][randomData[k]] = 1;
-                                       }
-                               
-                                       //add randoms score to validscores
-                                       validScores[randomData[k]] = randomData[k];
-                               }
-                               
-                               //report progress
-//                             m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();  
-                       }
-       
-                       for(int a = 0; a < numComp; a++) {
-                               float rcumul = 1.0000;
-                               
-                               if (random) {
-                                       //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
-                                       for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) { 
-                                               //make rscoreFreq map and rCumul
-                                               map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
-                                               rCumul[a][it->first] = rcumul;
-                                               //get percentage of random trees with that info
-                                               if (it2 != rscoreFreq[a].end()) {  rscoreFreq[a][it->first] /= iters; rcumul-= it2->second;  }
-                                               else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score
-                                       }
-                                       UWScoreSig[a].push_back(rCumul[a][userData[a]]);
-                               }else           {       UWScoreSig[a].push_back(0.0);                                           }
-       
-                       }
-                       
-                       if (m->control_pressed) { delete tmap; delete unweighted;
-                               for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  } return 0;  }
-                       
-                       //print output files
+            
+            if (random) {  runRandomCalcs(T[i], userData);  }
+                       
+                       if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0;  }
+            
+            //subsample loop
+            vector< vector<double> > calcDistsTotals;  //each iter, each groupCombos dists. this will be used to make .dist files
+            for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
+                
+                if (m->control_pressed) { break; }
+                
+                //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
+                TreeMap* newTmap = new TreeMap();
+                newTmap->getCopy(*tmap);
+                
+                SubSample sample;
+                Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
+                
+                //call new weighted function
+                vector<double> iterData; iterData.resize(numComp,0);
+                Unweighted thisUnweighted(includeRoot);
+                iterData = thisUnweighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore
+                
+                //save data to make ave dist, std dist
+                calcDistsTotals.push_back(iterData);
+                
+                delete newTmap;
+                delete subSampleTree;
+                
+                if((thisIter+1) % 100 == 0){   m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine();              }
+            }
+            
+            if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0;  }
+
+            if (subsample) {  getAverageSTDMatrices(calcDistsTotals, i); }
+            if (consensus) {  getConsensusTrees(calcDistsTotals, i);  }
+            
+            //print output files
                         printUWSummaryFile(i);
                         if (random)  {  printUnweightedFile();  delete output;  }
                         if (phylip) {   createPhylipFile(i);            }
@@ -411,8 +387,7 @@ int UnifracUnweightedCommand::execute() {
                 
  
                 outSum.close();
-               m->clearGroups();
-               delete tmap; delete unweighted;
+               delete tmap; 
                 for (int i = 0; i < T.size(); i++) { delete T[i]; }
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  }     return 0; }
@@ -445,6 +420,273 @@ int UnifracUnweightedCommand::execute() {
                 exit(1);
         }
  }
+/**************************************************************************************************/
+int UnifracUnweightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
+       try {
+        //writes the mean (.unweighted.ave.dist) and standard deviation (.unweighted.std.dist) of each group comparison over the subsample iterations; dists[iteration][comparison], treeNum+1 is used in the file names. Returns 0.
+        
+        //sum each comparison's distance over all subsample iterations
+        vector<double> averages; averages.resize(numComp, 0); 
+        for (int thisIter = 0; thisIter < subsampleIters; thisIter++) {
+            for (int i = 0; i < dists[thisIter].size(); i++) {  
+                averages[i] += dists[thisIter][i];
+            }
+        }
+        
+        //turn the sums into averages
+        for (int i = 0; i < averages.size(); i++) {  averages[i] /= (float) subsampleIters; }
+        
+        //find standard deviation (divides by the number of iterations, i.e. population form)
+        vector<double> stdDev; stdDev.resize(numComp, 0);
+        //must loop to subsampleIters: the constructor zeroes iters when random=f (required with subsample), so looping to iters left every deviation at 0
+        for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+            for (int j = 0; j < dists[thisIter].size(); j++) {
+                stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j]));
+            }
+        }
+        for (int i = 0; i < stdDev.size(); i++) {  
+            stdDev[i] /= (float) subsampleIters; 
+            stdDev[i] = sqrt(stdDev[i]);
+        }
+        
+        //square matrix of average distances, one row/column per group
+        vector< vector<double> > avedists;     avedists.resize(m->getNumGroups());
+        for (int i = 0; i < m->getNumGroups(); i++) {
+            avedists[i].resize(m->getNumGroups(), 0.0);
+        }
+        
+        //square matrix of standard deviations, one row/column per group
+        vector< vector<double> > stddists;     stddists.resize(m->getNumGroups());
+        for (int i = 0; i < m->getNumGroups(); i++) {
+            stddists[i].resize(m->getNumGroups(), 0.0);
+        }
+        
+        //flip it so you can print it: comparison number count fills cell (r,l) and its mirror (l,r), walking the lower triangle row by row
+        int count = 0;
+        for (int r=0; r<m->getNumGroups(); r++) { 
+            for (int l = 0; l < r; l++) {
+                avedists[r][l] = averages[count];
+                avedists[l][r] = averages[count];
+                stddists[r][l] = stdDev[count];
+                stddists[l][r] = stdDev[count];
+                count++;
+            }
+        }
+        
+        string aveFileName = outputDir + m->getSimpleName(treefile)  + toString(treeNum+1) + ".unweighted.ave.dist";
+        outputNames.push_back(aveFileName); outputTypes["phylip"].push_back(aveFileName); 
+        
+        ofstream out;
+        m->openOutputFile(aveFileName, out);
+        
+        string stdFileName = outputDir + m->getSimpleName(treefile)  + toString(treeNum+1) + ".unweighted.std.dist";
+        outputNames.push_back(stdFileName); outputTypes["phylip"].push_back(stdFileName); 
+        
+        ofstream outStd;
+        m->openOutputFile(stdFileName, outStd);
+        
+        if ((outputForm == "lt") || (outputForm == "square")) {
+            //matrix formats start with the number of groups
+            out << m->getNumGroups() << endl;
+            outStd << m->getNumGroups() << endl;
+        }
+        
+        //output to file
+        for (int r=0; r<m->getNumGroups(); r++) { 
+            //output name
+            string name = (m->getGroups())[r];
+            if (name.length() < 10) { //pad with spaces to make compatible
+                while (name.length() < 10) {  name += " ";  }
+            }
+            
+            if (outputForm == "lt") {
+                out << name << '\t';
+                outStd << name << '\t';
+                
+                //lower triangle: only the cells left of the diagonal
+                for (int l = 0; l < r; l++) {  out  << avedists[r][l] << '\t';  outStd  << stddists[r][l] << '\t';}
+                out << endl;  outStd << endl;
+            }else if (outputForm == "square") {
+                out << name << '\t';
+                outStd << name << '\t';
+                
+                //square: the full row
+                for (int l = 0; l < m->getNumGroups(); l++) {  out  << avedists[r][l] << '\t'; outStd  << stddists[r][l] << '\t'; }
+                out << endl; outStd << endl;
+            }else{
+                //any other outputForm: one "name otherName value" line per pair
+                for (int l = 0; l < r; l++) {  
+                    string otherName = (m->getGroups())[l];
+                    if (otherName.length() < 10) { //pad with spaces to make compatible
+                        while (otherName.length() < 10) {  otherName += " ";  }
+                    }
+                    
+                    out  << name << '\t' << otherName << avedists[r][l] << endl;  
+                    outStd  << name << '\t' << otherName << stddists[r][l] << endl; 
+                }
+            }
+        }
+        out.close();
+        outStd.close();
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "UnifracUnweightedCommand", "getAverageSTDMatrices");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+int UnifracUnweightedCommand::getConsensusTrees(vector< vector<double> >& dists, int treeNum) {
+       try {
+        //builds one tree per row of dists (via buildTrees), asks Consensus for their consensus tree and prints it to a .unweighted.cons.tre file numbered treeNum+1. Returns 0.
+        //used in tree constructor 
+        m->runParse = false;
+        
+        //create treemap class from groupmap for tree class to use
+        TreeMap newTmap;
+        newTmap.makeSim(m->getGroups());
+        
+        //clear  old tree names if any
+        m->Treenames.clear();
+        
+        //the tree names become the current group names
+        m->Treenames = m->getGroups();
+        
+        vector<Tree*> newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created
+        
+        if (m->control_pressed) { return 0; } //buildTrees has already deleted its trees and removed its file in this case
+        
+        Consensus con;
+        Tree* conTree = con.getTree(newTrees); //NOTE(review): newTrees is never deleted in this function - confirm whether Consensus::getTree takes ownership
+        
+        //create a new filename
+        string conFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".unweighted.cons.tre";                                
+        outputNames.push_back(conFile); outputTypes["tree"].push_back(conFile); 
+        ofstream outTree;
+        m->openOutputFile(conFile, outTree);
+        
+        if (conTree != NULL) { conTree->print(outTree, "boot"); delete conTree; } //when conTree is NULL the file is still opened and closed, with nothing printed to it
+        outTree.close();
+        
+        return 0;
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "UnifracUnweightedCommand", "getConsensusTrees");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+vector<Tree*> UnifracUnweightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, TreeMap& mytmap) {
+       try {
+        //turns each row of dists into a similarity matrix, builds a Tree from it with mytmap, prints every tree to one .unweighted.all.tre file and returns the trees (allocated with new, not freed here unless interrupted)
+        vector<Tree*> trees;
+        
+        //create a new filename
+        string outputFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".unweighted.all.tre";                              
+        outputNames.push_back(outputFile); outputTypes["tree"].push_back(outputFile); 
+        
+        ofstream outAll;
+        m->openOutputFile(outputFile, outAll);
+        
+        
+        for (int i = 0; i < dists.size(); i++) { //dists[0] are the dists for the first subsampled tree.
+            
+            if (m->control_pressed) { break; }
+            
+            //make matrix with scores in it
+            vector< vector<double> > sims;     sims.resize(m->getNumGroups());
+            for (int j = 0; j < m->getNumGroups(); j++) {
+                sims[j].resize(m->getNumGroups(), 0.0);
+            }
+            
+            int count = 0; //position in dists[i]; the loops below consume it in lower-triangle order (row r, columns l < r)
+                       for (int r=0; r<m->getNumGroups(); r++) { 
+                               for (int l = 0; l < r; l++) {
+                    double sim = -(dists[i][count]-1.0); //similarity = 1 - distance
+                                       sims[r][l] = sim;
+                                       sims[l][r] = sim;
+                                       count++;
+                               }
+                       }
+            
+            //create tree
+            Tree* tempTree = new Tree(&mytmap, sims);
+            map<string, string> empty; //assembleTree is given an empty map here
+            tempTree->assembleTree(empty);
+            
+            trees.push_back(tempTree);
+            
+            //print tree
+            tempTree->print(outAll);
+        }
+        
+        outAll.close();
+        
+        if (m->control_pressed) {  for (int i = 0; i < trees.size(); i++) {  delete trees[i]; trees[i] = NULL; } m->mothurRemove(outputFile); } //on interrupt: free the trees (the returned vector then holds NULL entries) and remove the partial file
+        
+        return trees;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "UnifracUnweightedCommand", "buildTrees");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+int UnifracUnweightedCommand::runRandomCalcs(Tree* thisTree, vector<double> usersScores) {
+       try { //scores thisTree iters times with the randomizing getValues overload, tallies the scores in rscoreFreq/validScores, fills rCumul and appends each comparison's cumulative value at the user's score to UWScoreSig. Returns 0, also on interrupt.
+        vector<double> randomData; randomData.resize(numComp,0); //unweighted score info for random trees. data[0] = unweighted score of the first comparison, data[1] = the second...
+        
+        Unweighted unweighted(includeRoot);
+        
+        //get unweighted scores for random trees - if random is false iters = 0
+        for (int j = 0; j < iters; j++) {
+            
+            //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
+            randomData = unweighted.getValues(thisTree, "", "", processors, outputDir);
+            
+            if (m->control_pressed) { return 0; }
+                       
+            for(int k = 0; k < numComp; k++) { 
+                //add trees unweighted score to map of scores
+                map<float,float>::iterator it = rscoreFreq[k].find(randomData[k]);
+                if (it != rscoreFreq[k].end()) {//already have that score
+                    rscoreFreq[k][randomData[k]]++;
+                }else{//first time we have seen this score
+                    rscoreFreq[k][randomData[k]] = 1;
+                }
+                               
+                //add randoms score to validscores
+                validScores[randomData[k]] = randomData[k];
+            }
+        }
+        
+        for(int a = 0; a < numComp; a++) {
+            float rcumul = 1.0000; //starts at 1 and drops by each score's frequency as validScores is walked in key order
+    
+            //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
+            for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) { 
+                //make rscoreFreq map and rCumul
+                map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
+                rCumul[a][it->first] = rcumul; //stored before this score's own frequency is subtracted
+                //get percentage of random trees with that info
+                if (it2 != rscoreFreq[a].end()) {  rscoreFreq[a][it->first] /= iters; rcumul-= it2->second;  }
+                else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score
+            }
+            UWScoreSig[a].push_back(rCumul[a][usersScores[a]]); //cumulative value at the user's score for comparison a
+        }
+        
+        return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "UnifracUnweightedCommand", "runRandomCalcs");
+               exit(1);
+       }
+}
  /***********************************************************/
  void UnifracUnweightedCommand::printUnweightedFile() {
         try {
@@ -581,45 +823,6 @@ void UnifracUnweightedCommand::createPhylipFile(int i) {
                 m->errorOut(e, "UnifracUnweightedCommand", "createPhylipFile");
                 exit(1);
         }
-}/*****************************************************************/
-int UnifracUnweightedCommand::readNamesFile() {
-       try {
-               m->names.clear();
-               numUniquesInName = 0;
-               
-               ifstream in;
-               m->openInputFile(namefile, in);
-               
-               string first, second;
-               map<string, string>::iterator itNames;
-               
-               while(!in.eof()) {
-                       in >> first >> second; m->gobble(in);
-                       
-                       numUniquesInName++;
-                       
-                       itNames = m->names.find(first);
-                       if (itNames == m->names.end()) {  
-                               m->names[first] = second; 
-                               
-                               //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
-                               vector<string> dupNames;
-                               m->splitAtComma(second, dupNames);
-                               
-                               for (int i = 0; i < dupNames.size(); i++) {     
-                                       nameMap[dupNames[i]] = dupNames[i]; 
-                                       if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
-                               }
-                       }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
-               }
-               in.close();
-               
-               return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "UnifracUnweightedCommand", "readNamesFile");
-               exit(1);
-       }
  }
  /***********************************************************/
  
diff --git a/unifracunweightedcommand.h b/unifracunweightedcommand.h

index cd8d51dcad76a44dbc0ca064bd79a5ef3e0fbe80..fd39ae43852bee7ad970d77035fa839c50f62422 100644 (file)
--- a/unifracunweightedcommand.h
+++ b/unifracunweightedcommand.h
@@ -36,36 +36,32 @@ class UnifracUnweightedCommand : public Command {
         
         
         private:
-               ReadTree* read;
-               SharedUtil* util;
                 FileOutput* output;
                 vector<Tree*> T;           //user trees
                 TreeMap* tmap;
-               Unweighted* unweighted;
                 string sumFile, allGroups;
                 vector<string> groupComb; // AB. AC, BC...
-               int iters, numGroups, numComp, counter, processors, numUniquesInName;
-               EstOutput userData;                     //unweighted score info for user tree
-               EstOutput randomData;           //unweighted score info for random trees
+               int iters, numGroups, numComp, counter, processors, subsampleSize, subsampleIters;
                 vector< vector<float> > utreeScores; //scores for users trees for each comb.
                 vector< vector<float> > UWScoreSig;  //tree score signifigance when compared to random trees - percentage of random trees with that score or higher.
                 map<float, float>  validScores;  //map contains scores from random
                 vector< map<float, float> > rscoreFreq;  //map <unweighted score, number of random trees with that score.> -vector entry for each combination.
                 vector< map<float, float> > rCumul;  //map <unweighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each combination.
                 
-               bool abort, phylip, random, includeRoot;
+               bool abort, phylip, random, includeRoot, consensus, subsample;
                 string groups, itersString, outputDir, outputForm, treefile, groupfile, namefile;
                 vector<string> Groups, outputNames; //holds groups to be used
  
                 ofstream outSum, out;
                 ifstream inFile;
-               map<string, string> nameMap;
                 
+        int runRandomCalcs(Tree*, vector<double>);
                 void printUWSummaryFile(int);
                 void printUnweightedFile();
                 void createPhylipFile(int);
-               int readNamesFile();
-                
+        vector<Tree*> buildTrees(vector< vector<double> >&, int, TreeMap&);
+        int getConsensusTrees(vector< vector<double> >&, int);
+        int getAverageSTDMatrices(vector< vector<double> >&, int);
                 
  };
  
diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp

index b3a54c929ace3221fe3ba736a3b5fad7f87818f9..633cb643a213044e4f7e92c108ad012690deff09 100644 (file)
--- a/unifracweightedcommand.cpp
+++ b/unifracweightedcommand.cpp
@@ -8,6 +8,9 @@
   */
  
  #include "unifracweightedcommand.h"
+#include "consensus.h"
+#include "subsample.h"
+#include "treereader.h"
  
  //**********************************************************************************************************************
  vector<string> UnifracWeightedCommand::setParameters(){        
@@ -18,7 +21,9 @@ vector<string> UnifracWeightedCommand::setParameters(){
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
-               CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
+        CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
+        CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pconsensus);
+        CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
                 CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
                 CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
@@ -37,14 +42,16 @@ vector<string> UnifracWeightedCommand::setParameters(){
  string UnifracWeightedCommand::getHelpString(){        
         try {
                 string helpString = "";
-               helpString += "The unifrac.weighted command parameters are tree, group, name, groups, iters, distance, processors, root and random.  tree parameter is required unless you have valid current tree file.\n";
+               helpString += "The unifrac.weighted command parameters are tree, group, name, groups, iters, distance, processors, root, subsample, consensus and random.  tree parameter is required unless you have valid current tree file.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 2 valid groups.\n";
                 helpString += "The group names are separated by dashes.  The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
                 helpString += "The distance parameter allows you to create a distance file from the results. The default is false.\n";
                 helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning don't compare your trees with randomly generated trees.\n";
                 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
-               helpString += "The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n";
+        helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group. The subsample parameter may only be used with a group file.\n";
+        helpString += "The consensus parameter allows you to indicate you would like trees built from distance matrices created with the results, as well as a consensus tree built from these trees. Default=F.\n";
+        helpString += "The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n";
                 helpString += "Example unifrac.weighted(groups=A-B-C, iters=500).\n";
                 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
                 helpString += "The unifrac.weighted command output two files: .weighted and .wsummary their descriptions are in the manual.\n";
@@ -66,6 +73,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(){
                 outputTypes["wsummary"] = tempOutNames;
                 outputTypes["phylip"] = tempOutNames;
                 outputTypes["column"] = tempOutNames;
+        outputTypes["tree"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
@@ -102,6 +110,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) {
                         outputTypes["wsummary"] = tempOutNames;
                         outputTypes["phylip"] = tempOutNames;
                         outputTypes["column"] = tempOutNames;
+            outputTypes["tree"] = tempOutNames;
                         
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -133,12 +142,6 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) {
                                 }
                         }
                         
-                       m->runParse = true;
-                       m->clearGroups();
-                       m->clearAllGroups();
-                       m->Treenames.clear();
-                       m->names.clear();
-                       
                         //check for required parameters
                         treefile = validParameter.validFile(parameters, "tree", true);
                         if (treefile == "not open") { treefile = ""; abort = true; }
@@ -159,7 +162,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) {
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
                         
-                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
                         
                                                                                                                                         
                         //check for optional parameter and set defaults
@@ -190,9 +193,25 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) {
                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
                         m->setProcessors(temp);
                         m->mothurConvert(temp, processors);
-                       
-                       if (!random) {  iters = 0;  } //turn off random calcs
-                       
+            
+            temp = validParameter.validFile(parameters, "subsample", false);           if (temp == "not found") { temp = "F"; }
+                       if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+            else {  
+                if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; }  //we will set it to smallest group later 
+                else { subsample = false; }
+            }
+                       
+            if (!subsample) { subsampleIters = 0;   }
+            else { subsampleIters = iters;          }
+            
+            temp = validParameter.validFile(parameters, "consensus", false);                                   if (temp == "not found") { temp = "F"; }
+                       consensus = m->isTrue(temp);
+            
+                       if (subsample && random) {  m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true;  } 
+                       if (subsample && (groupfile == "")) {  m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true;  } 
+            if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
+            if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
+            
                         if (namefile == "") {
                                 vector<string> files; files.push_back(treefile);
                                 parser.getNameFile(files);
@@ -214,218 +233,147 @@ int UnifracWeightedCommand::execute() {
                 
                 m->setTreeFile(treefile);
                 
-               if (groupfile != "") {
-                       //read in group map info.
-                       tmap = new TreeMap(groupfile);
-                       tmap->readMap();
-               }else{ //fake out by putting everyone in one group
-                       Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
-                       tmap = new TreeMap();
-                       
-                       for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
-               }
-               
-               if (namefile != "") { readNamesFile(); }
-               
-               read = new ReadNewickTree(treefile);
-               int readOk = read->read(tmap); 
-               
-               if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-               
-               read->AssembleTrees();
-               T = read->getTrees();
-               delete read;
-               
-               //make sure all files match
-               //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
-               int numNamesInTree;
-               if (namefile != "")  {  
-                       if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
-                       else {   numNamesInTree = m->Treenames.size();  }
-               }else {  numNamesInTree = m->Treenames.size();  }
-               
-               
-               //output any names that are in group file but not in tree
-               if (numNamesInTree < tmap->getNumSeqs()) {
-                       for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
-                               //is that name in the tree?
-                               int count = 0;
-                               for (int j = 0; j < m->Treenames.size(); j++) {
-                                       if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
-                                       count++;
-                               }
-                               
-                               if (m->control_pressed) { 
-                                       delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
-                                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                       m->clearGroups();
-                                       return 0;
-                               }
+        TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+        T = reader->getTrees();
+        tmap = T[0]->getTreeMap();
+        map<string, string> nameMap = reader->getNames();
+        delete reader;
+    
+        if (m->control_pressed) {  delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
                                 
-                               //then you did not find it so report it 
-                               if (count == m->Treenames.size()) { 
-                                       //if it is in your namefile then don't remove
-                                       map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-                                       
-                                       if (it == nameMap.end()) {
-                                               m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
-                                               tmap->removeSeq(tmap->namesOfSeqs[i]);
-                                               i--; //need this because removeSeq removes name from namesOfSeqs
-                                       }
-                               }
-                       }
-               }
-               
                 sumFile = outputDir + m->getSimpleName(treefile) + ".wsummary";
                 m->openOutputFile(sumFile, outSum);
                 outputNames.push_back(sumFile);  outputTypes["wsummary"].push_back(sumFile);
-                       
-               util = new SharedUtil();
+               
+        SharedUtil util;
                 string s; //to make work with setgroups
                 Groups = m->getGroups();
                 vector<string> nameGroups = tmap->getNamesOfGroups();
-               util->setGroups(Groups, nameGroups, s, numGroups, "weighted");  //sets the groups the user wants to analyze
-               util->getCombos(groupComb, Groups, numComp);
+               util.setGroups(Groups, nameGroups, s, numGroups, "weighted");   //sets the groups the user wants to analyze
                 m->setGroups(Groups);
-               delete util;
                 
-               weighted = new Weighted(tmap, includeRoot);
+        if (m->control_pressed) {  delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
+        
+               Weighted weighted(includeRoot);
                         
                 int start = time(NULL);
-               
-               //get weighted for users tree
-               userData.resize(numComp,0);  //data[0] = weightedscore AB, data[1] = weightedscore AC...
-               randomData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC...
-               
-               if (numComp < processors) { processors = numComp; }
-                               
-               //get weighted scores for users trees
-               for (int i = 0; i < T.size(); i++) {
-                       
-                       if (m->control_pressed) { delete tmap; delete weighted;
-                               for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } return 0; }
-
-                       counter = 0;
-                       rScores.resize(numComp);  //data[0] = weightedscore AB, data[1] = weightedscore AC...
-                       uScores.resize(numComp);  //data[0] = weightedscore AB, data[1] = weightedscore AC...
-                       
-                       if (random) {  
-                               output = new ColumnFile(outputDir + m->getSimpleName(treefile)  + toString(i+1) + ".weighted", itersString);  
-                               outputNames.push_back(outputDir + m->getSimpleName(treefile)  + toString(i+1) + ".weighted");
-                               outputTypes["weighted"].push_back(outputDir + m->getSimpleName(treefile)  + toString(i+1) + ".weighted");
-                       } 
-
-                       userData = weighted->getValues(T[i], processors, outputDir);  //userData[0] = weightedscore
-                       
-                       if (m->control_pressed) { delete tmap; delete weighted;
-                               for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  } return 0; }
-                       
-                       //save users score
-                       for (int s=0; s<numComp; s++) {
-                               //add users score to vector of user scores
-                               uScores[s].push_back(userData[s]);
-                               
-                               //save users tree score for summary file
-                               utreeScores.push_back(userData[s]);
-                       }
-                       
-                       if (random) { 
-                       
-                               //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
-                               vector< vector<string> > namesOfGroupCombos;
-                               for (int a=0; a<numGroups; a++) { 
-                                       for (int l = 0; l < a; l++) {   
-                                               vector<string> groups; groups.push_back((m->getGroups())[a]); groups.push_back((m->getGroups())[l]);
-                                               namesOfGroupCombos.push_back(groups);
-                                       }
-                               }
-                               
-                               lines.clear();
-                               
-                               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-                                       if(processors != 1){
-                                               int numPairs = namesOfGroupCombos.size();
-                                               int numPairsPerProcessor = numPairs / processors;
-                                       
-                                               for (int i = 0; i < processors; i++) {
-                                                       int startPos = i * numPairsPerProcessor;
-                                                       if(i == processors - 1){
-                                                               numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
-                                                       }
-                                                       lines.push_back(linePair(startPos, numPairsPerProcessor));
-                                               }
-                                       }
-                               #endif
-
-                               
-                               //get scores for random trees
-                               for (int j = 0; j < iters; j++) {
-                               
-                                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-                                               if(processors == 1){
-                                                       driver(T[i],  namesOfGroupCombos, 0, namesOfGroupCombos.size(),  rScores);
-                                               }else{
-                                                       createProcesses(T[i],  namesOfGroupCombos, rScores);
-                                               }
-                                       #else
-                                               driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
-                                       #endif
-                                       
-                                       if (m->control_pressed) { delete tmap; delete weighted;
-                                               for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  } return 0; }
-                                       
-                                       //report progress
-//                                     m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();          
-                               }
-                               lines.clear();
-                       
-                               //find the signifigance of the score for summary file
-                               for (int f = 0; f < numComp; f++) {
-                                       //sort random scores
-                                       sort(rScores[f].begin(), rScores[f].end());
-                                       
-                                       //the index of the score higher than yours is returned 
-                                       //so if you have 1000 random trees the index returned is 100 
-                                       //then there are 900 trees with a score greater then you. 
-                                       //giving you a signifigance of 0.900
-                                       int index = findIndex(userData[f], f);    if (index == -1) { m->mothurOut("error in UnifracWeightedCommand"); m->mothurOutEndLine(); exit(1); } //error code
-                                       
-                                       //the signifigance is the number of trees with the users score or higher 
-                                       WScoreSig.push_back((iters-index)/(float)iters);
-                               }
-                               
-                               //out << "Tree# " << i << endl;
-                               calculateFreqsCumuls();
-                               printWeightedFile();
-                               
-                               delete output;
-                       
-                       }
-                       
-                       //clear data
-                       rScores.clear();
-                       uScores.clear();
-                       validScores.clear();
-               }
-               
-               
-               if (m->control_pressed) { delete tmap; delete weighted;
-                       for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  } return 0;  }
-               
+            
+        //set or check size
+        if (subsample) {
+            //user has not set size, set size = smallest samples size
+            if (subsampleSize == -1) { 
+                vector<string> temp; temp.push_back(Groups[0]);
+                subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group
+                for (int i = 1; i < Groups.size(); i++) {
+                    temp.clear(); temp.push_back(Groups[i]);
+                    int thisSize = (tmap->getNamesSeqs(temp)).size();
+                    if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
+                }
+                m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
+            }else { //eliminate any too small groups
+                vector<string> newGroups = Groups;
+                Groups.clear();
+                for (int i = 0; i < newGroups.size(); i++) {
+                    vector<string> thisGroup; thisGroup.push_back(newGroups[i]);
+                    vector<string> thisGroupsSeqs = tmap->getNamesSeqs(thisGroup);
+                    int thisSize = thisGroupsSeqs.size();
+                    
+                    if (thisSize >= subsampleSize) {    Groups.push_back(newGroups[i]);        }
+                    else {  m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
+                } 
+                m->setGroups(Groups);
+            }
+        }
+        
+        //here in case some groups are removed by subsample
+        util.getCombos(groupComb, Groups, numComp);
+        
+        if (numComp < processors) { processors = numComp; }
+        
+        if (consensus && (numComp < 2)) { m->mothurOut("consensus can only be used with numComparisions greater than 1, setting consensus=f.\n"); consensus=false; }
+        
+        //get weighted scores for users trees
+        for (int i = 0; i < T.size(); i++) {
+            
+            if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } return 0; }
+            
+            counter = 0;
+            rScores.resize(numComp);  //data[0] = weightedscore AB, data[1] = weightedscore AC...
+            uScores.resize(numComp);  //data[0] = weightedscore AB, data[1] = weightedscore AC...
+            
+            vector<double> userData; userData.resize(numComp,0);  //weighted score info for user tree. data[0] = weightedscore AB, data[1] = weightedscore AC...
+            vector<double> randomData; randomData.resize(numComp,0); //weighted score info for random trees. data[0] = weightedscore AB, data[1] = weightedscore AC...
+            
+            if (random) {  
+                output = new ColumnFile(outputDir + m->getSimpleName(treefile)  + toString(i+1) + ".weighted", itersString);  
+                outputNames.push_back(outputDir + m->getSimpleName(treefile)  + toString(i+1) + ".weighted");
+                outputTypes["weighted"].push_back(outputDir + m->getSimpleName(treefile)  + toString(i+1) + ".weighted");
+            } 
+            
+            userData = weighted.getValues(T[i], processors, outputDir); //userData[0] = weightedscore
+            if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
+            
+            //save users score
+            for (int s=0; s<numComp; s++) {
+                //add users score to vector of user scores
+                uScores[s].push_back(userData[s]);
+                //save users tree score for summary file
+                utreeScores.push_back(userData[s]);
+            }
+            
+            if (random) {  runRandomCalcs(T[i], userData); }
+            
+            //clear data
+            rScores.clear();
+            uScores.clear();
+            validScores.clear();
+            
+            //subsample loop
+            vector< vector<double> > calcDistsTotals;  //each iter, each groupCombos dists. this will be used to make .dist files
+            for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
+                
+                if (m->control_pressed) { break; }
+                
+                //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
+                TreeMap* newTmap = new TreeMap();
+                newTmap->getCopy(*tmap);
+                
+                SubSample sample;
+                Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
+                   
+                //call new weighted function
+                vector<double> iterData; iterData.resize(numComp,0);
+                Weighted thisWeighted(includeRoot);
+                iterData = thisWeighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore
+                
+                //save data to make ave dist, std dist
+                calcDistsTotals.push_back(iterData);
+                
+                delete newTmap;
+                delete subSampleTree;
+                
+                if((thisIter+1) % 100 == 0){   m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine();              }
+            }
+            
+            if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
+            
+            if (subsample) {  getAverageSTDMatrices(calcDistsTotals, i); }
+            if (consensus) {  getConsensusTrees(calcDistsTotals, i);  }
+        }
+        
+               
+               if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  } return 0;  }
+               
+        if (phylip) {  createPhylipFile();             }
+    
                 printWSummaryFile();
                 
-               if (phylip) {   createPhylipFile();             }
-
                 //clear out users groups
                 m->clearGroups();
-               delete tmap; delete weighted;
+               delete tmap; 
                 for (int i = 0; i < T.size(); i++) { delete T[i]; }
                 
-               
-               if (m->control_pressed) { 
-                       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  }
-                       return 0; 
-               }
+               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0; }
                 
                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.weighted."); m->mothurOutEndLine();
                 
@@ -456,6 +404,301 @@ int UnifracWeightedCommand::execute() {
         }
  }
  /**************************************************************************************************/
+//Computes, for every pair of groups, the mean and standard deviation of the distances collected over the
+//subsampling iterations of one tree, and writes them to "<tree><treeNum+1>.weighted.ave.dist" and
+//"<tree><treeNum+1>.weighted.std.dist" in the layout selected by outputForm ("lt", "square", column otherwise).
+//  dists   - dists[k] holds the pairwise group distances of subsample iteration k, in lower-triangle order
+//  treeNum - zero-based tree index; treeNum+1 is embedded in the output file names
+//Returns 0. Both file names are appended to outputNames and outputTypes["phylip"].
+int UnifracWeightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
+       try {
+        //sum each pairwise distance over all subsample iterations
+        vector<double> averages; averages.resize(numComp, 0); 
+        for (int thisIter = 0; thisIter < subsampleIters; thisIter++) {
+            for (int i = 0; i < dists[thisIter].size(); i++) {  
+                averages[i] += dists[thisIter][i];
+            }
+        }
+        
+        //turn the sums into means
+        for (int i = 0; i < averages.size(); i++) {  averages[i] /= (float) subsampleIters; }
+        
+        //standard deviation: accumulate the squared deviation of each distance from its mean.
+        //dists has one row per subsample iteration, so this loop is bounded by subsampleIters exactly like the
+        //sum above. Bounding it by iters (the random-tree iteration count) would read past the end of dists,
+        //or skip rows, whenever the two settings differ.
+        vector<double> stdDev; stdDev.resize(numComp, 0);
+                
+        for (int thisIter = 0; thisIter < subsampleIters; thisIter++) {
+            for (int j = 0; j < dists[thisIter].size(); j++) {
+                double diff = dists[thisIter][j] - averages[j];
+                stdDev[j] += diff * diff;
+            }
+        }
+        //divide by the number of iterations (population form) and take the square root
+        for (int i = 0; i < stdDev.size(); i++) {  
+            stdDev[i] /= (float) subsampleIters; 
+            stdDev[i] = sqrt(stdDev[i]);
+        }
+        
+        //square matrix that will hold the average distances
+        vector< vector<double> > avedists;     avedists.resize(m->getNumGroups());
+        for (int i = 0; i < m->getNumGroups(); i++) {
+            avedists[i].resize(m->getNumGroups(), 0.0);
+        }
+        
+        //square matrix that will hold the standard deviations
+        vector< vector<double> > stddists;     stddists.resize(m->getNumGroups());
+        for (int i = 0; i < m->getNumGroups(); i++) {
+            stddists[i].resize(m->getNumGroups(), 0.0);
+        }
+        
+        //expand the lower-triangle vectors into symmetric matrices so they can be printed in any layout
+        int count = 0;
+        for (int r=0; r<m->getNumGroups(); r++) { 
+            for (int l = 0; l < r; l++) {
+                avedists[r][l] = averages[count];
+                avedists[l][r] = averages[count];
+                stddists[r][l] = stdDev[count];
+                stddists[l][r] = stdDev[count];
+                count++;
+            }
+        }
+        
+        string aveFileName = outputDir + m->getSimpleName(treefile)  + toString(treeNum+1) + ".weighted.ave.dist";
+        outputNames.push_back(aveFileName); outputTypes["phylip"].push_back(aveFileName); 
+        
+        ofstream out;
+        m->openOutputFile(aveFileName, out);
+        
+       string stdFileName = outputDir + m->getSimpleName(treefile)  + toString(treeNum+1) + ".weighted.std.dist";
+       outputNames.push_back(stdFileName); outputTypes["phylip"].push_back(stdFileName); 
+        
+        ofstream outStd;
+        m->openOutputFile(stdFileName, outStd);
+        
+        if ((outputForm == "lt") || (outputForm == "square")) {
+            //matrix layouts start with the number of groups
+            out << m->getNumGroups() << endl;
+            outStd << m->getNumGroups() << endl;
+        }
+        
+        //write one row (or, in column layout, one line per pair) for each group
+        for (int r=0; r<m->getNumGroups(); r++) { 
+            //group name, padded with spaces to at least 10 characters
+            string name = (m->getGroups())[r];
+            if (name.length() < 10) { //pad with spaces to make compatible
+                while (name.length() < 10) {  name += " ";  }
+            }
+            
+            if (outputForm == "lt") {
+                out << name << '\t';
+                outStd << name << '\t';
+                
+                //lower triangle: only the columns before the diagonal
+                for (int l = 0; l < r; l++) {  out  << avedists[r][l] << '\t';  outStd  << stddists[r][l] << '\t';}
+                out << endl;  outStd << endl;
+            }else if (outputForm == "square") {
+                out << name << '\t';
+                outStd << name << '\t';
+                
+                //full row, diagonal included
+                for (int l = 0; l < m->getNumGroups(); l++) {  out  << avedists[r][l] << '\t'; outStd  << stddists[r][l] << '\t'; }
+                out << endl; outStd << endl;
+            }else{
+                //column layout: one "name <tab> otherName value" line per pair.
+                //No separator is written between otherName and the value; only the space padding divides them.
+                for (int l = 0; l < r; l++) {  
+                    string otherName = (m->getGroups())[l];
+                    if (otherName.length() < 10) { //pad with spaces to make compatible
+                        while (otherName.length() < 10) {  otherName += " ";  }
+                    }
+                    
+                    out  << name << '\t' << otherName << avedists[r][l] << endl;  
+                    outStd  << name << '\t' << otherName << stddists[r][l] << endl; 
+                }
+            }
+        }
+        out.close();
+        outStd.close();
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "UnifracWeightedCommand", "getAverageSTDMatrices");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+//Builds one tree per row of dists (via buildTrees), derives their consensus tree and prints it to
+//"<root><treeNum+1>.weighted.cons.tre", which is registered in outputNames and outputTypes["tree"].
+//  dists   - pairwise group distances, one row per subsample iteration
+//  treeNum - zero-based tree index; treeNum+1 is embedded in the output file name
+//Returns 0, including when m->control_pressed is set after the trees are built.
+int UnifracWeightedCommand::getConsensusTrees(vector< vector<double> >& dists, int treeNum) {
+       try {
+        
+        //used in tree constructor 
+        m->runParse = false;
+        
+        //treemap built from the current groups, for the tree class to use
+        TreeMap groupTmap;
+        groupTmap.makeSim(m->getGroups());
+        
+        //drop any old tree names, then fill them with the group names
+        m->Treenames.clear();
+        m->Treenames = m->getGroups();
+        
+        //buildTrees also creates the .all.tre file containing the trees created
+        vector<Tree*> iterTrees = buildTrees(dists, treeNum, groupTmap);
+        
+        if (m->control_pressed) { return 0; }
+        
+        Consensus consensusBuilder;
+        Tree* consensusTree = consensusBuilder.getTree(iterTrees);
+        
+        //register and open the consensus tree file
+        string consFileName = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".weighted.cons.tre";
+        outputNames.push_back(consFileName);
+        outputTypes["tree"].push_back(consFileName);
+        
+        ofstream consOut;
+        m->openOutputFile(consFileName, consOut);
+        
+        //getTree may hand back NULL; only print and free a real tree
+        if (consensusTree != NULL) {
+            consensusTree->print(consOut, "boot");
+            delete consensusTree;
+        }
+        consOut.close();
+        
+        return 0;
+
+    }
+       catch(exception& e) {
+               m->errorOut(e, "UnifracWeightedCommand", "getConsensusTrees");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+//Turns each row of subsampled distances into a similarity matrix, builds a tree from it and prints every
+//tree to "<root><treeNum+1>.weighted.all.tre", which is registered in outputNames and outputTypes["tree"].
+//  dists   - dists[k] holds the pairwise group distances of iteration k, in lower-triangle order
+//  treeNum - zero-based tree index; treeNum+1 is embedded in the output file name
+//  mytmap  - treemap whose address is handed to every Tree created here
+//Returns the trees that were built. If m->control_pressed is set, the trees are deleted, their slots are
+//set to NULL and the output file is removed before returning.
+vector<Tree*> UnifracWeightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, TreeMap& mytmap) {
+       try {
+        
+        vector<Tree*> builtTrees;
+        
+        //register and open the file that collects all of the trees
+        string allTreesFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".weighted.all.tre";
+        outputNames.push_back(allTreesFile);
+        outputTypes["tree"].push_back(allTreesFile);
+        
+        ofstream allTreesOut;
+        m->openOutputFile(allTreesFile, allTreesOut);
+        
+        for (int iter = 0; iter < dists.size(); iter++) { //dists[0] are the dists for the first subsampled tree.
+            
+            if (m->control_pressed) { break; }
+            
+            //square, zero-filled similarity matrix
+            vector< vector<double> > sims;
+            sims.resize(m->getNumGroups());
+            for (int row = 0; row < m->getNumGroups(); row++) {
+                sims[row].resize(m->getNumGroups(), 0.0);
+            }
+            
+            //mirror the lower-triangle distances into both halves, converted to similarities
+            int pairIndex = 0;
+            for (int row = 0; row < m->getNumGroups(); row++) {
+                for (int col = 0; col < row; col++) {
+                    double sim = -(dists[iter][pairIndex]-1.0);
+                    sims[row][col] = sim;
+                    sims[col][row] = sim;
+                    pairIndex++;
+                }
+            }
+            
+            //create the tree for this iteration
+            Tree* iterTree = new Tree(&mytmap, sims);
+            map<string, string> noNames;
+            iterTree->assembleTree(noNames);
+            
+            builtTrees.push_back(iterTree);
+            
+            //append it to the .all.tre file
+            iterTree->print(allTreesOut);
+        }
+        
+        allTreesOut.close();
+        
+        //interrupted: discard what was built and remove the partial file
+        if (m->control_pressed) {
+            for (int k = 0; k < builtTrees.size(); k++) {
+                delete builtTrees[k];
+                builtTrees[k] = NULL;
+            }
+            m->mothurRemove(allTreesFile);
+        }
+        
+        return builtTrees;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "UnifracWeightedCommand", "buildTrees");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+//Scores random trees against thisTree for every pair of groups, derives the significance of each user score
+//from the sorted random scores, then calls calculateFreqsCumuls() and printWeightedFile().
+//  thisTree    - tree passed through to driver()/createProcesses()
+//  usersScores - user-tree scores indexed by group comparison (taken by value)
+//Returns 0. Appends one entry per comparison to WScoreSig and deletes output before returning.
+int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector<double> usersScores) {
+       try {
+        
+        //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
+        //each pair is stored as (groups[a], groups[l]) with l < a
+        vector< vector<string> > namesOfGroupCombos;
+        for (int a=0; a<numGroups; a++) { 
+            for (int l = 0; l < a; l++) {      
+                vector<string> groups; groups.push_back((m->getGroups())[a]); groups.push_back((m->getGroups())[l]);
+                namesOfGroupCombos.push_back(groups);
+            }
+        }
+        
+        lines.clear();
+        
+        //on unix-like builds with more than one processor, split the pairs into one (start, count) range per
+        //processor; the last range takes whatever is left over from the integer division
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+        if(processors != 1){
+            int numPairs = namesOfGroupCombos.size();
+            int numPairsPerProcessor = numPairs / processors;
+            
+            for (int i = 0; i < processors; i++) {
+                int startPos = i * numPairsPerProcessor;
+                if(i == processors - 1){
+                    numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
+                }
+                lines.push_back(linePair(startPos, numPairsPerProcessor));
+            }
+        }
+#endif
+        
+        
+        //get scores for random trees: one pass over all pairs per iteration, results go into rScores
+        for (int j = 0; j < iters; j++) {
+            
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+            if(processors == 1){
+                driver(thisTree,  namesOfGroupCombos, 0, namesOfGroupCombos.size(),  rScores);
+            }else{
+                createProcesses(thisTree,  namesOfGroupCombos, rScores);
+            }
+#else
+            driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
+#endif
+            
+            //cancelled: free tmap, the user trees and output, close the summary file and remove all output files.
+            //NOTE(review): the members are not reset after being deleted here, and the function still returns 0 -
+            //confirm the caller does not delete tmap, T or output again when it sees control_pressed.
+            if (m->control_pressed) { delete tmap;  for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } return 0; }
+            
+            //report progress
+            //                                 m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();          
+        }
+        lines.clear();
+        
+        //find the significance of the score for summary file
+        for (int f = 0; f < numComp; f++) {
+            //sort random scores
+            sort(rScores[f].begin(), rScores[f].end());
+            
+            //the index of the score higher than yours is returned 
+            //so if you have 1000 random trees the index returned is 100 
+            //then there are 900 trees with a score greater than you. 
+            //giving you a significance of 0.900
+            int index = findIndex(usersScores[f], f);    if (index == -1) { m->mothurOut("error in UnifracWeightedCommand"); m->mothurOutEndLine(); exit(1); } //error code
+            
+            //the significance is the number of trees with the users score or higher 
+            WScoreSig.push_back((iters-index)/(float)iters);
+        }
+        
+        //out << "Tree# " << i << endl;
+        calculateFreqsCumuls();
+        printWeightedFile();
+        
+        //output is released here on the normal path as well
+        delete output;
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "UnifracWeightedCommand", "runRandomCalcs");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
  
  int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, vector< vector<double> >& scores) {
         try {
@@ -524,7 +767,9 @@ int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector<string> > na
  int UnifracWeightedCommand::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, vector< vector<double> >& scores) { 
   try {
                 Tree* randT = new Tree(tmap);
-
+     
+        Weighted weighted(includeRoot);
+     
                 for (int h = start; h < (start+num); h++) {
         
                         if (m->control_pressed) { return 0; }
@@ -542,7 +787,7 @@ int UnifracWeightedCommand::driver(Tree* t, vector< vector<string> > namesOfGrou
                         if (m->control_pressed) { delete randT;  return 0;  }
  
                         //get wscore of random tree
-                       EstOutput randomData = weighted->getValues(randT, groupA, groupB);
+                       EstOutput randomData = weighted.getValues(randT, groupA, groupB);
                 
                         if (m->control_pressed) { delete randT;  return 0;  }
                                                                                 
@@ -763,46 +1008,6 @@ void UnifracWeightedCommand::calculateFreqsCumuls() {
                 exit(1);
         }
  }
-/*****************************************************************/
-int UnifracWeightedCommand::readNamesFile() {
-       try {
-               m->names.clear();
-               numUniquesInName = 0;
-               
-               ifstream in;
-               m->openInputFile(namefile, in);
-               
-               string first, second;
-               map<string, string>::iterator itNames;
-               
-               while(!in.eof()) {
-                       in >> first >> second; m->gobble(in);
-                       
-                       numUniquesInName++;
-                       
-                       itNames = m->names.find(first);
-                       if (itNames == m->names.end()) {  
-                               m->names[first] = second; 
-                               
-                               //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
-                               vector<string> dupNames;
-                               m->splitAtComma(second, dupNames);
-                               
-                               for (int i = 0; i < dupNames.size(); i++) {     
-                                       nameMap[dupNames[i]] = dupNames[i]; 
-                                       if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
-                               }
-                       }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
-               }
-               in.close();
-               
-               return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "UnifracWeightedCommand", "readNamesFile");
-               exit(1);
-       }
-}
  /***********************************************************/
  
  
diff --git a/unifracweightedcommand.h b/unifracweightedcommand.h

index b1db317de0add4b28352315e20dc282110fad789..9deb065b21da66b0bd83ef50d6d7d457e750025a 100644 (file)
--- a/unifracweightedcommand.h
+++ b/unifracweightedcommand.h
@@ -42,30 +42,24 @@ class UnifracWeightedCommand : public Command {
                         linePair(int i, int j) : start(i), num(j) {}
                 };
                 vector<linePair> lines;
-               
-               ReadTree* read;
-               SharedUtil* util;
+        TreeMap* tmap;
                 FileOutput* output;
                 vector<Tree*> T;           //user trees
                 vector<double> utreeScores;  //user tree unweighted scores
                 vector<double> WScoreSig;  //tree weighted score signifigance when compared to random trees - percentage of random trees with that score or lower.
                 vector<string> groupComb; // AB. AC, BC...
-               TreeMap* tmap;
-               Weighted* weighted;
                 string sumFile, outputDir;
                 int iters, numGroups, numComp, counter;
-               EstOutput userData;                     //weighted score info for user tree
-               EstOutput randomData;           //weighted score info for random trees
                 vector< vector<double> > rScores;  //vector<weighted scores for random trees.> each group comb has an entry
                 vector< vector<double> > uScores;  //vector<weighted scores for user trees.> each group comb has an entry
                 vector< map<float, float> > rScoreFreq;  //map <weighted score, number of random trees with that score.> -vector entry for each combination.
                 vector< map<float, float> > rCumul;  //map <weighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each c                                                                
                 map<float, float>  validScores;  //map contains scores from random
                 
-               bool abort, phylip, random, includeRoot;
+               bool abort, phylip, random, includeRoot, subsample, consensus;
                 string groups, itersString, outputForm, treefile, groupfile, namefile;
                 vector<string> Groups, outputNames; //holds groups to be used
-               int processors, numUniquesInName;
+               int processors, subsampleSize, subsampleIters;
                 ofstream outSum;
                 map<string, string> nameMap;
                 
@@ -77,7 +71,10 @@ class UnifracWeightedCommand : public Command {
                 void calculateFreqsCumuls();
                 int createProcesses(Tree*,  vector< vector<string> >,  vector< vector<double> >&);
                 int driver(Tree*, vector< vector<string> >, int, int,  vector< vector<double> >&);
-               int readNamesFile();
+        int runRandomCalcs(Tree*, vector<double>);
+        vector<Tree*> buildTrees(vector< vector<double> >&, int, TreeMap&);
+        int getConsensusTrees(vector< vector<double> >&, int);
+        int getAverageSTDMatrices(vector< vector<double> >&, int);
                 
  };
  
diff --git a/unweighted.cpp b/unweighted.cpp

index d4fd32731336913a5ab767736cf52ab1ffc2cd0a..864a9f8bab16f5d1ea52480c37c141ddc934e199 100644 (file)
--- a/unweighted.cpp
+++ b/unweighted.cpp
@@ -15,7 +15,9 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
         try {
                 processors = p;
                 outputDir = o;
-                       
+        
+        TreeMap* tmap = t->getTreeMap();
+        
                 //if the users enters no groups then give them the score of all groups
                 int numGroups = m->getNumGroups();
                 
@@ -50,7 +52,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
  
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                         if(processors == 1){
-                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
                         }else{
                                 int numPairs = namesOfGroupCombos.size();
                                 
@@ -65,11 +67,11 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
                 
                                         lines.push_back(linePair(startPos, numPairsPerProcessor));
                                 }
-                               data = createProcesses(t, namesOfGroupCombos);
+                               data = createProcesses(t, namesOfGroupCombos, tmap);
                                 lines.clear();
                         }
                 #else
-                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
                 #endif
                 
                 return data;
@@ -81,7 +83,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
  }
  /**************************************************************************************************/
  
-EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
+EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 int process = 1;
@@ -98,7 +100,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
                                 process++;
                         }else if (pid == 0){
                                 EstOutput myresults;
-                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num);
+                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
                                 
                                 if (m->control_pressed) { exit(0); }
                                 
@@ -120,7 +122,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
                         }
                 }
                 
-               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num);
+               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<(processors-1);i++) { 
@@ -165,7 +167,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
         }
  }
  /**************************************************************************************************/
-EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num) { 
+EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) { 
   try {
         
          
@@ -259,6 +261,8 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
                 processors = p;
                 outputDir = o;
                 
+        TreeMap* tmap = t->getTreeMap();
+     
                 //if the users enters no groups then give them the score of all groups
                 int numGroups = m->getNumGroups();
                 
@@ -293,7 +297,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
  
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                         if(processors == 1){
-                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true);
+                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap);
                         }else{
                                 int numPairs = namesOfGroupCombos.size();
                                 
@@ -307,12 +311,12 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
                                         lines.push_back(linePair(startPos, numPairsPerProcessor));
                                 }
                                         
-                               data = createProcesses(t, namesOfGroupCombos, true);
+                               data = createProcesses(t, namesOfGroupCombos, true, tmap);
                                 
                                 lines.clear();
                         }
                 #else
-                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true);
+                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap);
                 #endif
         
                 return data;
@@ -324,7 +328,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
  }
  /**************************************************************************************************/
  
-EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, bool usingGroups) {
+EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, bool usingGroups, TreeMap* tmap) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 int process = 1;
@@ -341,7 +345,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
                                 process++;
                         }else if (pid == 0){
                                 EstOutput myresults;
-                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups);
+                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups, tmap);
                                 
                                 if (m->control_pressed) { exit(0); }
                                 
@@ -361,7 +365,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
                         }
                 }
                 
-               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups);
+               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups, tmap);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<(processors-1);i++) { 
@@ -405,7 +409,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
         }
  }
  /**************************************************************************************************/
-EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, bool usingGroups) { 
+EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, bool usingGroups, TreeMap* tmap) { 
   try {
                 
                 EstOutput results; results.resize(num);
diff --git a/unweighted.h b/unweighted.h

index e751d2e79d19f17a2dd3a9af34ea11be95586ef4..c6c13bb3a7abd14c824dfe6b495871790809833d 100644 (file)
--- a/unweighted.h
+++ b/unweighted.h
@@ -19,7 +19,7 @@
  class Unweighted : public TreeCalculator  {
         
         public:
-               Unweighted(TreeMap* t, bool r) : tmap(t), includeRoot(r) {};
+        Unweighted(bool r) : includeRoot(r) {};
                 ~Unweighted() {};
                 EstOutput getValues(Tree*, int, string);
                 EstOutput getValues(Tree*, string, string, int, string);
@@ -33,16 +33,15 @@ class Unweighted : public TreeCalculator  {
                 vector<linePair> lines;
                 
                 EstOutput data;
-               TreeMap* tmap;
                 int processors;
                 string outputDir;
                 map< vector<string>, set<int> > rootForGrouping;  //maps a grouping combo to the roots for that combo
                 bool includeRoot;
                 
-               EstOutput driver(Tree*, vector< vector<string> >, int, int); 
-               EstOutput createProcesses(Tree*, vector< vector<string> >);
-               EstOutput driver(Tree*, vector< vector<string> >, int, int, bool); 
-               EstOutput createProcesses(Tree*, vector< vector<string> >, bool);
+               EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*); 
+               EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
+               EstOutput driver(Tree*, vector< vector<string> >, int, int, bool, TreeMap*); 
+               EstOutput createProcesses(Tree*, vector< vector<string> >, bool, TreeMap*);
                 int getRoot(Tree*, int, vector<string>);
  };
  
diff --git a/weighted.cpp b/weighted.cpp

index 7a31da4d55d398f47d022de7b80396bb44f6c6b4..85eed5207ff20d586bd999f670d59a4c8e840c67 100644 (file)
--- a/weighted.cpp
+++ b/weighted.cpp
@@ -18,6 +18,8 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) {
                 vector<double> D;
                 processors = p;
                 outputDir = o;
+        
+        TreeMap* tmap = t->getTreeMap();
                 
                 numGroups = m->getNumGroups();
                 
@@ -36,7 +38,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) {
                 
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                         if(processors == 1){
-                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
                         }else{
                                 int numPairs = namesOfGroupCombos.size();
                                 
@@ -50,12 +52,12 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) {
                                         lines.push_back(linePair(startPos, numPairsPerProcessor));
                                 }
  
-                               data = createProcesses(t, namesOfGroupCombos);
+                               data = createProcesses(t, namesOfGroupCombos, tmap);
                                 
                                 lines.clear();
                         }
                 #else
-                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
                 #endif
                 
                 return data;
@@ -67,7 +69,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) {
  }
  /**************************************************************************************************/
  
-EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
+EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 int process = 1;
@@ -85,9 +87,9 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGro
                         }else if (pid == 0){
         
                                 EstOutput Myresults;
-                               Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num);
+                               Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
                         
-                               m->mothurOut("Merging results."); m->mothurOutEndLine();
+                               //m->mothurOut("Merging results."); m->mothurOutEndLine();
                                 
                                 //pass numSeqs to parent
                                 ofstream out;
@@ -108,7 +110,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGro
                         }
                 }
         
-               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num);
+               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
         
                 //force parent to wait until all the processes are done
                 for (int i=0;i<(processors-1);i++) { 
@@ -142,7 +144,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGro
                         m->mothurRemove(s);
                 }
                 
-               m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine();
+               //m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine();
                 
                 return results;
  #endif         
@@ -153,7 +155,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGro
         }
  }
  /**************************************************************************************************/
-EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num) { 
+EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) { 
   try {
                 EstOutput results;
                 vector<double> D;
@@ -267,6 +269,8 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) {
   try {
                 
                 data.clear(); //clear out old values
+     
+        TreeMap* tmap = t->getTreeMap();
                 
                 if (m->control_pressed) { return data; }
                 
diff --git a/weighted.h b/weighted.h

index c23a11599b4b8d74fdfacd05a5a02ae727834455..180409ce2dbad5da67b570a7ca4a3666c61ae3b3 100644 (file)
--- a/weighted.h
+++ b/weighted.h
@@ -19,7 +19,7 @@
  class Weighted : public TreeCalculator  {
         
         public:
-               Weighted(TreeMap* t, bool r) : tmap(t), includeRoot(r) {};
+        Weighted( bool r) : includeRoot(r) {};
                 ~Weighted() {};
                 
                 EstOutput getValues(Tree*, string, string);
@@ -34,7 +34,6 @@ class Weighted : public TreeCalculator  {
                 vector<linePair> lines;
  
                 EstOutput data;
-               TreeMap* tmap;
                 map<string, int>::iterator it;
                 map<string, double> WScore; //a score for each group combination i.e. AB, AC, BC.
                 int processors;
@@ -42,8 +41,8 @@ class Weighted : public TreeCalculator  {
                 map< vector<string>, set<int> > rootForGrouping;  //maps a grouping combo to the root for that combo
                 bool includeRoot;
                 
-               EstOutput driver(Tree*, vector< vector<string> >, int, int); 
-               EstOutput createProcesses(Tree*, vector< vector<string> >);
+               EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*); 
+               EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
                 double getLengthToRoot(Tree*, int, string, string);
  };
author	Pat Schloss <pschloss@umich.edu>
	Tue, 1 May 2012 15:08:53 +0000 (11:08 -0400)
committer	Pat Schloss <pschloss@umich.edu>
	Tue, 1 May 2012 15:08:53 +0000 (11:08 -0400)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
bayesian.cpp		patch \| blob \| history
chimerauchimecommand.cpp		patch \| blob \| history
chimerauchimecommand.h		patch \| blob \| history
classifytreecommand.cpp		patch \| blob \| history
classifytreecommand.h		patch \| blob \| history
commandfactory.cpp		patch \| blob \| history
commandfactory.hpp		patch \| blob \| history
commandoptionparser.cpp		patch \| blob \| history
consensus.cpp		patch \| blob \| history
consensus.h		patch \| blob \| history
cooccurrencecommand.cpp		patch \| blob \| history
deuniquetreecommand.cpp		patch \| blob \| history
deuniquetreecommand.h		patch \| blob \| history
engine.cpp		patch \| blob \| history
getcurrentcommand.cpp		patch \| blob \| history
indicatorcommand.cpp		patch \| blob \| history
makebiomcommand.cpp	[new file with mode: 0644]	patch \| blob
makebiomcommand.h	[new file with mode: 0644]	patch \| blob
makefile		patch \| blob \| history
metastatscommand.cpp		patch \| blob \| history
mothurout.cpp		patch \| blob \| history
mothurout.h		patch \| blob \| history
optionparser.cpp		patch \| blob \| history
otuassociationcommand.cpp		patch \| blob \| history
pairwiseseqscommand.cpp		patch \| blob \| history
parsimony.cpp		patch \| blob \| history
parsimony.h		patch \| blob \| history
parsimonycommand.cpp		patch \| blob \| history
parsimonycommand.h		patch \| blob \| history
pcrseqscommand.h		patch \| blob \| history
phylodiversity.cpp	[deleted file]	patch \| blob \| history
phylodiversity.h	[deleted file]	patch \| blob \| history
phylodiversitycommand.cpp		patch \| blob \| history
phylodiversitycommand.h		patch \| blob \| history
phylosummary.cpp		patch \| blob \| history
prcseqscommand.cpp		patch \| blob \| history
preclustercommand.cpp		patch \| blob \| history
rarefactcommand.cpp		patch \| blob \| history
readtree.cpp		patch \| blob \| history
readtree.h		patch \| blob \| history
removegroupscommand.cpp		patch \| blob \| history
seqsummarycommand.cpp		patch \| blob \| history
setcurrentcommand.cpp		patch \| blob \| history
setcurrentcommand.h		patch \| blob \| history
setdircommand.cpp		patch \| blob \| history
sharedcommand.cpp		patch \| blob \| history
sharedcommand.h		patch \| blob \| history
sharedrabundfloatvector.cpp		patch \| blob \| history
sharedrabundvector.cpp		patch \| blob \| history
shhhercommand.cpp		patch \| blob \| history
subsample.cpp		patch \| blob \| history
subsample.h		patch \| blob \| history
tree.cpp		patch \| blob \| history
tree.h		patch \| blob \| history
treegroupscommand.cpp		patch \| blob \| history
treemap.cpp		patch \| blob \| history
treemap.h		patch \| blob \| history
treereader.cpp	[new file with mode: 0644]	patch \| blob
treereader.h	[new file with mode: 0644]	patch \| blob
trialSwap2.cpp		patch \| blob \| history
trialswap2.h		patch \| blob \| history
trimseqscommand.cpp		patch \| blob \| history
uchime_src/addtargets2.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/alignchime.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/alignchimel.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/allocs.h	[new file with mode: 0644]	patch \| blob
uchime_src/alnheuristics.h	[new file with mode: 0644]	patch \| blob
uchime_src/alnparams.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/alnparams.h	[new file with mode: 0644]	patch \| blob
uchime_src/alpha.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/alpha.h	[new file with mode: 0644]	patch \| blob
uchime_src/alpha2.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/chainer.h	[new file with mode: 0644]	patch \| blob
uchime_src/chime.h	[new file with mode: 0644]	patch \| blob
uchime_src/counters.h	[new file with mode: 0644]	patch \| blob
uchime_src/diagbox.h	[new file with mode: 0644]	patch \| blob
uchime_src/dp.h	[new file with mode: 0644]	patch \| blob
uchime_src/evalue.h	[new file with mode: 0644]	patch \| blob
uchime_src/fractid.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/getparents.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/globalalign2.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/help.h	[new file with mode: 0644]	patch \| blob
uchime_src/hsp.h	[new file with mode: 0644]	patch \| blob
uchime_src/hspfinder.h	[new file with mode: 0644]	patch \| blob
uchime_src/make3way.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/mk	[new file with mode: 0755]	patch \| blob
uchime_src/mx.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/mx.h	[new file with mode: 0644]	patch \| blob
uchime_src/myopts.h	[new file with mode: 0644]	patch \| blob
uchime_src/myutils.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/myutils.h	[new file with mode: 0644]	patch \| blob
uchime_src/orf.h	[new file with mode: 0644]	patch \| blob
uchime_src/out.h	[new file with mode: 0644]	patch \| blob
uchime_src/path.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/path.h	[new file with mode: 0644]	patch \| blob
uchime_src/searchchime.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/seq.h	[new file with mode: 0644]	patch \| blob
uchime_src/seqdb.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/seqdb.h	[new file with mode: 0644]	patch \| blob
uchime_src/setnucmx.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/sfasta.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/sfasta.h	[new file with mode: 0644]	patch \| blob
uchime_src/svnmods.h	[new file with mode: 0644]	patch \| blob
uchime_src/svnversion.h	[new file with mode: 0644]	patch \| blob
uchime_src/timers.h	[new file with mode: 0644]	patch \| blob
uchime_src/timing.h	[new file with mode: 0644]	patch \| blob
uchime_src/tracebackbit.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/uc.h	[new file with mode: 0644]	patch \| blob
uchime_src/uchime_main.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/ultra.h	[new file with mode: 0644]	patch \| blob
uchime_src/usort.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/viterbifast.cpp	[new file with mode: 0644]	patch \| blob
uchime_src/windex.h	[new file with mode: 0644]	patch \| blob
uchime_src/writechhit.cpp	[new file with mode: 0644]	patch \| blob
unifracunweightedcommand.cpp		patch \| blob \| history
unifracunweightedcommand.h		patch \| blob \| history
unifracweightedcommand.cpp		patch \| blob \| history
unifracweightedcommand.h		patch \| blob \| history
unweighted.cpp		patch \| blob \| history
unweighted.h		patch \| blob \| history
weighted.cpp		patch \| blob \| history
weighted.h		patch \| blob \| history