A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; };
A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; };
A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; };
+ A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; };
A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; };
A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; };
A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; };
A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; };
A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; };
A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; };
+ A7D755DA1535F679009BF21A /* treereader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D755D91535F679009BF21A /* treereader.cpp */; };
A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; };
A7E9B88212D37EC400DA6239 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65112D37EC300DA6239 /* aligncommand.cpp */; };
A7E9B88312D37EC400DA6239 /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65312D37EC300DA6239 /* alignment.cpp */; };
A7E9B91312D37EC400DA6239 /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78312D37EC400DA6239 /* parsimony.cpp */; };
A7E9B91412D37EC400DA6239 /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78512D37EC400DA6239 /* parsimonycommand.cpp */; };
A7E9B91512D37EC400DA6239 /* pcoacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78712D37EC400DA6239 /* pcoacommand.cpp */; };
- A7E9B91612D37EC400DA6239 /* phylodiversity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78912D37EC400DA6239 /* phylodiversity.cpp */; };
A7E9B91712D37EC400DA6239 /* phylodiversitycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78B12D37EC400DA6239 /* phylodiversitycommand.cpp */; };
A7E9B91812D37EC400DA6239 /* phylosummary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */; };
A7E9B91912D37EC400DA6239 /* phylotree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B78F12D37EC400DA6239 /* phylotree.cpp */; };
A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = "<group>"; };
A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = "<group>"; };
A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = "<group>"; };
+ A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = "<group>"; };
+ A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = "<group>"; };
A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = "<group>"; };
A727864312E9E28C00F86ABA /* removerarecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removerarecommand.cpp; sourceTree = "<group>"; };
A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clearmemorycommand.h; sourceTree = "<group>"; };
A7C3DC0A14FE457500FE1924 /* cooccurrencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cooccurrencecommand.h; sourceTree = "<group>"; };
A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trialSwap2.cpp; sourceTree = "<group>"; };
A7C3DC0E14FE469500FE1924 /* trialswap2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trialswap2.h; sourceTree = "<group>"; };
+ A7D755D71535F665009BF21A /* treereader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treereader.h; sourceTree = "<group>"; };
+ A7D755D91535F679009BF21A /* treereader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treereader.cpp; sourceTree = "<group>"; };
A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = "<group>"; };
A7E9B64F12D37EC300DA6239 /* ace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ace.cpp; sourceTree = "<group>"; };
A7E9B65012D37EC300DA6239 /* ace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ace.h; sourceTree = "<group>"; };
A7E9B78612D37EC400DA6239 /* parsimonycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimonycommand.h; sourceTree = "<group>"; };
A7E9B78712D37EC400DA6239 /* pcoacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pcoacommand.cpp; sourceTree = "<group>"; };
A7E9B78812D37EC400DA6239 /* pcoacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcoacommand.h; sourceTree = "<group>"; };
- A7E9B78912D37EC400DA6239 /* phylodiversity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylodiversity.cpp; sourceTree = "<group>"; };
- A7E9B78A12D37EC400DA6239 /* phylodiversity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversity.h; sourceTree = "<group>"; };
A7E9B78B12D37EC400DA6239 /* phylodiversitycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylodiversitycommand.cpp; sourceTree = "<group>"; };
A7E9B78C12D37EC400DA6239 /* phylodiversitycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversitycommand.h; sourceTree = "<group>"; };
A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylosummary.cpp; sourceTree = "<group>"; };
A7E9B73D12D37EC400DA6239 /* listseqscommand.cpp */,
A7FA10001302E096003860FE /* mantelcommand.h */,
A7FA10011302E096003860FE /* mantelcommand.cpp */,
+ A724D2B4153C8600000A826F /* makebiomcommand.h */,
+ A724D2B6153C8628000A826F /* makebiomcommand.cpp */,
A799F5B71309A3E000AEEFA0 /* makefastqcommand.h */,
A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */,
A7E9B74412D37EC400DA6239 /* makegroupcommand.h */,
A7E9B68F12D37EC400DA6239 /* classify.h */,
A7E9B73812D37EC400DA6239 /* knn.h */,
A7E9B73712D37EC400DA6239 /* knn.cpp */,
- A7E9B78912D37EC400DA6239 /* phylodiversity.cpp */,
- A7E9B78A12D37EC400DA6239 /* phylodiversity.h */,
A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */,
A7E9B78E12D37EC400DA6239 /* phylosummary.h */,
A7E9B78F12D37EC400DA6239 /* phylotree.cpp */,
A713EBAB12DC7613000092AC /* readphylipvector.cpp */,
A7E9B84312D37EC400DA6239 /* splitmatrix.cpp */,
A7E9B84412D37EC400DA6239 /* splitmatrix.h */,
+ A7D755D71535F665009BF21A /* treereader.h */,
+ A7D755D91535F679009BF21A /* treereader.cpp */,
);
name = read;
sourceTree = "<group>";
A7E9B91312D37EC400DA6239 /* parsimony.cpp in Sources */,
A7E9B91412D37EC400DA6239 /* parsimonycommand.cpp in Sources */,
A7E9B91512D37EC400DA6239 /* pcoacommand.cpp in Sources */,
- A7E9B91612D37EC400DA6239 /* phylodiversity.cpp in Sources */,
A7E9B91712D37EC400DA6239 /* phylodiversitycommand.cpp in Sources */,
A7E9B91812D37EC400DA6239 /* phylosummary.cpp in Sources */,
A7E9B91912D37EC400DA6239 /* phylotree.cpp in Sources */,
A76CDD821510F143004C8458 /* prcseqscommand.cpp in Sources */,
A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */,
A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */,
+ A7D755DA1535F679009BF21A /* treereader.cpp in Sources */,
+ A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
GCC_MODEL_TUNING = "";
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
- "VERSION=\"\\\"1.24.0\\\"\"",
- "RELEASE_DATE=\"\\\"3/12/2012\\\"\"",
+ "VERSION=\"\\\"1.25.0\\\"\"",
+ "RELEASE_DATE=\"\\\"4/30/2012\\\"\"",
);
GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
if (baseTName == "saved") { baseTName = rdb->getSavedTaxonomy(); }
/************calculate the probablity that each word will be in a specific taxonomy*************/
- string tfileroot = baseTName.substr(0,baseTName.find_last_of(".")+1);
+ string tfileroot = m->getFullPathName(baseTName.substr(0,baseTName.find_last_of(".")+1));
string tempfileroot = m->getRootName(m->getSimpleName(baseName));
string phyloTreeName = tfileroot + "tree.train";
string phyloTreeSumName = tfileroot + "tree.sum";
delete phyloTree;
phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
-
+
//save probabilities
if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
}
int totalSeqs = 0;
if(processors == 1) { totalSeqs = driverGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups); }
- else { totalSeqs = createProcessesGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, groups); }
+ else { totalSeqs = createProcessesGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, groups, nameFile, groupFile, fastaFileNames[s]); }
if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
}
/**************************************************************************************************/
-int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, vector<string> groups) {
+int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, vector<string> groups, string nameFile, string groupFile, string fastaFile) {
try {
processIDS.clear();
// Allocate memory for thread data.
string extension = toString(i) + ".temp";
- uchimeData* tempUchime = new uchimeData(outputFName+extension, templatefile, filename+extension, fastafile, namefile, groupfile, accnos+extension, alns+extension, groups, m, lines[i].start, lines[i].end, i);
+ uchimeData* tempUchime = new uchimeData(outputFName+extension, templatefile, filename+extension, fastaFile, nameFile, groupFile, accnos+extension, alns+extension, groups, m, lines[i].start, lines[i].end, i);
tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract);
tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
int printFile(vector<seqPriorityNode>&, string);
int deconvoluteResults(SequenceParser&, string, string, string);
int driverGroups(SequenceParser&, string, string, string, string, int, int, vector<string>);
- int createProcessesGroups(SequenceParser&, string, string, string, string, vector<string>);
+ int createProcessesGroups(SequenceParser&, string, string, string, string, vector<string>, string, string, string);
};
#include "classifytreecommand.h"
#include "phylotree.h"
+#include "treereader.h"
//**********************************************************************************************************************
vector<string> ClassifyTreeCommand::setParameters(){
if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
}
- m->runParse = true;
- m->clearGroups();
- m->clearAllGroups();
- m->Treenames.clear();
- m->names.clear();
-
vector<string> tempOutNames;
outputTypes["tree"] = tempOutNames;
outputTypes["summary"] = tempOutNames;
// reading tree info //
/***************************************************/
m->setTreeFile(treefile);
- if (groupfile != "") {
- //read in group map info.
- tmap = new TreeMap(groupfile);
- tmap->readMap();
- }else{ //fake out by putting everyone in one group
- Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
- tmap = new TreeMap();
-
- for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
- }
-
- if (namefile != "") { readNamesFile(); }
-
- read = new ReadNewickTree(treefile);
- int readOk = read->read(tmap);
-
- if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-
- read->AssembleTrees();
- vector<Tree*> T = read->getTrees();
- Tree* outputTree = T[0];
- delete read;
-
- //make sure all files match
- //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
- int numNamesInTree;
- if (namefile != "") {
- if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
- else { numNamesInTree = m->Treenames.size(); }
- }else { numNamesInTree = m->Treenames.size(); }
-
-
- //output any names that are in group file but not in tree
- if (numNamesInTree < tmap->getNumSeqs()) {
- for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
- //is that name in the tree?
- int count = 0;
- for (int j = 0; j < m->Treenames.size(); j++) {
- if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
- count++;
- }
-
- if (m->control_pressed) {
- delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
- return 0;
- }
-
- //then you did not find it so report it
- if (count == m->Treenames.size()) {
- //if it is in your namefile then don't remove
- map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-
- if (it == nameMap.end()) {
- m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
- tmap->removeSeq(tmap->namesOfSeqs[i]);
- i--; //need this because removeSeq removes name from namesOfSeqs
- }
- }
- }
- }
+
+ TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+ vector<Tree*> T = reader->getTrees();
+ TreeMap* tmap = T[0]->getTreeMap();
+ Tree* outputTree = T[0];
+ delete reader;
+
+ if (namefile != "") { readNamesFile(); }
- if (m->control_pressed) { delete outputTree; delete tmap; return 0; }
+ if (m->control_pressed) { delete tmap; delete outputTree; return 0; }
readTaxonomyFile();
-
/***************************************************/
// get concensus taxonomies //
/***************************************************/
int lc = T->tree[i].getLChild();
int rc = T->tree[i].getRChild();
+ TreeMap* tmap = T->getTreeMap();
if (lc == -1) { //you are a leaf your only descendant is yourself
string group = tmap->getGroup(T->tree[i].getName());
void help() { m->mothurOut(getHelpString()); }
private:
- ReadTree* read;
- TreeMap* tmap;
string treefile, taxonomyfile, groupfile, namefile, outputDir;
bool abort;
vector<string> outputNames;
#include "cooccurrencecommand.h"
#include "pcrseqscommand.h"
#include "createdatabasecommand.h"
+#include "makebiomcommand.h"
/*******************************************************/
commands["cooccurrence"] = "cooccurrence";
commands["pcr.seqs"] = "pcr.seqs";
commands["create.database"] = "create.database";
+ commands["make.biom"] = "make.biom";
commands["quit"] = "MPIEnabled";
}
delete shellcommand;
delete pipecommand;
}
+/***********************************************************/
+/***********************************************************/
+// Looks for "<key>=<value>" inside a command's option string.
+// Returns false when key does not occur at all. Otherwise returns true and stores in value the
+// text between the first '=' that follows key and the next ',' (or the end of the string), with
+// surrounding whitespace removed. value is left empty when no '=' appears before that ','.
+static bool findRedirectValue(const string& optionString, const string& key, string& value) {
+    value = "";
+
+    string::size_type keyPos = optionString.find(key);
+    if (keyPos == string::npos) { return false; }
+
+    string::size_type stop = optionString.find(',', keyPos);
+    if (stop == string::npos) { stop = optionString.length(); }
+
+    string::size_type equalsPos = optionString.find('=', keyPos);
+    if ((equalsPos == string::npos) || (equalsPos >= stop)) { return true; }
+
+    string::size_type first = equalsPos + 1; //skip the '=' itself, it is not part of the directory name
+    while ((first < stop) && isspace((unsigned char)optionString[first])) { first++; }
+    while ((stop > first) && isspace((unsigned char)optionString[stop-1])) { stop--; }
+
+    value = optionString.substr(first, stop - first);
+    return true;
+}
+/***********************************************************/
+// Applies any outputdir= / inputdir= setting found in optionString.
+// For each key present, the named directory is passed to m->dirCheck and, when that succeeds,
+// installed with setOutputDirectory / setInputDirectory and reported to the user.
+// Always returns 0; exceptions are reported through m->errorOut and terminate the program.
+int CommandFactory::checkForRedirects(string optionString) {
+    try {
+
+        string outputOption = "";
+        if (findRedirectValue(optionString, "outputdir", outputOption)) { //user has set outputdir in command option string
+            if(m->dirCheck(outputOption)){
+                setOutputDirectory(outputOption);
+                m->mothurOut("Setting output directory to: " + outputOption); m->mothurOutEndLine();
+            }
+        }
+
+        string inputOption = "";
+        if (findRedirectValue(optionString, "inputdir", inputOption)) { //user has set inputdir in command option string
+            if(m->dirCheck(inputOption)){
+                setInputDirectory(inputOption);
+                m->mothurOut("Setting input directory to: " + inputOption); m->mothurOutEndLine();
+            }
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CommandFactory", "checkForRedirects");
+        exit(1);
+    }
+}
/***********************************************************/
/***********************************************************/
try {
delete command; //delete the old command
-
+
+ checkForRedirects(optionString);
+
//user has opted to redirect output from dir where input files are located to some other place
if (outputDir != "") {
if (optionString != "") { optionString += ", outputdir=" + outputDir; }
else if(commandName == "cooccurrence") { command = new CooccurrenceCommand(optionString); }
else if(commandName == "pcr.seqs") { command = new PcrSeqsCommand(optionString); }
else if(commandName == "create.database") { command = new CreateDatabaseCommand(optionString); }
+ else if(commandName == "make.biom") { command = new MakeBiomCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
try {
delete pipecommand; //delete the old command
+ checkForRedirects(optionString);
+
//user has opted to redirect output from dir where input files are located to some other place
if (outputDir != "") {
if (optionString != "") { optionString += ", outputdir=" + outputDir; }
else if(commandName == "cooccurrence") { pipecommand = new CooccurrenceCommand(optionString); }
else if(commandName == "pcr.seqs") { pipecommand = new PcrSeqsCommand(optionString); }
else if(commandName == "create.database") { pipecommand = new CreateDatabaseCommand(optionString); }
+ else if(commandName == "make.biom") { pipecommand = new MakeBiomCommand(optionString); }
else { pipecommand = new NoCommand(optionString); }
return pipecommand;
else if(commandName == "cooccurrence") { shellcommand = new CooccurrenceCommand(); }
else if(commandName == "pcr.seqs") { shellcommand = new PcrSeqsCommand(); }
else if(commandName == "create.database") { shellcommand = new CreateDatabaseCommand(); }
+ else if(commandName == "make.biom") { shellcommand = new MakeBiomCommand(); }
else { shellcommand = new NoCommand(); }
return shellcommand;
string outputDir, inputDir, logFileName;\r
bool append;\r
\r
+ int checkForRedirects(string);\r
+ \r
static CommandFactory* _uniqueInstance;\r
CommandFactory( const CommandFactory& ); // Disable copy constructor\r
void operator=( const CommandFactory& ); // Disable assignment operator\r
optionString = "";
commandString = "";
- if(openParen != -1 && closeParen != -1){
- commandString = input.substr(0, openParen); //commandString contains everything before "("
+ if(openParen != -1 && closeParen != -1){
+ //gobble extra spaces
+ int spot = 0;
+ for (int i = 0; i < input.length(); i++) { if (!(isspace(input[i]))) { spot = i; break; } }
+ if (spot > openParen) { spot = 0; }
+ commandString = input.substr(spot, openParen-spot); //commandString contains everything before "("
optionString = input.substr((openParen+1), (closeParen-openParen-1)); //optionString contains everything between "(" and ")".
}
else if (openParen == -1) { m->mothurOut("[ERROR]: You are missing ("); m->mothurOutEndLine(); }
else if (closeParen == -1) { m->mothurOut("[ERROR]:You are missing )"); m->mothurOutEndLine(); }
-
- //GlobalData* globaldata = GlobalData::getInstance();
- //globaldata->parseGlobalData(commandString, optionString); //parser to separate and check options
- }
+ }
catch(exception& e) {
m->errorOut(e, "CommandOptionParser", "CommandOptionParser");
exit(1);
#include "consensus.h"
//**********************************************************************************************************************
-Tree* Consensus::getTree(vector<Tree*>& t, TreeMap* tmap){
+Tree* Consensus::getTree(vector<Tree*>& t){
try {
numNodes = t[0]->getNumNodes();
numLeaves = t[0]->getNumLeaves();
if (m->control_pressed) { return 0; }
- consensusTree = new Tree(tmap);
+ consensusTree = new Tree(t[0]->getTreeMap());
it2 = nodePairs.find(treeSet);
buildConsensusTree(treeSet);
- if (m->control_pressed) { delete consensusTree; return 0; }
+ if (m->control_pressed) { delete consensusTree; return 0; }
- consensusTree->assembleTree();
+ map<string, string> empty;
+ consensusTree->assembleTree(empty);
- if (m->control_pressed) { delete consensusTree; return 0; }
+ if (m->control_pressed) { delete consensusTree; return 0; }
return consensusTree;
Consensus() { m = MothurOut::getInstance(); }
~Consensus() {}
- Tree* getTree(vector<Tree*>&, TreeMap*);
+ Tree* getTree(vector<Tree*>&);
private:
MothurOut* m;
m->mothurOut("[ERROR]: " + metric + " is not a valid metric option for the cooccurrence command. Choices are cscore, checker, combo, vratio."); m->mothurOutEndLine(); abort = true;
}
- matrix = validParameter.validFile(parameters, "matrix", false); if (matrix == "not found") { matrix = "sim2"; }
+ matrix = validParameter.validFile(parameters, "matrixmodel", false); if (matrix == "not found") { matrix = "sim2"; }
if ((matrix != "sim1") && (matrix != "sim2") && (matrix != "sim3") && (matrix != "sim4") && (matrix != "sim5" ) && (matrix != "sim6" ) && (matrix != "sim7" ) && (matrix != "sim8" ) && (matrix != "sim9" )) {
m->mothurOut("[ERROR]: " + matrix + " is not a valid matrix option for the cooccurrence command. Choices are sim1, sim2, sim3, sim4, sim5, sim6, sim7, sim8, sim9."); m->mothurOutEndLine(); abort = true;
//**********************************************************************************************************************
+// Scores co-occurrence for one set of shared abundance vectors.
+// Builds a presence/absence matrix co_matrix[bin][vector] from thisLookUp, scores it with the
+// selected metric, then generates `runs` null matrices with the selected null model (sim1-sim9),
+// scores each one, and writes metric, label, mean null score and p-value to out.
+// Returns 0 on completion or when m->control_pressed is seen, 1 when the metric (or, inside the
+// run loop, the null model) is not recognised.
int CooccurrenceCommand::getCooccurrence(vector<SharedRAbundVector*>& thisLookUp, ofstream& out){
- try {
+ try {
int numOTUS = thisLookUp[0]->getNumBins();
- vector< vector<int> > initmatrix; initmatrix.resize(thisLookUp.size());
vector< vector<int> > co_matrix; co_matrix.resize(thisLookUp[0]->getNumBins());
for (int i = 0; i < thisLookUp[0]->getNumBins(); i++) { co_matrix[i].resize((thisLookUp.size()), 0); }
- for (int i = 0; i < thisLookUp.size(); i++) { initmatrix[i].resize((thisLookUp[i]->getNumBins()), 0); }
vector<int> columntotal; columntotal.resize(thisLookUp.size(), 0);
vector<int> rowtotal; rowtotal.resize(numOTUS, 0);
- int rowcount = 0;
- for (int i = 0; i < thisLookUp.size(); i++) {
- for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) {
- if (m->control_pressed) { return 0; }
- int abund = thisLookUp[i]->getAbundance(j);
-
- if(abund > 0) {
- initmatrix[i][j] = 1;
+ for (int i = 0; i < thisLookUp.size(); i++) { //nrows in the shared file
+ for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) { //cols of original shared file
+ if (m->control_pressed) { return 0; }
+ int abund = thisLookUp[i]->getAbundance(j);
+
+ if(abund > 0) {
co_matrix[j][i] = 1;
- rowcount++;
- columntotal[j]++;
- }
- }
- rowtotal[i] = rowcount;
- rowcount = 0;
+ rowtotal[j]++;
+ columntotal[i]++;
+ }
+ }
}
//nrows is ncols of inital matrix. All the functions need this value. They assume the transposition has already taken place and nrows and ncols refer to that matrix.
//comatrix and initmatrix are still vectors of vectors of ints as in the original script. The abundancevector is only what was read in ie not a co-occurrence matrix!
- int ncols = numOTUS;//rows of inital matrix
- int nrows = thisLookUp.size();//groups
+ int nrows = numOTUS;//rows of inital matrix
+ int ncols = thisLookUp.size();//groups
double initscore = 0.0;
- //transpose matrix
- int newmatrows = ncols;
- int newmatcols = nrows;
-
- //swap for transposed matrix
- nrows = newmatrows;//ncols;
- ncols = newmatcols;//nrows;
- vector<int> initcolumntotal; initcolumntotal.resize(ncols, 0);
- vector<int> initrowtotal; initrowtotal.resize(nrows, 0);
vector<double> stats;
-
+ //heap-backed storage: a runtime-sized stack array is not standard C++ and can overflow the stack for large matrices
+ vector<double> probabilityMatrix((size_t)ncols * nrows, 0.0);
+ vector<vector<int> > nullmatrix(nrows, vector<int>(ncols, 0));
+
TrialSwap2 trial;
- initcolumntotal = rowtotal;
- initrowtotal = columntotal;
- trial.update_row_col_totals(co_matrix, rowtotal, columntotal);
+ int n = accumulate( columntotal.begin(), columntotal.end(), 0 ); //total number of filled cells in co_matrix
- if (metric == "cscore") { initscore = trial.calc_c_score(co_matrix, rowtotal); }
- else if (metric == "checker") { initscore = trial.calc_checker(co_matrix, rowtotal); }
- else if (metric == "vratio") { initscore = trial.calc_vratio(rowtotal, columntotal); }
- else if (metric == "combo") { initscore = trial.calc_combo(co_matrix); }
- else { m->mothurOut("[ERROR]: No metric selected!\n"); m->control_pressed = true; return 1; }
+ //============================================================
- m->mothurOut("Initial c score: " + toString(initscore)); m->mothurOutEndLine();
+ //generate a probability matrix. Only do this once.
+ //each cell holds a cumulative probability; start is the running sum (double, so the sum does not drift as it would in float)
+ double start = 0.0;
- //nullmatrix burn in
- for(int i=0;i<10000;i++) {
- if (m->control_pressed) { return 0; }
- if (matrix == "sim1") {
- trial.sim1(co_matrix);
- }else if (matrix == "sim2") {
- trial.sim2(co_matrix);
- }else if (matrix == "sim3") {
- trial.sim3(initmatrix);
- co_matrix = initmatrix;
- }else if (matrix == "sim4") {
- trial.sim4(columntotal, rowtotal, co_matrix);
- }else if (matrix == "sim5") {
- trial.sim5(initcolumntotal, initrowtotal, initmatrix);
- trial.transpose_matrix(initmatrix,co_matrix);
- }else if (matrix == "sim6") {
- trial.sim6(columntotal, co_matrix);
- }else if (matrix == "sim7") {
- trial.sim7(initcolumntotal, initmatrix);
- co_matrix = initmatrix;
- }else if (matrix == "sim8") {
- trial.sim8(columntotal, rowtotal, co_matrix);
- }else if (matrix == "sim9") {
- trial.swap_checkerboards (co_matrix);
- }else{
- m->mothurOut("[ERROR]: No model selected! \n");
- m->control_pressed = true;
+ if (matrix == "sim1") {
+ for(int i=0;i<nrows;i++) {
+ for(int j=0;j<ncols;j++) {
+ probabilityMatrix[ncols * i + j] = start + 1/(double(nrows)*ncols);
+ start = start + 1/(double(nrows)*ncols);
+ }
}
}
-
- //run
- for(int i=0;i<runs;i++) {
- if (m->control_pressed) { return 0; }
- //calc metric of nullmatrix
- if (matrix == "sim1") {
- trial.sim1(co_matrix);
- }else if (matrix == "sim2") {
- trial.sim2(co_matrix);
- }else if (matrix == "sim3") {
- trial.sim3(initmatrix);
- co_matrix = initmatrix;
- }else if (matrix == "sim4") {
- trial.sim4(columntotal, rowtotal, co_matrix);
- }else if (matrix == "sim5") {
- trial.sim5(initcolumntotal, initrowtotal, initmatrix);
- trial.transpose_matrix(initmatrix,co_matrix);
- }else if (matrix == "sim6") {
- trial.sim6(columntotal, co_matrix);
- }else if (matrix == "sim7") {
- trial.sim7(initcolumntotal, initmatrix);
- co_matrix = initmatrix;
- }else if (matrix == "sim8") {
- trial.sim8(columntotal, rowtotal, co_matrix);
- }else if (matrix == "sim9") {
- trial.swap_checkerboards (co_matrix);
- }else{
- m->mothurOut("[ERROR]: No model selected! \n");
- m->control_pressed = true;
+ //don't need a prob matrix because we just shuffle the rows, may use this in the future
+ else if (matrix == "sim2") { }
+// for(int i=0;i<nrows;i++) {
+// start = 0.0;
+// for(int j=0;j<ncols;j++) {
+// probabilityMatrix[ncols * i + j] = start + 1/double(ncols);
+// start = start + 1/double(ncols);
+// }
+// }
+// }
+
+ else if (matrix == "sim3") {
+ for(int j=0;j<ncols;j++) {
+ start = 0.0;
+ for(int i=0;i<nrows;i++) {
+ probabilityMatrix[ncols * i + j] = start + 1/double(nrows);
+ start = start + 1/double(nrows);
+ }
+ }
+ }
+
+ else if (matrix == "sim4") {
+ for(int i=0;i<nrows;i++) {
+ start = 0.0;
+ for(int j=0;j<ncols;j++) {
+ probabilityMatrix[ncols * i + j] = start + columntotal[j]/double(n);
+ start = start + columntotal[j]/double(n);
+ }
+ }
+ }
+
+ else if (matrix == "sim5") {
+ for(int j=0;j<ncols;j++) {
+ start = 0.0;
+ for(int i=0;i<nrows;i++) {
+ probabilityMatrix[ncols * i + j] = start + rowtotal[i]/double(n);
+ start = start + rowtotal[i]/double(n);
+ }
+ }
+ }
+
+ else if (matrix == "sim6") {
+ for(int i=0;i<nrows;i++) {
+ for(int j=0;j<ncols;j++) {
+ //denominator is formed in double so n*nrows cannot overflow int
+ probabilityMatrix[ncols * i + j] = start + columntotal[j]/(double(n)*nrows);
+ start = start + columntotal[j]/(double(n)*nrows);
+ }
+ }
+ }
+
+
+ else if (matrix == "sim7") {
+ for(int i=0;i<nrows;i++) {
+ for(int j=0;j<ncols;j++) {
+ //denominator is formed in double so n*ncols cannot overflow int
+ probabilityMatrix[ncols * i + j] = start + rowtotal[i]/(double(n)*ncols);
+ start = start + rowtotal[i]/(double(n)*ncols);
+ }
+ }
+ }
+
+ else if (matrix == "sim8") {
+ for(int i=0;i<nrows;i++) {
+ for(int j=0;j<ncols;j++) {
+ //products are formed in double so rowtotal*columntotal and n*n cannot overflow int
+ probabilityMatrix[ncols * i + j] = start + (double(rowtotal[i])*columntotal[j])/(double(n)*n);
+ start = start + (double(rowtotal[i])*columntotal[j])/(double(n)*n);
+ }
+ }
+ }
+ else if (matrix == "sim9" || matrix == "sim2") { }
+ else {
+ m->mothurOut("[ERROR]: No model selected! \n");
+ m->control_pressed = true;
+ }
+
+
+ //co_matrix is the transposed shared file: one row per bin, one column per vector in thisLookUp
+ if (metric == "cscore") { initscore = trial.calc_c_score(co_matrix, rowtotal, ncols, nrows); }
+ else if (metric == "checker") { initscore = trial.calc_checker(co_matrix, rowtotal, ncols, nrows); }
+ else if (metric == "vratio") { initscore = trial.calc_vratio(nrows, ncols, rowtotal, columntotal); }
+ else if (metric == "combo") { initscore = trial.calc_combo(nrows, ncols, co_matrix); }
+ else { m->mothurOut("[ERROR]: No metric selected!\n"); m->control_pressed = true; return 1; }
+
+ m->mothurOut("Initial c score: " + toString(initscore)); m->mothurOutEndLine();
+
+ //initialised so no branch below can read an indeterminate value
+ double previous = 0.0;
+ double current = 0.0;
+ double randnum = 0.0;
+ int count = 0;
+
+ //burn-in for sim9
+ if(matrix == "sim9") {
+ for(int i=0;i<10000;i++) trial.swap_checkerboards (co_matrix, ncols, nrows);
+ }
+
+ //populate null matrix from probability matrix, do this a lot.
+ for(int k=0;k<runs;k++){
+ nullmatrix.clear();
+ //zero-fill the null matrix
+ nullmatrix.assign(nrows, vector<int>(ncols, 0));
+
+ if(matrix == "sim1" || matrix == "sim6" || matrix == "sim8" || matrix == "sim7") {
+ count = 0;
+ //exactly n draws are placed; each draw marks the first cell whose cumulative probability interval contains randnum
+ while(count < n) {
+ if (m->control_pressed) { return 0; }
+ previous = 0.0;
+ randnum = rand() / double(RAND_MAX);
+ bool placed = false; //stops the scan after one hit so a single draw never marks more than one cell
+ for(int i=0;i<nrows && !placed;i++) {
+ for(int j=0;j<ncols;j++) {
+ current = probabilityMatrix[ncols * i + j];
+ if(randnum <= current && randnum > previous) {
+ nullmatrix[i][j] = 1;
+ count++;
+ placed = true;
+ break;
+ }
+ previous = current;
+ }
+ }
+ }
+ }
+
+ else if (matrix == "sim2") {
+ for(int i=0;i<nrows;i++) {
+ random_shuffle( co_matrix[i].begin(), co_matrix[i].end() );
+ }
+ //do this for the scoring since those all have nullmatrix as a parameter
+ //nullmatrix gets cleared at the beginning of each run
+ nullmatrix = co_matrix;
+ }
+
+ else if(matrix == "sim4") {
+ for(int i=0;i<nrows;i++) {
+ count = 0;
+ while(count < rowtotal[i]) {
+ previous = 0.0;
+ if (m->control_pressed) { return 0; }
+ randnum = rand() / double(RAND_MAX);
+ for(int j=0;j<ncols;j++) {
+ current = probabilityMatrix[ncols * i + j];
+ if(randnum <= current && randnum > previous && nullmatrix[i][j] != 1) {
+ nullmatrix[i][j] = 1;
+ count++;
+ previous = 0.0;
+ break;
+ }
+ previous = current;
+ }
+ }
+ }
+ }
+
+ else if(matrix == "sim3" || matrix == "sim5") {
+ //columns
+ for(int j=0;j<ncols;j++) {
+ count = 0;
+ while(count < columntotal[j]) {
+ previous = 0.0; //each draw scans the column's cumulative probabilities from the top, as sim4 does for rows
+ if (m->control_pressed) { return 0; }
+ randnum = rand() / double(RAND_MAX);
+ for(int i=0;i<nrows;i++) {
+ current = probabilityMatrix[ncols * i + j];
+ if(randnum <= current && randnum > previous && nullmatrix[i][j] != 1) {
+ nullmatrix[i][j] = 1;
+ count++;
+ previous = 0.0;
+ break;
+ }
+ previous = current;
+ }
+ }
+ }
+ }
+
+ //swap_checkerboards takes the original matrix and swaps checkerboards
+ else if(matrix == "sim9") {
+ trial.swap_checkerboards (co_matrix, ncols, nrows);
+ }
+ else {
+ m->mothurOut("[ERROR]: No null model selected!\n\n"); m->control_pressed = true;
+ return 1;
}
- //
- //
- trial.update_row_col_totals(co_matrix, rowtotal, columntotal);
- if (metric == "cscore") {
- stats.push_back(trial.calc_c_score(co_matrix, rowtotal));
- }else if (metric == "checker") {
- stats.push_back(trial.calc_checker(co_matrix, rowtotal));
- }else if (metric == "vratio") {
- stats.push_back(trial.calc_vratio(rowtotal, columntotal));
- }else if (metric == "combo") {
- stats.push_back(trial.calc_combo(co_matrix));
- }else {
- m->mothurOut("[ERROR]: No metric selected!\n");
- m->control_pressed = true;
+ //run metric on null matrix and add score to the stats vector
+ //NOTE(review): rowtotal/columntotal passed here are the totals of the observed matrix, not of nullmatrix; the sim9 branch also leaves nullmatrix all zero - confirm both are intended
+ if (metric == "cscore"){
+ stats.push_back(trial.calc_c_score(nullmatrix, rowtotal, ncols, nrows));
+ }
+ else if (metric == "checker") {
+ stats.push_back(trial.calc_checker(nullmatrix, rowtotal, ncols, nrows));
+ }
+ else if (metric == "vratio") {
+ stats.push_back(trial.calc_vratio(nrows, ncols, rowtotal, columntotal));
+ }
+ else if (metric == "combo") {
+ stats.push_back(trial.calc_combo(nrows, ncols, nullmatrix));
+ }
+ else {
+ m->mothurOut("[ERROR]: No metric selected!\n\n"); m->control_pressed = true;
return 1;
}
}
-
+
+
+
double total = 0.0;
- for (int i=0; i<stats.size();i++) { total+=stats[i]; }
+ for (int i=0; i<stats.size();i++) { total+=stats[i]; }
- double nullMean = double (total/(double)stats.size());
+ double nullMean = double (total/(double)stats.size());
m->mothurOutEndLine(); m->mothurOut("average metric score: " + toString(nullMean)); m->mothurOutEndLine();
double pvalue = 0.0;
- if (metric == "cscore" || metric == "checker") { pvalue = trial.calc_pvalue_greaterthan (stats, initscore); }
- else{ pvalue = trial.calc_pvalue_lessthan (stats, initscore); }
+ if (metric == "cscore" || metric == "checker") { pvalue = trial.calc_pvalue_greaterthan (stats, initscore); }
+ else{ pvalue = trial.calc_pvalue_lessthan (stats, initscore); }
m->mothurOut("pvalue: " + toString(pvalue)); m->mothurOutEndLine();
out << metric << '\t' << thisLookUp[0]->getLabel() << '\t' << nullMean << '\t' << pvalue << endl;
return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "CooccurrenceCommand", "Cooccurrence");
- exit(1);
- }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "CooccurrenceCommand", "Cooccurrence");
+ exit(1);
+ }
}
//**********************************************************************************************************************
*/
#include "deuniquetreecommand.h"
+#include "treereader.h"
//**********************************************************************************************************************
vector<string> DeuniqueTreeCommand::setParameters(){
}
}
- m->runParse = true;
- m->clearGroups();
- m->clearAllGroups();
- m->Treenames.clear();
- m->names.clear();
-
- //check for required parameters
+ //check for required parameters
treefile = validParameter.validFile(parameters, "tree", true);
if (treefile == "not open") { abort = true; }
else if (treefile == "not found") { //if there is a current design file, use it
m->setTreeFile(treefile);
- //extracts names from tree to make faked out groupmap
- Tree* tree = new Tree(treefile); delete tree;
- tmap = new TreeMap();
- for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
-
- if (m->control_pressed) { delete tmap; return 0; }
-
- readNamesFile();
-
- if (m->control_pressed) { delete tmap; return 0; }
-
- ReadTree* read = new ReadNewickTree(treefile);
- int readOk = read->read(tmap);
- if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-
- read->AssembleTrees();
- vector<Tree*> T = read->getTrees();
- delete read;
-
- //make sure all files match
- //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
- int numNamesInTree;
- if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
- else { numNamesInTree = m->Treenames.size(); }
-
- //output any names that are in group file but not in tree
- if (numNamesInTree < tmap->getNumSeqs()) {
- for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
- //is that name in the tree?
- int count = 0;
- for (int j = 0; j < m->Treenames.size(); j++) {
- if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
- count++;
- }
-
- if (m->control_pressed) {
- delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
- return 0;
- }
-
- //then you did not find it so report it
- if (count == m->Treenames.size()) {
- //if it is in your namefile then don't remove
- map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-
- if (it == nameMap.end()) {
- m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
- tmap->removeSeq(tmap->namesOfSeqs[i]);
- i--; //need this because removeSeq removes name from namesOfSeqs
- }
- }
- }
- }
-
+ TreeReader* reader = new TreeReader(treefile, "", namefile);
+ vector<Tree*> T = reader->getTrees();
+ map<string, string> nameMap = reader->getNameMap();
+ delete reader;
//print new Tree
string outputFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + "deunique.tre";
outputNames.push_back(outputFile); outputTypes["tree"].push_back(outputFile);
ofstream out;
m->openOutputFile(outputFile, out);
- T[0]->print(out, "deunique");
+ T[0]->print(out, nameMap);
out.close();
- delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+ delete (T[0]->getTreeMap());
+ for (int i = 0; i < T.size(); i++) { delete T[i]; }
//set phylip file as new current phylipfile
string current = "";
exit(1);
}
}
-/*****************************************************************/
-int DeuniqueTreeCommand::readNamesFile() {
- try {
- m->names.clear();
- numUniquesInName = 0;
-
- ifstream in;
- m->openInputFile(namefile, in);
-
- string first, second;
- map<string, string>::iterator itNames;
-
- while(!in.eof()) {
- in >> first >> second; m->gobble(in);
-
- numUniquesInName++;
-
- itNames = m->names.find(first);
- if (itNames == m->names.end()) {
- m->names[first] = second;
-
- //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
- vector<string> dupNames;
- m->splitAtComma(second, dupNames);
-
- for (int i = 0; i < dupNames.size(); i++) {
- nameMap[dupNames[i]] = dupNames[i];
- if (i != 0) { tmap->addSeq(dupNames[i], "Group1"); }
- }
- }else { m->mothurOut(first + " has already been seen in namefile, aborting."); m->mothurOutEndLine(); in.close(); m->names.clear(); m->control_pressed = true; return 1; }
- }
- in.close();
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "DeuniqueTreeCommand", "readNamesFile");
- exit(1);
- }
-}
/***********************************************************/
#include "command.hpp"
-#include "treemap.h"
#include "sharedutilities.h"
#include "readtree.h"
private:
- TreeMap* tmap;
int numUniquesInName;
bool abort;
mout->clearGroups();
mout->clearAllGroups();
mout->Treenames.clear();
- mout->names.clear();
mout->saveNextLabel = "";
mout->printedHeaders = false;
mout->commandInputsConvertError = false;
mout->clearGroups();
mout->clearAllGroups();
mout->Treenames.clear();
- mout->names.clear();
mout->saveNextLabel = "";
mout->printedHeaders = false;
mout->commandInputsConvertError = false;
mout->clearGroups();
mout->clearAllGroups();
mout->Treenames.clear();
- mout->names.clear();
mout->saveNextLabel = "";
mout->printedHeaders = false;
mout->commandInputsConvertError = false;
m->setTaxonomyFile("");
}else if (types[i] == "flow") {
m->setFlowFile("");
+ }else if (types[i] == "biom") {
+ m->setBiomFile("");
}else if (types[i] == "processors") {
m->setProcessors("1");
}else if (types[i] == "all") {
m->clearGroups();
m->clearAllGroups();
m->Treenames.clear();
- m->names.clear();
vector<string> tempOutNames;
outputTypes["tree"] = tempOutNames;
designMap->readDesignMap();
//fill Groups - checks for "all" and for any typo groups
- SharedUtil* util = new SharedUtil();
+ SharedUtil util;
vector<string> nameGroups = designMap->getNamesOfGroups();
- util->setGroups(Groups, nameGroups);
+ util.setGroups(Groups, nameGroups);
designMap->setNamesOfGroups(nameGroups);
- delete util;
//loop through the Groups and fill Globaldata's Groups with the design file info
vector<string> namesSeqs = designMap->getNamesSeqs(Groups);
else { for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } }
for (int i = 0; i < T.size(); i++) { delete T[i]; } delete treeMap; return 0;
}
-
- T[0]->assembleTree();
+
+ map<string, string> nameMap;
+ T[0]->assembleTree(nameMap);
/***************************************************/
// create ouptut tree - respecting pickedGroups //
Tree* outputTree = new Tree(m->getNumGroups(), treeMap);
outputTree->getSubTree(T[0], m->getGroups());
- outputTree->assembleTree();
+ outputTree->assembleTree(nameMap);
//no longer need original tree, we have output tree to use and label
for (int i = 0; i < T.size(); i++) { delete T[i]; }
-
if (m->control_pressed) {
if (designfile != "") { delete designMap; }
if (sharedfile != "") { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } }
if (m->control_pressed) { out.close(); return 0; }
- out << (j+1) << '\t' << indicatorValues[j] << '\t';
+ out << m->currentBinLabels[j] << '\t' << indicatorValues[j] << '\t';
if (pValues[j] > (1/(float)iters)) { out << pValues[j] << endl; }
else { out << "<" << (1/(float)iters) << endl; }
if (pValues[j] <= 0.05) {
- cout << "OTU" << j+1 << '\t' << indicatorValues[j] << '\t';
+ cout << m->currentBinLabels[j] << '\t' << indicatorValues[j] << '\t';
string pValueString = "<" + toString((1/(float)iters));
if (pValues[j] > (1/(float)iters)) { pValueString = toString(pValues[j]); cout << pValues[j];}
else { cout << "<" << (1/(float)iters); }
- m->mothurOutJustToLog("OTU" + toString(j+1) + "\t" + toString(indicatorValues[j]) + "\t" + pValueString);
+ m->mothurOutJustToLog(m->currentBinLabels[j] + "\t" + toString(indicatorValues[j]) + "\t" + pValueString);
m->mothurOutEndLine();
}
}
//print headings
out << "TreeNode\t";
- for (int i = 0; i < numBins; i++) { out << "OTU" << (i+1) << "_IndValue" << '\t' << "pValue" << '\t'; }
+ for (int i = 0; i < numBins; i++) { out << m->currentBinLabels[i] << "_IndValue" << '\t' << "pValue" << '\t'; }
out << endl;
m->mothurOutEndLine(); m->mothurOut("Node\tSpecies\tIndicatorValue\tpValue\n");
}
if (pValues[j] <= 0.05) {
- cout << i+1 << "\tOTU" << j+1 << '\t' << indicatorValues[j] << '\t';
+ cout << i+1 << '\t' << m->currentBinLabels[j] << '\t' << indicatorValues[j] << '\t';
string pValueString = "<" + toString((1/(float)iters));
if (pValues[j] > (1/(float)iters)) { pValueString = toString(pValues[j]); cout << pValues[j];}
else { cout << "<" << (1/(float)iters); }
- m->mothurOutJustToLog(toString(i) + "\tOTU" + toString(j+1) + "\t" + toString(indicatorValues[j]) + "\t" + pValueString);
+ m->mothurOutJustToLog(toString(i) + "\t" + m->currentBinLabels[j] + "\t" + toString(indicatorValues[j]) + "\t" + pValueString);
m->mothurOutEndLine();
}
}
--- /dev/null
+//
+// makebiomcommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 4/16/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "makebiomcommand.h"
+#include "sharedrabundvector.h"
+#include "inputdata.h"
+
+//taken from http://biom-format.org/documentation/biom_format.html
+/* Minimal Sparse
+ {
+ "id":null,
+ "format": "Biological Observation Matrix 0.9.1",
+ "format_url": "http://biom-format.org",
+ "type": "OTU table",
+ "generated_by": "QIIME revision 1.4.0-dev",
+ "date": "2011-12-19T19:00:00",
+ "rows":[
+ {"id":"GG_OTU_1", "metadata":null},
+ {"id":"GG_OTU_2", "metadata":null},
+ {"id":"GG_OTU_3", "metadata":null},
+ {"id":"GG_OTU_4", "metadata":null},
+ {"id":"GG_OTU_5", "metadata":null}
+ ],
+ "columns": [
+ {"id":"Sample1", "metadata":null},
+ {"id":"Sample2", "metadata":null},
+ {"id":"Sample3", "metadata":null},
+ {"id":"Sample4", "metadata":null},
+ {"id":"Sample5", "metadata":null},
+ {"id":"Sample6", "metadata":null}
+ ],
+ "matrix_type": "sparse",
+ "matrix_element_type": "int",
+ "shape": [5, 6],
+ "data":[[0,2,1],
+ [1,0,5],
+ [1,1,1],
+ [1,3,2],
+ [1,4,3],
+ [1,5,1],
+ [2,2,1],
+ [2,3,4],
+ [2,4,2],
+ [3,0,2],
+ [3,1,1],
+ [3,2,1],
+ [3,5,1],
+ [4,1,1],
+ [4,2,1]
+ ]
+ }
+ */
+/* Minimal dense
+ {
+ "id":null,
+ "format": "Biological Observation Matrix 0.9.1",
+ "format_url": "http://biom-format.org",
+ "type": "OTU table",
+ "generated_by": "QIIME revision 1.4.0-dev",
+ "date": "2011-12-19T19:00:00",
+ "rows":[
+ {"id":"GG_OTU_1", "metadata":null},
+ {"id":"GG_OTU_2", "metadata":null},
+ {"id":"GG_OTU_3", "metadata":null},
+ {"id":"GG_OTU_4", "metadata":null},
+ {"id":"GG_OTU_5", "metadata":null}
+ ],
+ "columns": [
+ {"id":"Sample1", "metadata":null},
+ {"id":"Sample2", "metadata":null},
+ {"id":"Sample3", "metadata":null},
+ {"id":"Sample4", "metadata":null},
+ {"id":"Sample5", "metadata":null},
+ {"id":"Sample6", "metadata":null}
+ ],
+ "matrix_type": "dense",
+ "matrix_element_type": "int",
+ "shape": [5,6],
+ "data": [[0,0,1,0,0,0],
+ [5,1,0,2,3,1],
+ [0,0,1,4,2,0],
+ [2,1,1,0,0,1],
+ [0,1,1,0,0,0]]
+ }
+ */
+//**********************************************************************************************************************
+//Registers every parameter make.biom accepts (in a fixed order) and returns the list of their names.
+vector<string> MakeBiomCommand::setParameters(){
+    try {
+        //input files
+        parameters.push_back(CommandParameter("shared", "InputTypes", "", "", "none", "none", "none",false,true));
+        parameters.push_back(CommandParameter("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,false));
+
+        //selection options
+        parameters.push_back(CommandParameter("groups", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("label", "String", "", "", "", "", "",false,false));
+
+        //directories
+        parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "",false,false));
+
+        //output layout: sparse (default) or dense
+        parameters.push_back(CommandParameter("matrixtype", "Multiple", "sparse-dense", "sparse", "", "", "",false,false));
+
+        //collect the names of everything registered above
+        vector<string> paramNames;
+        for (size_t p = 0; p < parameters.size(); p++) {
+            paramNames.push_back(parameters[p].name);
+        }
+        return paramNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeBiomCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Builds the help text shown for make.biom: the accepted parameters, their defaults and an example call.
+string MakeBiomCommand::getHelpString(){
+    try {
+        string helpString = "";
+        helpString += "The make.biom command parameters are shared, contaxonomy, groups, matrixtype and label.  shared is required, unless you have a valid current file.\n";
+        helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
+        helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
+        helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n";
+        //the original sentence here was garbled ("Be SURE that the you are the constaxonomy file distance matches ...")
+        helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the constaxonomy file distance matches the shared file distance.  ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled.\n";
+        helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n";
+        helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n";
+        helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
+        helpString += "The make.biom command outputs a .biom file.\n";
+        helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeBiomCommand", "getHelpString");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Default constructor: no option string was supplied, so the command is flagged as
+//aborted/help-only (execute() returns 0 immediately in that state) and only its
+//parameter list and the "biom" output type are registered.
+MakeBiomCommand::MakeBiomCommand(){
+    try {
+        abort = true;
+        calledHelp = true;
+
+        setParameters();
+
+        //register the output type with an empty list of file names
+        outputTypes["biom"] = vector<string>();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Constructs the command from the user's option string.
+//"help" and "citation" print their text and mark the command as aborted; otherwise the options are
+//parsed, every parameter name is validated, and the members sharedfile, contaxonomyfile, outputDir,
+//labels/allLines, Groups and format are filled in. Any invalid value sets abort = true so that
+//execute() does no work.
+MakeBiomCommand::MakeBiomCommand(string option) {
+ try {
+ abort = false; calledHelp = false;
+ allLines = 1;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; calledHelp = true; }
+ else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+ else {
+ vector<string> myArray = setParameters();
+
+ //NOTE: this local map shadows the parameter list named "parameters" that setParameters() fills
+ OptionParser parser(option);
+ map<string,string> parameters = parser.getParameters();
+ map<string,string>::iterator it;
+
+ ValidParameters validParameter;
+
+ //check to make sure all parameters are valid for command
+ for (it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //initialize outputTypes
+ vector<string> tempOutNames;
+ outputTypes["biom"] = tempOutNames;
+
+ //if the user supplied an input directory, prepend it to any input file given without a path
+ string inputDir = validParameter.validFile(parameters, "inputdir", false);
+ if (inputDir == "not found"){ inputDir = ""; }
+ else {
+ string path;
+ it = parameters.find("shared");
+ //user has given a shared file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["shared"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("contaxonomy");
+ //user has given a contaxonomy file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["contaxonomy"] = inputDir + it->second; }
+ }
+ }
+
+ //get shared file (required; falls back to the current shared file when not given)
+ sharedfile = validParameter.validFile(parameters, "shared", true);
+ if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+ else if (sharedfile == "not found") {
+ //if there is a current shared file, use it
+ sharedfile = m->getSharedFile();
+ if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
+ }else { m->setSharedFile(sharedfile); }
+
+
+ //output directory: use the user's outputdir if given, otherwise the directory of the shared file
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(sharedfile); }
+
+ //contaxonomy is optional; an empty string means "no taxonomy metadata"
+ contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true);
+ if (contaxonomyfile == "not found") { contaxonomyfile = ""; }
+ else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
+
+
+ //check for optional parameter and set defaults
+ // ...at some point should added some additional type checking...
+ //label: dash-separated list of labels, or "all" / not given to process every label
+ label = validParameter.validFile(parameters, "label", false);
+ if (label == "not found") { label = ""; }
+ else {
+ if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
+ else { allLines = 1; }
+ }
+
+ //groups: dash-separated list, also handed to m via setGroups
+ groups = validParameter.validFile(parameters, "groups", false);
+ if (groups == "not found") { groups = ""; }
+ else {
+ m->splitAtDash(groups, Groups);
+ m->setGroups(Groups);
+ }
+
+ //a single contaxonomy file cannot be paired with more than one label
+ if ((contaxonomyfile != "") && (labels.size() > 1)) { m->mothurOut("[ERROR]: the contaxonomy parameter cannot be used with multiple labels."); m->mothurOutEndLine(); abort = true; }
+
+ //matrixtype defaults to sparse; only sparse and dense are accepted
+ format = validParameter.validFile(parameters, "matrixtype", false); if (format == "not found") { format = "sparse"; }
+
+ if ((format != "sparse") && (format != "dense")) {
+ m->mothurOut(format + " is not a valid option for the matrixtype parameter. Options are sparse and dense."); m->mothurOutEndLine(); abort = true;
+ }
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+//Runs make.biom: reads the shared file label by label and calls getBiom() for every label to be processed.
+//Returns 0 on success, when help/citation was requested, or when the run is interrupted via
+//m->control_pressed (in which case the files written so far are removed); returns 2 when the
+//constructor flagged invalid parameters.
+int MakeBiomCommand::execute(){
+ try {
+
+ if (abort == true) { if (calledHelp) { return 0; } return 2; }
+
+ InputData input(sharedfile, "sharedfile");
+ vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+ string lastLabel = lookup[0]->getLabel();
+
+ //if a contaxonomy file was given but no label, restrict processing to the first label in the shared file
+ if ((contaxonomyfile != "") && (labels.size() == 0)) {
+ allLines = 0;
+ labels.insert(lastLabel);
+ }
+
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ set<string> processedLabels;
+ set<string> userLabels = labels;
+
+ //as long as you are not at the end of the file or done with the lines you want
+ while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } return 0; }
+
+ //this label was requested (or everything was requested): write its biom file
+ if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
+
+ m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+ getBiom(lookup);
+
+ processedLabels.insert(lookup[0]->getLabel());
+ userLabels.erase(lookup[0]->getLabel());
+ }
+
+ //presumably anyLabelsToProcess reports that a requested label was passed over without an exact match - TODO confirm;
+ //in that case re-read and process the previous label (lastLabel) unless it was already processed
+ if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
+
+ for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+ lookup = input.getSharedRAbundVectors(lastLabel);
+ m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+
+ getBiom(lookup);
+
+ processedLabels.insert(lookup[0]->getLabel());
+ userLabels.erase(lookup[0]->getLabel());
+
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
+ }
+
+ lastLabel = lookup[0]->getLabel();
+
+ //prevent memory leak and get next set
+ for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
+ lookup = input.getSharedRAbundVectors();
+ }
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) {
+ m->mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(lastLabel) != 1) {
+ m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+ needToRun = true;
+ }else {
+ m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+ }
+ }
+
+ //run last label if you need to
+ if (needToRun == true) {
+ for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
+ lookup = input.getSharedRAbundVectors(lastLabel);
+
+ m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+ getBiom(lookup);
+
+ for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+ }
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ //set the first biom file written as the new current biom file
+ string current = "";
+ itTypes = outputTypes.find("biom");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setBiomFile(current); }
+ }
+
+
+ //list every file this command created
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "MakeBiomCommand", "execute");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//Writes one biom (JSON) file for the label held in lookup.
+//  lookup - one SharedRAbundVector per group (sample) for a single label; lookup[0] must exist.
+//The file is named <outputDir><shared root><label>.biom and is registered in outputNames/outputTypes["biom"].
+//Rows are the OTU labels in m->currentBinLabels, columns are the groups, and the data section is
+//written as sparse [row,col,count] triples or as dense rows depending on the format member.
+//Returns 0, also when interrupted through m->control_pressed (the caller removes the partial file).
+//Every list is emitted with a "comma after each element except the last" loop so that an empty list
+//produces [] instead of indexing element size()-1 of an empty vector.
+int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
+    try {
+
+        string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + lookup[0]->getLabel() + ".biom";
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+        outputNames.push_back(outputFileName); outputTypes["biom"].push_back(outputFileName);
+
+        string mothurString = "mothur" + toString(m->getVersion());
+        time_t rawtime;
+        struct tm * timeinfo;
+        time ( &rawtime );
+        timeinfo = localtime ( &rawtime );
+        string dateString = asctime (timeinfo);
+        //asctime ends its result with a newline, which must not end up inside the JSON string
+        string::size_type newlinePos = dateString.find('\n');
+        if (newlinePos != string::npos) { dateString = dateString.substr(0, newlinePos);}
+        string spaces = " ";
+
+        //standard
+        out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
+        out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
+
+        vector<string> metadata = getMetaData(lookup);
+
+        if (m->control_pressed) {  out.close(); return 0; }
+
+        //get row info
+        /*"rows":[
+         {"id":"GG_OTU_1", "metadata":null},
+         {"id":"GG_OTU_2", "metadata":null}
+         ],*/
+        out << spaces + "\"rows\":[\n";
+        string rowFront = spaces + spaces + "{\"id\":\"";
+        string rowBack = "\", \"metadata\":";
+        const size_t numRows = m->currentBinLabels.size();
+        for (size_t i = 0; i < numRows; i++) {
+            if (m->control_pressed) {  out.close(); return 0; }
+            out << rowFront << m->currentBinLabels[i] << rowBack << metadata[i] << "}";
+            if (i + 1 < numRows) { out << ","; }
+            out << "\n";
+        }
+        out << spaces + "],\n";
+
+        //get column info
+        /*"columns": [
+         {"id":"Sample1", "metadata":null},
+         {"id":"Sample2", "metadata":null}
+         ],*/
+        string colBack = "\", \"metadata\":null}";
+        out << spaces + "\"columns\":[\n";
+        const size_t numCols = lookup.size();
+        for (size_t i = 0; i < numCols; i++) {
+            if (m->control_pressed) {  out.close(); return 0; }
+            out << rowFront << lookup[i]->getGroup() << colBack;
+            if (i + 1 < numCols) { out << ","; }
+            out << "\n";
+        }
+        out << spaces + "],\n";
+
+        out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n";
+        out << spaces + "\"shape\": [" << m->currentBinLabels.size() << "," << lookup.size() << "],\n";
+        out << spaces + "\"data\":  [";
+
+        vector<string> dataRows;
+        if (format == "sparse") {
+            /*"data":[[0,2,1],
+             [1,0,5],
+             [4,2,1]
+             ]*/
+            for (int i = 0; i < lookup[0]->getNumBins(); i++) {
+
+                if (m->control_pressed) { out.close(); return 0; }
+
+                for (size_t j = 0; j < numCols; j++) {
+                    //only print non zero values
+                    if (lookup[j]->getAbundance(i) != 0) {
+                        string binInfo = "[" + toString(i) + "," + toString(j) + "," + toString(lookup[j]->getAbundance(i)) + "]";
+                        dataRows.push_back(binInfo);
+                    }
+                }
+            }
+        }else {
+
+            /* "data":  [[0,0,1,0,0,0],
+             [5,1,0,2,3,1],
+             [0,1,1,0,0,0]]*/
+
+            for (int i = 0; i < lookup[0]->getNumBins(); i++) {
+
+                if (m->control_pressed) { out.close(); return 0; }
+
+                string binInfo = "[";
+                for (size_t j = 0; j < numCols; j++) {
+                    if (j != 0) { binInfo += ","; }
+                    binInfo += toString(lookup[j]->getAbundance(i));
+                }
+                binInfo += "]";
+                dataRows.push_back(binInfo);
+            }
+        }
+
+        //dataRows can legitimately be empty (sparse output with no non-zero counts)
+        for (size_t i = 0; i < dataRows.size(); i++) {
+            if (i != 0) { out << ",\n" + spaces + spaces; }
+            out << dataRows[i];
+        }
+        out << "]\n";
+
+        out << "}\n";
+        out.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeBiomCommand", "getBiom");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Builds the JSON "metadata" value for every row (OTU) of the biom file.
+//  lookup - shared vectors for the label being written; only lookup[0]->getNumBins() is used here.
+//Without a contaxonomy file every entry is the literal "null". Otherwise the contaxonomy file is read,
+//its OTU numbers are turned into bin labels, and for each label in m->currentBinLabels an object
+//{"taxonomy":[...]} (plus "bootstrap":[...] when confidence scores are present) is produced.
+//If a bin label has no taxonomy an error is reported and m->control_pressed is set; the vector
+//returned in that case is incomplete and must not be used by the caller.
+vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup){
+    try {
+        vector<string> metadata;
+
+        if (contaxonomyfile == "") { for (int i = 0; i < lookup[0]->getNumBins(); i++) {  metadata.push_back("null");  } }
+        else {
+
+            //read constaxonomy file storing in a map, otulabel -> taxonomy
+            //constaxonomy file will most likely contain more labels than the shared file, because sharedfile could have been subsampled.
+            ifstream in;
+            m->openInputFile(contaxonomyfile, in);
+
+            //grab headers
+            m->getline(in); m->gobble(in);
+
+            string otuLabel, tax;
+            int size;
+            vector<string> otuLabels;
+            vector<string> taxs;
+            while (!in.eof()) {
+
+                if (m->control_pressed) { in.close(); return metadata; }
+
+                in >> otuLabel >> size >> tax; m->gobble(in);
+
+                otuLabels.push_back(otuLabel);
+                taxs.push_back(tax);
+            }
+            in.close();
+
+            //should the labels be Otu001 or PhyloType001
+            string firstBin = m->currentBinLabels[0];
+            string binTag = "Otu";
+            if ((firstBin.find("Otu")) == string::npos) { binTag = "PhyloType";  }
+
+            //convert list file bin labels to shared file bin labels
+            //parse tax strings
+            //save in map
+            map<string, string> labelTaxMap;
+            string snumBins = toString(otuLabels.size());
+            for (size_t i = 0; i < otuLabels.size(); i++) {
+
+                if (m->control_pressed) { return metadata; }
+
+                //zero-pad the OTU number to the width of the largest one, then prefix the tag
+                string binLabel = binTag;
+                string sbinNumber = otuLabels[i];
+                if (sbinNumber.length() < snumBins.length()) {
+                    int diff = snumBins.length() - sbinNumber.length();
+                    for (int h = 0; h < diff; h++) { binLabel += "0"; }
+                }
+                binLabel += sbinNumber;
+
+                labelTaxMap[binLabel] = taxs[i];
+            }
+
+
+            //{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}
+
+            //traverse the binLabels forming the metadata strings and saving them
+            //make sure to sanity check
+            map<string, string>::iterator it;
+            for (size_t i = 0; i < m->currentBinLabels.size(); i++) {
+
+                if (m->control_pressed) { return metadata; }
+
+                it = labelTaxMap.find(m->currentBinLabels[i]);
+
+                if (it == labelTaxMap.end()) { m->mothurOut("[ERROR]: can't find taxonomy information for " + m->currentBinLabels[i] + ".\n"); m->control_pressed = true; }
+                else {
+                    vector<string> scores;
+                    vector<string> taxonomies = parseTax(it->second, scores);
+
+                    //comma-separate the quoted taxa; an empty list (no ';' in the taxonomy string) yields []
+                    string data = "{\"taxonomy\":[";
+                    for (size_t j = 0; j < taxonomies.size(); j++) {
+                        if (j != 0) { data += ", "; }
+                        data += "\"" + taxonomies[j] + "\"";
+                    }
+                    data += "]";
+
+                    //add bootstrap values if available - scores may be empty, so check before reading scores[0]
+                    if (!scores.empty() && (scores[0] != "null")) {
+                        data += ", \"bootstrap\":[";
+                        for (size_t j = 0; j < scores.size(); j++) {
+                            if (j != 0) { data += ", "; }
+                            data += scores[j];
+                        }
+                        data += "]";
+                    }
+                    data += "}";
+
+                    metadata.push_back(data);
+                }
+            }
+        }
+
+        return metadata;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeBiomCommand", "getMetadata");
+        exit(1);
+    }
+
+}
+/**************************************************************************************************/
+//Splits a ';'-terminated taxonomy string into its taxa.
+//  tax    - e.g. Bacteria(100);Bacteroidetes(98);  (text after the last ';' is ignored)
+//  scores - receives exactly one entry per returned taxon: the confidence found in a trailing
+//           "(number)" or the literal "null" when the taxon carries no numeric confidence.
+//returns {Bacteria, Bacteroidetes, ..} with the numeric confidence and any double quotes stripped.
+//If m->control_pressed is set, the taxa collected so far are returned.
+vector<string> MakeBiomCommand::parseTax(string tax, vector<string>& scores) {
+    try {
+
+        vector<string> taxs;
+
+        string::size_type semi = tax.find_first_of(';');
+        while (semi != string::npos) {
+
+            if (m->control_pressed) { return taxs; }
+
+            //get taxon
+            string taxon = tax.substr(0, semi);
+
+            //look for a confidence score; default to "null" so scores always stays parallel to taxs
+            string score = "null";
+            string::size_type open = taxon.find_last_of('(');
+            if (open != string::npos) {
+                string::size_type close = taxon.find_last_of(')');
+                if (close != string::npos) {
+                    //is it a number?
+                    string confidenceScore = taxon.substr(open+1, (close-(open+1)));
+                    if (m->isNumeric1(confidenceScore)) {
+                        taxon = taxon.substr(0, open); //rip off confidence
+                        score = confidenceScore;
+                    }
+                }
+            }
+            scores.push_back(score);
+
+            //strip "" if they are there
+            if (taxon.find("\"") != string::npos) {
+                string newTax = "";
+                for (string::size_type k = 0; k < taxon.length(); k++) {
+                    if (taxon[k] != '\"') { newTax += taxon[k]; }
+                }
+                taxon = newTax;
+            }
+
+            taxs.push_back(taxon);
+
+            //continue with whatever follows this ';'
+            tax = tax.substr(semi+1);
+            semi = tax.find_first_of(';');
+        }
+
+        return taxs;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeBiomCommand", "parseTax");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+
+
+
--- /dev/null
+#ifndef Mothur_makebiomcommand_h
+#define Mothur_makebiomcommand_h
+
+//
+// makebiomcommand.h
+// Mothur
+//
+// Created by Sarah Westcott on 4/16/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+
+#include "command.hpp"
+#include "sharedrabundvector.h"
+#include "inputdata.h"
+
+
+//make.biom: converts a mothur shared file (optionally with consensus taxonomy) into biom file(s).
+class MakeBiomCommand : public Command {
+
+public:
+    //construction
+    MakeBiomCommand(string);
+    MakeBiomCommand();
+    ~MakeBiomCommand(){}
+
+    //command description
+    vector<string> setParameters();
+    string getCommandName() {
+        return "make.biom";
+    }
+    string getCommandCategory() {
+        return "General";
+    }
+    string getHelpString();
+    string getCitation() {
+        return "http://biom-format.org/documentation/biom_format.html, http://www.mothur.org/wiki/Make.biom";
+    }
+    string getDescription() {
+        return "creates a biom file";
+    }
+
+    //running the command
+    int execute();
+    void help() {
+        m->mothurOut(getHelpString());
+    }
+
+private:
+
+    //parameter values as parsed by the constructor
+    string sharedfile;
+    string contaxonomyfile;
+    string groups;
+    string outputDir;
+    string format;          //"sparse" or "dense"
+    string label;
+    vector<string> outputNames;
+    vector<string> Groups;
+    set<string> labels;
+
+    bool abort;
+    bool allLines;
+
+    //helpers used by execute()
+    int getBiom(vector<SharedRAbundVector*>&);
+    vector<string> getMetaData(vector<SharedRAbundVector*>&);
+    vector<string> parseTax(string tax, vector<string>& scores);
+};
+
+
+#endif
CYGWIN_BUILD ?= no
USECOMPRESSION ?= no
MOTHUR_FILES="\"Enter_your_default_path_here\""
-RELEASE_DATE = "\"3/16/2012\""
-VERSION = "\"1.24.1\""
+RELEASE_DATE = "\"4/30/2012\""
+VERSION = "\"1.25.0\""
FORTAN_COMPILER = gfortran
FORTRAN_FLAGS =
outputNames.pop_back();
}else {
- ofstream outTemp;
- string tempOut = outputDir + "data." + setA + "-" + setB + ".matrix";
- m->openOutputFile(tempOut, outTemp);
- for (int i = 0; i < subset.size(); i++) { outTemp << '\t' << subset[i]->getGroup(); }
- outTemp << endl;
-
-
//fill data
for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
//data[j] = new double[subset.size()];
data2[j].resize(subset.size(), 0.0);
- outTemp << "OTU" << (j+1);
+
for (int i = 0; i < subset.size(); i++) {
data2[j][i] = (subset[i]->getAbundance(j));
- outTemp << '\t' << subset[i]->getAbundance(j);
}
- outTemp << endl;
}
- outTemp.close();
+
m->mothurOut("Comparing " + setA + " and " + setB + "..."); m->mothurOutEndLine();
//metastat_main(output, thisLookUp[0]->getNumBins(), subset.size(), threshold, iters, data, setACount);
if (taxonomyfile != "") { mothurOut("taxonomy=" + taxonomyfile); mothurOutEndLine(); }
if (treefile != "") { mothurOut("tree=" + treefile); mothurOutEndLine(); }
if (flowfile != "") { mothurOut("flow=" + flowfile); mothurOutEndLine(); }
+ if (biomfile != "") { mothurOut("biom=" + biomfile); mothurOutEndLine(); }
if (processors != "1") { mothurOut("processors=" + processors); mothurOutEndLine(); }
}
if (taxonomyfile != "") { return true; }
if (treefile != "") { return true; }
if (flowfile != "") { return true; }
+ if (biomfile != "") { return true; }
if (processors != "1") { return true; }
return hasCurrent;
accnosfile = "";
taxonomyfile = "";
flowfile = "";
+ biomfile = "";
processors = "1";
}
catch(exception& e) {
}
/***********************************************************************/
+// Normalizes dirName in place and checks that a file can be created inside it.
+//   dirName - in/out: gets a trailing path separator appended if it lacks one and is
+//             then replaced by the result of getFullPathName().
+//   returns - true when a probe file could be opened in the directory, false otherwise
+//             (a message is printed when the probe fails). An empty dirName returns
+//             false immediately and is left untouched.
+// If an unexpected exception is thrown, the error is reported through errorOut() and the
+// program exits with status 1.
+bool MothurOut::dirCheck(string& dirName){
+    try {
+        // An empty name has no last character: dirName.length()-1 would wrap around and
+        // substr() would throw std::out_of_range, which the handler below turns into exit(1).
+        if (dirName.empty()) { return false; }
+
+        string tag = "";
+        #ifdef USE_MPI
+            int pid;
+            MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+            // the rank becomes part of the probe file name, so ranks use different probe files
+            tag = toString(pid);
+        #endif
+
+        //add / to name if needed
+        string lastChar = dirName.substr(dirName.length()-1);
+        #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+            if (lastChar != "/") { dirName += "/"; }
+        #else
+            if (lastChar != "\\") { dirName += "\\"; }
+        #endif
+
+        //test to make sure directory exists
+        dirName = getFullPathName(dirName);
+
+        // Probe by opening "<dirName><tag>temp" for writing. ios::trunc means an existing
+        // file with that name is emptied; on success it is then passed to mothurRemove().
+        string outTemp = dirName + tag + "temp";
+        ofstream out;
+        out.open(outTemp.c_str(), ios::trunc);
+        if(!out) {
+            mothurOut(dirName + " directory does not exist or is not writable."); mothurOutEndLine();
+            return false;
+        }
+
+        out.close();
+        mothurRemove(outTemp);
+        return true;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "dirCheck");
+        exit(1);
+    }
+
+}
+/***********************************************************************/
+
string MothurOut::hasPath(string longName){
try {
string path = "";
int numLines = 0;
if (ableToOpen == 0) { //you opened it
- while(char c = input.get()){
+ while(!input.eof()){
+ char c = input.get();
if(input.eof()) { break; }
else { output << c; if (c == '\n') {numLines++;} }
}
if (control_pressed) { break; }
string firstCol, secondCol;
- in >> firstCol >> secondCol; gobble(in);
+ in >> firstCol; gobble(in);
+ in >> secondCol; gobble(in);
int num = getNumNames(secondCol);
exit(1);
}
}
+/***********************************************************************/
+//Splits suffix at the first occurrence of c.
+//  prefix - out: receives the text of suffix that precedes c.
+//  suffix - in/out: when at least one character follows c, it is reduced to the text
+//           after c with leading spaces removed (possibly leaving it empty). When c is
+//           the last character, suffix is left unchanged.
+//  If c does not occur, find_first_of() returns npos and the unsigned arithmetic below
+//  wraps (npos+1 == 0, npos+2 == 1): prefix receives a copy of the whole string and a
+//  non-empty suffix only loses its leading spaces.
+//  If an unexpected exception is thrown, it is reported through errorOut() and the
+//  program exits with status 1.
+void MothurOut::splitAtChar(string& prefix, string& suffix, char c){
+    try {
+        const string::size_type pos = suffix.find_first_of(c);
+        prefix = suffix.substr(0, pos);
+
+        //only cut when something follows the delimiter, i.e. c is not the last character
+        if ((pos + 2) <= suffix.length()) {
+            suffix = suffix.substr(pos + 1);
+
+            //drop leading spaces; find_first_not_of() returns npos for an all-space string,
+            //which makes erase() clear it instead of indexing past the end
+            suffix.erase(0, suffix.find_first_not_of(' '));
+        }
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "splitAtChar");
+        exit(1);
+    }
+}
+
/***********************************************************************/
//This function splits up the various option parameters
vector<string> getAllGroups() { sort(namesOfGroups.begin(), namesOfGroups.end()); return namesOfGroups; }
vector<string> Treenames;
- map<string, string> names;
+ //map<string, string> names;
vector<string> binLabelsInFile;
vector<string> currentBinLabels;
string saveNextLabel, argv, sharedHeaderMode;
//functions from mothur.h
//file operations
+ bool dirCheck(string&); //completes path, appends appropriate / or \, makes sure dir is writable.
vector<unsigned long long> divideFile(string, int&);
int divideFile(string, int&, vector<string>&);
vector<unsigned long long> setFilePosEachLine(string, int&);
void splitAtDash(string&, set<string>&);
void splitAtDash(string&, vector<string>&);
void splitAtChar(string&, vector<string>&, char);
+ void splitAtChar(string&, string&, char);
int removeConfidences(string&);
//math operation
string getAccnosFile() { return accnosfile; }
string getTaxonomyFile() { return taxonomyfile; }
string getFlowFile() { return flowfile; }
+ string getBiomFile() { return biomfile; }
string getProcessors() { return processors; }
void setListFile(string f) { listfile = getFullPathName(f); }
void setAccnosFile(string f) { accnosfile = getFullPathName(f); }
void setTaxonomyFile(string f) { taxonomyfile = getFullPathName(f); }
void setFlowFile(string f) { flowfile = getFullPathName(f); }
+ void setBiomFile(string f) { biomfile = getFullPathName(f); }
void setProcessors(string p) { processors = p; }
void printCurrentFiles();
taxonomyfile = "";
processors = "1";
flowfile = "";
+ biomfile = "";
gui = false;
printedHeaders = false;
commandInputsConvertError = false;
string defaultPath, outputDir;
string releaseDate, version;
- string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile;
+ string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile, biomfile;
string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile;
vector<string> Groups;
it->second = m->getAccnosFile();
}else if (it->first == "taxonomy") {
it->second = m->getTaxonomyFile();
+ }else if (it->first == "biom") {
+ it->second = m->getBiomFile();
}else {
m->mothurOut("[ERROR]: mothur does not save a current file for " + it->first); m->mothurOutEndLine();
}
else if (method == "kendall") { coef = linear.calcKendall(xy[i], xy[k], sig); }
else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; }
- if (m->binLabelsInFile.size() != 0) { out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; }
- else { out << i+1 << '\t' << k+1 << '\t' << coef << '\t' << sig << endl; }
+ out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl;
}
}
else if (method == "kendall") { coef = linear.calcKendall(xy[i], xy[k], sig); }
else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; }
- if (m->binLabelsInFile.size() != 0) { out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; }
- else { out << i+1 << '\t' << k+1 << '\t' << coef << '\t' << sig << endl; }
+ out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl;
}
}
temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; }
m->mothurConvert(temp, misMatch);
+ if (misMatch > 0) { m->mothurOut("[ERROR]: mismatch must be negative.\n"); abort=true; }
temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; }
m->mothurConvert(temp, gapOpen);
+ if (gapOpen > 0) { m->mothurOut("[ERROR]: gapopen must be negative.\n"); abort=true; }
temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; }
m->mothurConvert(temp, gapExtend);
+ if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
try {
processors = p;
outputDir = o;
+ TreeMap* tmap = t->getTreeMap();
//if the users enters no groups then give them the score of all groups
vector<string> mGroups = m->getGroups();
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
- data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+ data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
}else{
lines.clear();
int numPairs = namesOfGroupCombos.size();
lines.push_back(linePair(startPos, numPairsPerProcessor));
}
- data = createProcesses(t, namesOfGroupCombos);
+ data = createProcesses(t, namesOfGroupCombos, tmap);
}
#else
- data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+ data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
#endif
return data;
}
/**************************************************************************************************/
-EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
+EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
process++;
}else if (pid == 0){
EstOutput myresults;
- myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num);
+ myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
if (m->control_pressed) { exit(0); }
}
}
- results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num);
+ results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
//force parent to wait until all the processes are done
for (int i=0;i<processIDS.size();i++) {
}
}
/**************************************************************************************************/
-EstOutput Parsimony::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num) {
+EstOutput Parsimony::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) {
try {
EstOutput results; results.resize(num);
class Parsimony : public TreeCalculator {
public:
- Parsimony(TreeMap* t) : tmap(t) {};
+ Parsimony() {};
~Parsimony() {};
EstOutput getValues(Tree*, int, string);
- //EstOutput getValues(Tree*, string, string) { return data; }
private:
struct linePair {
vector<linePair> lines;
EstOutput data;
- TreeMap* tmap;
int processors;
string outputDir;
- EstOutput driver(Tree*, vector< vector<string> >, int, int);
- EstOutput createProcesses(Tree*, vector< vector<string> >);
+ EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*);
+ EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
};
/***********************************************************************/
*/
#include "parsimonycommand.h"
+#include "treereader.h"
//**********************************************************************************************************************
vector<string> ParsimonyCommand::setParameters(){
}
}
- m->runParse = true;
- m->clearGroups();
- m->clearAllGroups();
- m->Treenames.clear();
- m->names.clear();
-
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
randomtree = validParameter.validFile(parameters, "random", false); if (randomtree == "not found") { randomtree = ""; }
m->setTreeFile(treefile);
- if (groupfile != "") {
- //read in group map info.
- tmap = new TreeMap(groupfile);
- tmap->readMap();
- }else{ //fake out by putting everyone in one group
- Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
- tmap = new TreeMap();
-
- for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
- }
-
- if (namefile != "") { readNamesFile(); }
-
- read = new ReadNewickTree(treefile);
- int readOk = read->read(tmap);
-
- if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-
- read->AssembleTrees();
- T = read->getTrees();
- delete read;
-
- //make sure all files match
- //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
- int numNamesInTree;
- if (namefile != "") {
- if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
- else { numNamesInTree = m->Treenames.size(); }
- }else { numNamesInTree = m->Treenames.size(); }
-
-
- //output any names that are in group file but not in tree
- if (numNamesInTree < tmap->getNumSeqs()) {
- for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
- //is that name in the tree?
- int count = 0;
- for (int j = 0; j < m->Treenames.size(); j++) {
- if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
- count++;
- }
-
- if (m->control_pressed) {
- delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
- return 0;
- }
-
- //then you did not find it so report it
- if (count == m->Treenames.size()) {
- //if it is in your namefile then don't remove
- map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-
- if (it == nameMap.end()) {
- m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
- tmap->removeSeq(tmap->namesOfSeqs[i]);
- i--; //need this because removeSeq removes name from namesOfSeqs
- }
- }
- }
- }
-
+ TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+ T = reader->getTrees();
+ tmap = T[0]->getTreeMap();
+ delete reader;
+
if(outputDir == "") { outputDir += m->hasPath(treefile); }
output = new ColumnFile(outputDir + m->getSimpleName(treefile) + ".parsimony", itersString);
outputNames.push_back(outputDir + m->getSimpleName(treefile) + ".parsimony");
}
//set users groups to analyze
- util = new SharedUtil();
+ SharedUtil util;
vector<string> mGroups = m->getGroups();
vector<string> tGroups = tmap->getNamesOfGroups();
- util->setGroups(mGroups, tGroups, allGroups, numGroups, "parsimony"); //sets the groups the user wants to analyze
- util->getCombos(groupComb, mGroups, numComp);
+ util.setGroups(mGroups, tGroups, allGroups, numGroups, "parsimony"); //sets the groups the user wants to analyze
+ util.getCombos(groupComb, mGroups, numComp);
m->setGroups(mGroups);
- delete util;
if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
- pars = new Parsimony(tmap);
+ Parsimony pars;
counter = 0;
Progress* reading;
reading = new Progress("Comparing to random:", iters);
if (m->control_pressed) {
- delete reading; delete pars; delete output;
+ delete reading; delete output;
delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
if (randomtree == "") { outSum.close(); }
for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
if (randomtree == "") {
//get pscores for users trees
for (int i = 0; i < T.size(); i++) {
- userData = pars->getValues(T[i], processors, outputDir); //data = AB, AC, BC, ABC.
+ userData = pars.getValues(T[i], processors, outputDir); //data = AB, AC, BC, ABC.
if (m->control_pressed) {
- delete reading; delete pars; delete output;
+ delete reading; delete output;
delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
if (randomtree == "") { outSum.close(); }
for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
randT->assembleRandomTree();
//get pscore of random tree
- randomData = pars->getValues(randT, processors, outputDir);
+ randomData = pars.getValues(randT, processors, outputDir);
if (m->control_pressed) {
- delete reading; delete pars; delete output; delete randT;
+ delete reading; delete output; delete randT;
if (randomtree == "") { outSum.close(); }
for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
randT->assembleRandomTree();
if (m->control_pressed) {
- delete reading; delete pars; delete output; delete randT;
- delete tmap;
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
- return 0;
+ delete reading; delete output; delete randT; delete tmap;
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;
}
//get pscore of random tree
- randomData = pars->getValues(randT, processors, outputDir);
+ randomData = pars.getValues(randT, processors, outputDir);
if (m->control_pressed) {
- delete reading; delete pars; delete output; delete randT;
- delete tmap;
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
- return 0;
+ delete reading; delete output; delete randT; delete tmap;
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;
}
for(int r = 0; r < numComp; r++) {
}
if (m->control_pressed) {
- delete reading; delete pars; delete output;
+ delete reading; delete output;
delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
if (randomtree == "") { outSum.close(); }
for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
return 0;
}
//finish progress bar
reading->finish();
delete reading;
-
printParsimonyFile();
if (randomtree == "") { printUSummaryFile(); }
-
- //reset groups parameter
- m->clearGroups();
-
- delete pars; delete output;
- delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+
+ delete output; delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;}
exit(1);
}
}
-/*****************************************************************/
-int ParsimonyCommand::readNamesFile() {
- try {
- m->names.clear();
- numUniquesInName = 0;
-
- ifstream in;
- m->openInputFile(namefile, in);
-
- string first, second;
- map<string, string>::iterator itNames;
-
- while(!in.eof()) {
- in >> first >> second; m->gobble(in);
-
- numUniquesInName++;
-
- itNames = m->names.find(first);
- if (itNames == m->names.end()) {
- m->names[first] = second;
-
- //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
- vector<string> dupNames;
- m->splitAtComma(second, dupNames);
-
- for (int i = 0; i < dupNames.size(); i++) {
- nameMap[dupNames[i]] = dupNames[i];
- if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
- }
- }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
- }
- in.close();
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "ParsimonyCommand", "readNamesFile");
- exit(1);
- }
-}
/***********************************************************/
void help() { m->mothurOut(getHelpString()); }
private:
- ReadTree* read;
- SharedUtil* util;
FileOutput* output;
vector<Tree*> T; //user trees
Tree* randT; //random tree
Tree* copyUserTree;
TreeMap* tmap;
TreeMap* savetmap;
- Parsimony* pars;
vector<string> groupComb; // AB. AC, BC...
string sumFile, randomtree, allGroups, outputDir, treefile, groupfile, namefile;
int iters, numGroups, numComp, counter, processors, numUniquesInName;
bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
bool abort, keepprimer, keepdots;
string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch;
- int start, end, pdiffs, processors, length;
+ int start, end, processors, length;
vector<string> revPrimer, outputNames;
vector<string> primers;
+++ /dev/null
-/*
- * phylodiversity.cpp
- * Mothur
- *
- * Created by westcott on 4/30/10.
- * Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "phylodiversity.h"
-
-/**************************************************************************************************
-EstOutput PhyloDiversity::getValues(Tree* t, vector<int> treeNodes, vector< vector<float> >& data) {
- try {
-
- map<string, float> DScore;
- float totalLength = 0.0;
- data.clear();
-
- //initialize Dscore
- for (int i=0; i<globaldata->Groups.size(); i++) { DScore[globaldata->Groups[i]] = 0.0; }
-
- ********************************************************
- //calculate a D value for each group
- for(int v=0;v<treeNodes.size();v++){
-
- if (m->control_pressed) { return data; }
-
- //calc the branch length
- //while you aren't at root
- float sum = 0.0;
- int index = treeNodes[v];
-
- while(t->tree[index].getParent() != -1){
-
- //if you have a BL
- if(t->tree[index].getBranchLength() != -1){
- sum += abs(t->tree[index].getBranchLength());
- }
- index = t->tree[index].getParent();
- }
-
- //get last breanch length added
- if(t->tree[index].getBranchLength() != -1){
- sum += abs(t->tree[index].getBranchLength());
- }
-
- //for each group in the groups update the total branch length accounting for the names file
- vector<string> groups = t->tree[treeNodes[v]].getGroup();
- for (int j = 0; j < groups.size(); j++) {
- int numSeqsInGroupJ = 0;
- map<string, int>::iterator it;
- it = t->tree[treeNodes[v]].pcount.find(groups[j]);
- if (it != t->tree[treeNodes[v]].pcount.end()) { //this leaf node contains seqs from group j
- numSeqsInGroupJ = it->second;
- }
-
- //add branch length to total for group
- DScore[groups[j]] += (numSeqsInGroupJ * sum);
- }
-
- }
-
-
- for (int i=0; i<globaldata->Groups.size(); i++) {
- float percent = DScore[globaldata->Groups[i]];
- data.push_back(percent);
-
- }
-
- return data;
- }
- catch(exception& e) {
- m->errorOut(e, "PhyloDiversity", "getValues");
- exit(1);
- }
-}
-**************************************************************************************************/
-
-
-
+++ /dev/null
-#ifndef PHYLODIVERSITY_H
-#define PHYLODIVERSITY_H
-
-
-/*
- * phylodiversity.h
- * Mothur
- *
- * Created by westcott on 4/30/10.
- * Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "treemap.h"
-#include "mothurout.h"
-
-
-/***********************************************************************/
-
-class PhyloDiversity {
-
- public:
- PhyloDiversity(TreeMap* t) : tmap(t) { m = MothurOut::getInstance(); }
- ~PhyloDiversity() {};
-
- //int getValues(Tree*, vector<int>, vector< vector< float> >&);
-
-
- private:
- MothurOut* m;
- TreeMap* tmap;
-};
-
-/***********************************************************************/
-
-
-#endif
-
*/
#include "phylodiversitycommand.h"
+#include "treereader.h"
//**********************************************************************************************************************
vector<string> PhyloDiversityCommand::setParameters(){
}
}
- m->runParse = true;
- m->clearGroups();
- m->clearAllGroups();
- m->Treenames.clear();
- m->names.clear();
-
//check for required parameters
treefile = validParameter.validFile(parameters, "tree", true);
if (treefile == "not open") { treefile = ""; abort = true; }
if (abort == true) { if (calledHelp) { return 0; } return 2; }
m->setTreeFile(treefile);
-
- if (groupfile != "") {
- //read in group map info.
- tmap = new TreeMap(groupfile);
- tmap->readMap();
- }else{ //fake out by putting everyone in one group
- Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
- tmap = new TreeMap();
-
- for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
- }
-
- if (namefile != "") { readNamesFile(); }
-
- read = new ReadNewickTree(treefile);
- int readOk = read->read(tmap);
-
- if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-
- read->AssembleTrees();
- vector<Tree*> trees = read->getTrees();
- delete read;
-
- //make sure all files match
- //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
- int numNamesInTree;
- if (namefile != "") {
- if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
- else { numNamesInTree = m->Treenames.size(); }
- }else { numNamesInTree = m->Treenames.size(); }
-
-
- //output any names that are in group file but not in tree
- if (numNamesInTree < tmap->getNumSeqs()) {
- for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
- //is that name in the tree?
- int count = 0;
- for (int j = 0; j < m->Treenames.size(); j++) {
- if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
- count++;
- }
-
- if (m->control_pressed) {
- delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; }
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
- return 0;
- }
-
- //then you did not find it so report it
- if (count == m->Treenames.size()) {
- //if it is in your namefile then don't remove
- map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-
- if (it == nameMap.end()) {
- m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
- tmap->removeSeq(tmap->namesOfSeqs[i]);
- i--; //need this because removeSeq removes name from namesOfSeqs
- }
- }
- }
- }
-
- SharedUtil* util = new SharedUtil();
+ TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+ vector<Tree*> trees = reader->getTrees();
+ tmap = trees[0]->getTreeMap();
+ delete reader;
+
+ SharedUtil util;
vector<string> mGroups = m->getGroups();
vector<string> tGroups = tmap->getNamesOfGroups();
- util->setGroups(mGroups, tGroups, "phylo.diversity"); //sets the groups the user wants to analyze
- delete util;
+ util.setGroups(mGroups, tGroups, "phylo.diversity"); //sets the groups the user wants to analyze
//incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
for (int i = 0; i < mGroups.size(); i++) { if (mGroups[i] == "xxx") { mGroups.erase(mGroups.begin()+i); break; } }
exit(1);
}
}
-/*****************************************************************/
-int PhyloDiversityCommand::readNamesFile() {
- try {
- m->names.clear();
- numUniquesInName = 0;
-
- ifstream in;
- m->openInputFile(namefile, in);
-
- string first, second;
- map<string, string>::iterator itNames;
-
- while(!in.eof()) {
- in >> first >> second; m->gobble(in);
-
- numUniquesInName++;
-
- itNames = m->names.find(first);
- if (itNames == m->names.end()) {
- m->names[first] = second;
-
- //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
- vector<string> dupNames;
- m->splitAtComma(second, dupNames);
-
- for (int i = 0; i < dupNames.size(); i++) {
- nameMap[dupNames[i]] = dupNames[i];
- if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
- }
- }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
- }
- in.close();
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "PhyloDiversityCommand", "readNamesFile");
- exit(1);
- }
-}
-
//**********************************************************************************************************************
#include "command.hpp"
#include "treemap.h"
-#include "readtree.h"
#include "sharedutilities.h"
-
+#include "tree.h"
class PhyloDiversityCommand : public Command {
int execute();
void help() { m->mothurOut(getHelpString()); }
private:
- ReadTree* read;
TreeMap* tmap;
float freq;
int iters, processors, numUniquesInName;
bool abort, rarefy, summary, collect, scale;
string groups, outputDir, treefile, groupfile, namefile;
vector<string> Groups, outputNames; //holds groups to be used, and outputFile names
- map<string, string> nameMap;
int readNamesFile();
void printData(set<int>&, map< string, vector<float> >&, ofstream&, int);
}
//check for necessary files
- string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
+ string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
ifstream FileTest(taxFileNameTest.c_str());
if (!FileTest) {
CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
- CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots);
string PcrSeqsCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The pcr.seqs command reads a fasta file ...\n";
-
+ helpString += "The pcr.seqs command reads a fasta file.\n";
+ helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
+ helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
+ helpString += "The start parameter allows you to provide a starting position to trim to.\n";
+ helpString += "The end parameter allows you to provide a ending position to trim from.\n";
+ helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
+ helpString += "The processors parameter allows you to use multiple processors.\n";
+ helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
+ helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
return helpString;
}
- //if the user changes the output directory command factory will send this info to us in the output parameter
- outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
//check for required parameters
fastafile = validParameter.validFile(parameters, "fasta", true);
}else if (fastafile == "not open") { fastafile = ""; abort = true; }
else { m->setFastaFile(fastafile); }
-
+ //if the user changes the output directory command factory will send this info to us in the output parameter
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
+
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
string temp;
if (taxfile == "not found"){ taxfile = ""; }
else if(taxfile == "not open"){ taxfile = ""; abort = true; }
else { m->setTaxonomyFile(taxfile); }
-
- temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
- m->mothurConvert(temp, pdiffs);
-
+
temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
m->mothurConvert(temp, start);
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
+ outputNames.clear();
num = driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
+
+ string tempFile = toString(getpid()) + ".outputNames.temp";
+ ofstream outTemp;
+ m->openOutputFile(tempFile, outTemp);
+
+ outTemp << outputNames.size();
+ for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
+ outTemp.close();
+
exit(0);
}else {
m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
int temp = processIDS[i];
wait(&temp);
}
-
+
+ for (int i = 0; i < processIDS.size(); i++) {
+ string tempFile = toString(processIDS[i]) + ".outputNames.temp";
+ ifstream intemp;
+ m->openInputFile(tempFile, intemp);
+
+ int num;
+ intemp >> num;
+ for (int k = 0; k < num; k++) {
+ string name = "";
+ intemp >> name; m->gobble(intemp);
+
+ outputNames.push_back(name); outputTypes["map"].push_back(name);
+ }
+ intemp.close();
+ m->mothurRemove(tempFile);
+ }
#else
//////////////////////////////////////////////////////////////////////////////////////////////////////
//find different types of files
map<string, map<string, string> > typesFiles;
- map<string, string> temp;
for (int i = 0; i < outputNames.size(); i++) {
string extension = m->getExtension(outputNames[i]);
string newLine = labels.substr(0, labels.find_first_of('\t'));
newLine += "\tGroup" + labels.substr(labels.find_first_of('\t'));
-
- temp[outputNames[i]] = file2Group[i];
- typesFiles[extension] = temp;
+
+ map<string, map<string, string> >::iterator itfind = typesFiles.find(extension);
+ if (itfind != typesFiles.end()) {
+ (itfind->second)[outputNames[i]] = file2Group[i];
+ }else {
+ map<string, string> temp;
+ temp[outputNames[i]] = file2Group[i];
+ typesFiles[extension] = temp;
+ }
string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first;
m->openOutputFileAppend(combineFileName, out);
newFileNames.push_back(combineFileName);
-
map<string, string> thisTypesFiles = it->second;
//open each type summary file
string thisfilename = itFileNameGroup->first;
string group = itFileNameGroup->second;
-
+
ifstream temp;
m->openInputFile(thisfilename, temp);
count++;
thisFilesLines.push_back(thisLine);
-
m->gobble(temp);
}
for (map<string, string>::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) {
string thisfilename = itFileNameGroup->first;
-
map<int, int>::iterator itLine = lineToNumber.find(k);
if (itLine != lineToNumber.end()) {
string output = toString(itLine->second);
}
}
/***********************************************************************/
-int ReadTree::AssembleTrees() {
+int ReadTree::AssembleTrees(map<string, string> nameMap) {
try {
//assemble users trees
for (int i = 0; i < Trees.size(); i++) {
if (m->control_pressed) { return 0; }
- Trees[i]->assembleTree();
+ Trees[i]->assembleTree(nameMap);
}
return 0;
}
float readBranchLength(istream& f);
vector<Tree*> getTrees() { return Trees; }
- int AssembleTrees();
+ int AssembleTrees(map<string, string>);
protected:
vector<Tree*> Trees;
delete tempInput;
m->setGroups(groupsToKeep);
m->clearAllGroups();
- m->names.clear();
m->saveNextLabel = "";
m->printedHeaders = false;
m->currentBinLabels.clear();
//make sure this sequence is in the namefile, else error
map<string, int>::iterator it = nameMap.find(current.getName());
- if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+ if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
else { num = it->second; }
}
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pflow);
+ CommandParameter pbiom("biom", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pbiom);
CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pphylip);
CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcolumn);
CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
try {
string helpString = "";
helpString += "The set.current command allows you to set the current files saved by mothur.\n";
- helpString += "The set.current command parameters are: clear, phylip, column, list, rabund, sabund, name, group, design, order, tree, shared, ordergroup, relabund, fasta, qfile, sff, oligos, accnos, taxonomy.\n";
+ helpString += "The set.current command parameters are: clear, phylip, column, list, rabund, sabund, name, group, design, order, tree, shared, ordergroup, relabund, fasta, qfile, sff, oligos, accnos, biom and taxonomy.\n";
helpString += "The clear paramter is used to indicate which file types you would like to clear values for, multiple types can be separated by dashes.\n";
helpString += "The set.current command should be in the following format: \n";
helpString += "set.current(fasta=yourFastaFile) or set.current(fasta=amazon.fasta, clear=name-accnos)\n";
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["flow"] = inputDir + it->second; }
}
+
+ it = parameters.find("biom");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["biom"] = inputDir + it->second; }
+ }
}
//check for parameters
if (flowfile == "not open") { m->mothurOut("Ignoring: " + parameters["flow"]); m->mothurOutEndLine(); flowfile = ""; }
else if (flowfile == "not found") { flowfile = ""; }
if (flowfile != "") { m->setFlowFile(flowfile); }
+
+ biomfile = validParameter.validFile(parameters, "biom", true);
+ if (biomfile == "not open") { m->mothurOut("Ignoring: " + parameters["biom"]); m->mothurOutEndLine(); biomfile = ""; }
+ else if (biomfile == "not found") { biomfile = ""; }
+ if (biomfile != "") { m->setBiomFile(biomfile); }
processors = validParameter.validFile(parameters, "processors", false);
if (processors == "not found") { processors = "1"; }
m->setTaxonomyFile("");
}else if (types[i] == "flow") {
m->setFlowFile("");
+ }else if (types[i] == "biom") {
+ m->setBiomFile("");
}else if (types[i] == "processors") {
m->setProcessors("1");
}else if (types[i] == "all") {
string clearTypes;
vector<string> types;
- string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile;
+ string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile, biomfile;
string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile;
commandFactory = CommandFactory::getInstance();
- string tag = "";
-#ifdef USE_MPI
- int pid;
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-
- tag = toString(pid);
-#endif
-
m->mothurOut("Mothur's directories:"); m->mothurOutEndLine();
//redirect output
m->mothurOut("outputDir=" + output); m->mothurOutEndLine();
commandFactory->setOutputDirectory(output);
}else {
- //add / to name if needed
- string lastChar = output.substr(output.length()-1);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- if (lastChar != "/") { output += "/"; }
- #else
- if (lastChar != "\\") { output += "\\"; }
- #endif
-
- //test to make sure directory exists
- output = m->getFullPathName(output);
- string outTemp = output + tag + "temp";
- ofstream out;
- out.open(outTemp.c_str(), ios::trunc);
- if(!out) {
- m->mothurOut(output + " directory does not exist or is not writable."); m->mothurOutEndLine();
- }else{
- out.close();
- m->mothurRemove(outTemp);
- m->mothurOut("outputDir=" + output); m->mothurOutEndLine();
+ if (m->dirCheck(output)) {
+ m->mothurOut("outputDir=" + output); m->mothurOutEndLine();
commandFactory->setOutputDirectory(output);
- }
+ }
}
//redirect input
m->mothurOut("inputDir=" + input); m->mothurOutEndLine();
commandFactory->setInputDirectory(input);
}else {
- //add / to name if needed
- string lastChar = input.substr(input.length()-1);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- if (lastChar != "/") { input += "/"; }
- #else
- if (lastChar != "\\") { input += "\\"; }
- #endif
-
- //test to make sure directory exists
- input = m->getFullPathName(input);
- string inTemp = input + tag + "temp";
- ofstream in;
- in.open(inTemp.c_str(), ios::trunc);
- if(!in) {
- m->mothurOut(input + " directory does not exist or is not writable."); m->mothurOutEndLine();
- }else{
- in.close();
- m->mothurRemove(inTemp);
- m->mothurOut("inputDir=" + input); m->mothurOutEndLine();
+ if (m->dirCheck(input)) {
+ m->mothurOut("inputDir=" + input); m->mothurOutEndLine();
commandFactory->setInputDirectory(input);
- }
- }
+ }
+ }
//set default
if (tempdefault == "clear") {
m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();
m->setDefaultPath(tempdefault);
}else {
- //add / to name if needed
- string lastChar = tempdefault.substr(tempdefault.length()-1);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- if (lastChar != "/") { tempdefault += "/"; }
- #else
- if (lastChar != "\\") { tempdefault += "\\"; }
- #endif
-
- //test to make sure directory exists
- tempdefault = m->getFullPathName(tempdefault);
- string inTemp = tempdefault + tag + "temp";
- ofstream in;
- in.open(inTemp.c_str(), ios::trunc);
- if(!in) {
- m->mothurOut(tempdefault + " directory does not exist or is not writable."); m->mothurOutEndLine();
- }else{
- in.close();
- m->mothurRemove(inTemp);
- m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();
- m->setDefaultPath(tempdefault);
- }
- }
+ if (m->dirCheck(tempdefault)) {
+ m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();
+ m->setDefaultPath(tempdefault);
+ }
+ }
return 0;
}
*/
#include "sharedcommand.h"
+#include "sharedutilities.h"
+
//********************************************************************************************************************
//sorts lowest to highest
inline bool compareSharedRabunds(SharedRAbundVector* left, SharedRAbundVector* right){
//**********************************************************************************************************************
vector<string> SharedCommand::setParameters(){
try {
- CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
- CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
+ CommandParameter pbiom("biom", "InputTypes", "", "", "BiomListGroup", "BiomListGroup", "none",false,false); parameters.push_back(pbiom);
+ CommandParameter plist("list", "InputTypes", "", "", "BiomListGroup", "BiomListGroup", "ListGroup",false,false); parameters.push_back(plist);
+ CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "ListGroup",false,false); parameters.push_back(pgroup);
//CommandParameter pordergroup("ordergroup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pordergroup);
CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
string SharedCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The make.shared command reads a list and group file and creates a shared file, as well as a rabund file for each group.\n";
- helpString += "The make.shared command parameters are list, group, ordergroup, groups and label. list and group are required unless a current file is available.\n";
+ helpString += "The make.shared command reads a list and group file or a biom file and creates a shared file. If a list and group are provided a rabund file is created for each group.\n";
+ helpString += "The make.shared command parameters are list, group, biom, groups and label. list and group are required unless a current file is available or you provide a biom file.\n";
helpString += "The groups parameter allows you to indicate which groups you want to include, group names should be separated by dashes. ex. groups=A-B-C. Default is all groups in your groupfile.\n";
- helpString += "The label parameter allows you to indicate which labels you want to include, label names should be separated by dashes. Default is all labels in your list file.\n";
+ helpString += "The label parameter is only valid with the list and group option and allows you to indicate which labels you want to include, label names should be separated by dashes. Default is all labels in your list file.\n";
//helpString += "The ordergroup parameter allows you to indicate the order of the groups in the sharedfile, by default the groups are listed alphabetically.\n";
return helpString;
}
if (path == "") { parameters["group"] = inputDir + it->second; }
}
- it = parameters.find("ordergroup");
+ /*it = parameters.find("ordergroup");
//user has given a template file
if(it != parameters.end()){
path = m->hasPath(it->second);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["ordergroup"] = inputDir + it->second; }
+ }*/
+
+ it = parameters.find("biom");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["biom"] = inputDir + it->second; }
}
}
//check for required parameters
listfile = validParameter.validFile(parameters, "list", true);
if (listfile == "not open") { listfile = ""; abort = true; }
- else if (listfile == "not found") {
- listfile = m->getListFile();
- if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
- else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
- }else { m->setListFile(listfile); }
+ else if (listfile == "not found") { listfile = ""; }
+ else { m->setListFile(listfile); }
+
+ biomfile = validParameter.validFile(parameters, "biom", true);
+ if (biomfile == "not open") { biomfile = ""; abort = true; }
+ else if (biomfile == "not found") { biomfile = ""; }
+ else { m->setBiomFile(biomfile); }
ordergroupfile = validParameter.validFile(parameters, "ordergroup", true);
if (ordergroupfile == "not open") { abort = true; }
groupfile = validParameter.validFile(parameters, "group", true);
if (groupfile == "not open") { groupfile = ""; abort = true; }
- else if (groupfile == "not found") {
- groupfile = m->getGroupFile();
- if (groupfile != "") {
- m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine();
- groupMap = new GroupMap(groupfile);
-
- int error = groupMap->readMap();
- if (error == 1) { abort = true; }
- vector<string> allGroups = groupMap->getNamesOfGroups();
- m->setAllGroups(allGroups);
- }
- else { m->mothurOut("You have no current group file and the group parameter is required."); m->mothurOutEndLine(); abort = true; }
- }else {
- groupMap = new GroupMap(groupfile);
-
- int error = groupMap->readMap();
- if (error == 1) { abort = true; }
- vector<string> allGroups = groupMap->getNamesOfGroups();
- m->setAllGroups(allGroups);
- m->setGroupFile(groupfile);
- }
+ else if (groupfile == "not found") { groupfile = ""; }
+ else { m->setGroupFile(groupfile); }
+ if ((biomfile == "") && (listfile == "")) {
+ //is there a current file available for either of these?
+ //give priority to list, then biom
+ listfile = m->getListFile();
+ if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
+ else {
+ biomfile = m->getBiomFile();
+ if (biomfile != "") { m->mothurOut("Using " + biomfile + " as input file for the biom parameter."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("No valid current files. You must provide a list or biom file before you can use the make.shared command."); m->mothurOutEndLine();
+ abort = true;
+ }
+ }
+ }
+ else if ((biomfile != "") && (listfile != "")) { m->mothurOut("When executing a make.shared command you must enter ONLY ONE of the following: list or biom."); m->mothurOutEndLine(); abort = true; }
+
+ if (listfile != "") {
+ if (groupfile == "") {
+ groupfile = m->getGroupFile();
+ if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("You need to provide a groupfle if you are going to use the list format."); m->mothurOutEndLine();
+ abort = true;
+ }
+ }
+ }
+
+
string groups = validParameter.validFile(parameters, "groups", false);
if (groups == "not found") { groups = ""; }
else {
try {
if (abort == true) { if (calledHelp) { return 0; } return 2; }
-
+
//getting output filename
- filename = listfile;
+ string filename = "";
+ if (listfile != "") { filename = listfile; }
+ else { filename = biomfile; }
if (outputDir == "") { outputDir += m->hasPath(filename); }
filename = outputDir + m->getRootName(m->getSimpleName(filename));
filename = filename + "shared";
- outputTypes["shared"].push_back(filename);
-
- m->openOutputFile(filename, out);
- pickedGroups = false;
-
- //if hte user has not specified any groups then use them all
- if (Groups.size() == 0) {
- Groups = groupMap->getNamesOfGroups(); m->setGroups(Groups);
- }else { pickedGroups = true; }
-
- //fill filehandles with neccessary ofstreams
- int i;
- ofstream* temp;
- for (i=0; i<Groups.size(); i++) {
- temp = new ofstream;
- filehandles[Groups[i]] = temp;
- }
-
- //set fileroot
- fileroot = outputDir + m->getRootName(m->getSimpleName(listfile));
-
- //clears file before we start to write to it below
- for (int i=0; i<Groups.size(); i++) {
- m->mothurRemove((fileroot + Groups[i] + ".rabund"));
- outputNames.push_back((fileroot + Groups[i] + ".rabund"));
- outputTypes["rabund"].push_back((fileroot + Groups[i] + ".rabund"));
- }
-
- //lookup.clear();
- string errorOff = "no error";
- //errorOff = "";
-
- //if user provided an order file containing the order the shared file should be in read it
- if (ordergroupfile != "") { readOrderFile(); }
-
- input = new InputData(listfile, "shared");
- SharedList = input->getSharedListVector();
- string lastLabel = SharedList->getLabel();
- vector<SharedRAbundVector*> lookup;
-
- if (m->control_pressed) {
- delete input; delete SharedList; delete groupMap;
- for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { delete it3->second; }
- out.close(); m->mothurRemove(filename);
- for (int i=0; i<Groups.size(); i++) { m->mothurRemove((fileroot + Groups[i] + ".rabund")); }
- return 0;
- }
-
- //sanity check
- int error = ListGroupSameSeqs();
-
- if ((!pickedGroups) && (SharedList->getNumSeqs() != groupMap->getNumSeqs())) { //if the user has not specified any groups and their files don't match exit with error
- m->mothurOut("Your group file contains " + toString(groupMap->getNumSeqs()) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); m->mothurOutEndLine();
-
- out.close();
- m->mothurRemove(filename); //remove blank shared file you made
-
- createMisMatchFile();
-
- //delete memory
- for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
- delete it3->second;
- }
-
- delete input; delete SharedList; delete groupMap;
-
- return 0;
- }
-
- if (error == 1) { m->control_pressed = true; }
-
- //if user has specified groups make new groupfile for them
- if (pickedGroups) { //make new group file
- string groups = "";
- if (m->getNumGroups() < 4) {
- for (int i = 0; i < m->getNumGroups(); i++) {
- groups += (m->getGroups())[i] + ".";
- }
- }else { groups = "merge"; }
-
- string newGroupFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + groups + "groups";
- outputTypes["group"].push_back(newGroupFile);
- outputNames.push_back(newGroupFile);
- ofstream outGroups;
- m->openOutputFile(newGroupFile, outGroups);
-
- vector<string> names = groupMap->getNamesSeqs();
- string groupName;
- for (int i = 0; i < names.size(); i++) {
- groupName = groupMap->getGroup(names[i]);
- if (isValidGroup(groupName, m->getGroups())) {
- outGroups << names[i] << '\t' << groupName << endl;
- }
- }
- outGroups.close();
- }
-
- //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
- set<string> processedLabels;
- set<string> userLabels = labels;
-
- while((SharedList != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
- if (m->control_pressed) {
- delete input; delete SharedList; delete groupMap;
- for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { delete it3->second; }
- out.close(); m->mothurRemove(filename);
- for (int i=0; i<Groups.size(); i++) { m->mothurRemove((fileroot + Groups[i] + ".rabund")); }
- return 0;
- }
-
- if(allLines == 1 || labels.count(SharedList->getLabel()) == 1){
-
- lookup = SharedList->getSharedRAbundVector();
-
- m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
- if (pickedGroups) { //check for otus with no seqs in them
- eliminateZeroOTUS(lookup);
- }
-
- if (m->control_pressed) {
- delete input; delete SharedList; delete groupMap;
- for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
- for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { delete it3->second; }
- out.close(); m->mothurRemove(filename);
- for (int i=0; i<Groups.size(); i++) { m->mothurRemove((fileroot + Groups[i] + ".rabund")); }
- return 0;
- }
-
- if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
- printSharedData(lookup); //prints info to the .shared file
- for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
-
- processedLabels.insert(SharedList->getLabel());
- userLabels.erase(SharedList->getLabel());
- }
-
- if ((m->anyLabelsToProcess(SharedList->getLabel(), userLabels, errorOff) == true) && (processedLabels.count(lastLabel) != 1)) {
- string saveLabel = SharedList->getLabel();
-
- delete SharedList;
- SharedList = input->getSharedListVector(lastLabel); //get new list vector to process
-
- lookup = SharedList->getSharedRAbundVector();
- m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
- if (pickedGroups) { //check for otus with no seqs in them
- eliminateZeroOTUS(lookup);
- }
-
-
- if (m->control_pressed) {
- delete input; delete SharedList; delete groupMap;
- for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
- for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { delete it3->second; }
- out.close(); m->mothurRemove(filename);
- for (int i=0; i<Groups.size(); i++) { m->mothurRemove((fileroot + Groups[i] + ".rabund")); }
- return 0;
- }
-
- if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
- printSharedData(lookup); //prints info to the .shared file
- for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
-
- processedLabels.insert(SharedList->getLabel());
- userLabels.erase(SharedList->getLabel());
-
- //restore real lastlabel to save below
- SharedList->setLabel(saveLabel);
- }
-
-
- lastLabel = SharedList->getLabel();
-
- delete SharedList;
- SharedList = input->getSharedListVector(); //get new list vector to process
- }
-
- //output error messages about any remaining user labels
- set<string>::iterator it;
- bool needToRun = false;
- for (it = userLabels.begin(); it != userLabels.end(); it++) {
- if (processedLabels.count(lastLabel) != 1) {
- needToRun = true;
- }
- }
-
- //run last label if you need to
- if (needToRun == true) {
- if (SharedList != NULL) { delete SharedList; }
- SharedList = input->getSharedListVector(lastLabel); //get new list vector to process
-
- lookup = SharedList->getSharedRAbundVector();
- m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
- if (pickedGroups) { //check for otus with no seqs in them
- eliminateZeroOTUS(lookup);
- }
-
- if (m->control_pressed) {
- delete input; delete groupMap;
- for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { delete it3->second; }
- out.close(); m->mothurRemove(filename);
- for (int i=0; i<Groups.size(); i++) { m->mothurRemove((fileroot + Groups[i] + ".rabund")); }
- return 0;
- }
-
- if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
- printSharedData(lookup); //prints info to the .shared file
- for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
- delete SharedList;
- }
-
- out.close();
-
- for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
- delete it3->second;
- }
-
- delete input; delete groupMap;
-
- if (m->control_pressed) {
- m->mothurRemove(filename);
- for (int i=0; i<Groups.size(); i++) { m->mothurRemove((fileroot + Groups[i] + ".rabund")); }
- return 0;
- }
+ outputNames.push_back(filename); outputTypes["shared"].push_back(filename);
+ if (listfile != "") { createSharedFromListGroup(filename); }
+ else { createSharedFromBiom(filename); }
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } }
+
//set rabund file as new current rabundfile
string current = "";
itTypes = outputTypes.find("rabund");
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
- m->mothurOut(filename); m->mothurOutEndLine();
m->mothurOutEndLine();
return 0;
}
}
//**********************************************************************************************************************
-void SharedCommand::printSharedData(vector<SharedRAbundVector*> thislookup) {
+//Builds the shared file `filename` from the biom file named by the biomfile member.
+//The biom file is read one line at a time and the first quoted token of each line (see getTag) decides
+//what is done with it: "type", "matrix_type" and "matrix_element_type" are validated, "rows" and "columns"
+//give the OTU and sample names, "shape" is checked against what was read and "data" is parsed and printed.
+//A failed check reports an [ERROR] and sets m->control_pressed, which ends the loop on its next pass.
+//Always returns 0; callers have to look at m->control_pressed to detect a failure.
+int SharedCommand::createSharedFromBiom(string filename) {
+    try {
+        ofstream out;
+        m->openOutputFile(filename, out);
+
+        /* a biom file starts like this:
+         {
+         "id":"/Users/SarahsWork/Desktop/release/temp.job2.shared-unique",
+         "format": "Biological Observation Matrix 0.9.1",
+         "format_url": "http://biom-format.org",
+         "type": "OTU table",
+         "generated_by": "mothur1.24.0",
+         "date": "Tue Apr 17 13:12:07 2012", */
+
+        ifstream in;
+        m->openInputFile(biomfile, in);
+
+        m->getline(in); m->gobble(in);  //grab first '{'
+
+        string matrixFormat = "";
+        int numRows = 0;
+        int numCols = 0;
+        int shapeNumRows = 0;
+        int shapeNumCols = 0;
+        vector<string> otuNames;
+        vector<string> groupNames;
+        while (!in.eof()) {
+
+            if (m->control_pressed) { break; }
+
+            string line = m->getline(in); m->gobble(in);
+
+            //getTag removes the tag from line, so a second getTag call on line returns the value
+            string tag = getTag(line);
+
+            if (tag == "type") {
+                //check to make sure this is an OTU table
+                string type = getTag(line);
+                if (type != "OTU table") { m->mothurOut("[ERROR]: " + type + " is not a valid biom type for mothur. Only type allowed is OTU table.\n"); m->control_pressed = true; }
+            }else if (tag == "matrix_type") {
+                //get type and check type
+                matrixFormat = getTag(line);
+                if ((matrixFormat != "sparse") && (matrixFormat != "dense")) { m->mothurOut("[ERROR]: " + matrixFormat + " is not a valid biom matrix_type for mothur. Types allowed are sparse and dense.\n"); m->control_pressed = true; }
+            }else if (tag == "matrix_element_type") {
+                //get type and check type
+                string matrixElementType = getTag(line);
+                if (matrixElementType != "int") { m->mothurOut("[ERROR]: " + matrixElementType + " is not a valid matrix_element_type for mothur. Only type allowed is int.\n"); m->control_pressed = true; }
+            }else if (tag == "rows") {
+                //read otu names
+                otuNames = readRows(line, in, numRows);
+            }else if (tag == "columns") {
+                //read sample names
+                groupNames = readRows(line, in, numCols);
+
+                //if users selected groups, then remove the groups not wanted.
+                SharedUtil util;
+                vector<string> Groups = m->getGroups();
+                vector<string> allGroups = groupNames;
+                util.setGroups(Groups, allGroups);
+                m->setGroups(Groups);
+
+                //fill filehandles with necessary ofstreams
+                int i;
+                ofstream* temp;
+                for (i=0; i<Groups.size(); i++) {
+                    temp = new ofstream;
+                    filehandles[Groups[i]] = temp;
+                }
+
+                //set fileroot
+                fileroot = outputDir + m->getRootName(m->getSimpleName(biomfile));
+
+                //clears file before we start to write to it below
+                for (int i=0; i<Groups.size(); i++) {
+                    m->mothurRemove((fileroot + Groups[i] + ".rabund"));
+                    outputNames.push_back((fileroot + Groups[i] + ".rabund"));
+                    outputTypes["rabund"].push_back((fileroot + Groups[i] + ".rabund"));
+                }
+
+            }else if (tag == "shape") {
+                getDims(line, shapeNumRows, shapeNumCols);
+
+                //check shape
+                if (shapeNumCols != numCols) {
+                    m->mothurOut("[ERROR]: shape indicates " + toString(shapeNumCols) + " columns, but I only read " + toString(numCols) + " columns.\n"); m->control_pressed = true;
+                }
+
+                if (shapeNumRows != numRows) {
+                    m->mothurOut("[ERROR]: shape indicates " + toString(shapeNumRows) + " rows, but I only read " + toString(numRows) + " rows.\n"); m->control_pressed = true;
+                }
+            }else if (tag == "data") {
+                m->currentBinLabels = otuNames;
+
+                //read data
+                vector<SharedRAbundVector*> lookup = readData(matrixFormat, line, in, groupNames, otuNames.size());
+
+                //readData hands back an empty vector when no sample is left, so lookup[0] must not be touched blindly
+                if (lookup.size() != 0) {
+                    m->mothurOutEndLine(); m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                    lookup[0]->printHeaders(out);
+                    printSharedData(lookup, out);
+                }else { m->mothurOut("[ERROR]: your biom file does not contain any samples to process.\n"); m->control_pressed = true; }
+
+                //we own the vectors readData allocated
+                for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+            }
+        }
+        in.close();
+        out.close();
+
+        //release the per group streams allocated in the "columns" branch, as createSharedFromListGroup does once
+        //it is done printing; the map is emptied so no stale pointer is left behind
+        for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { delete it3->second; }
+        filehandles.clear();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "createSharedFromBiom");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Parses the biom "data" matrix into one SharedRAbundVector per sample.
+//  matrixFormat - "dense": every inner [..] lists the abundance of one OTU in each sample, in sample order.
+//                 Anything else is read as sparse: every inner [..] is [otuIndex, sampleIndex, abundance].
+//  line         - what is left of the line that carried the "data" tag; it is parsed first
+//  in           - the biom file; parsing carries on from here when the matrix did not end on line
+//  groupNames   - sample names in column order
+//  numOTUs      - number of bins every vector is created with
+//Malformed entries report an [ERROR], set m->control_pressed and are skipped rather than stored.
+//Returns the vectors of the groups found in m->getGroups(); the others are deleted here.  The caller owns
+//the returned pointers.  If m->control_pressed is seen while parsing, the unfiltered vectors are returned.
+vector<SharedRAbundVector*> SharedCommand::readData(string matrixFormat, string line, ifstream& in, vector<string>& groupNames, int numOTUs) {
+    try {
+
+        vector<SharedRAbundVector*> lookup;
+
+        //creates new sharedRAbunds
+        for (int i = 0; i < groupNames.size(); i++) {
+            SharedRAbundVector* temp = new SharedRAbundVector(numOTUs); //sets all abunds to 0
+            temp->setLabel("dummy");
+            temp->setGroup(groupNames[i]);
+            lookup.push_back(temp);
+        }
+
+        int numSamples = lookup.size();
+        bool dataStart = false;
+        bool inBrackets = false;
+        bool done = false;  //set once the ']' closing the whole matrix has been seen
+        string num = "";
+        vector<int> nums;
+        int otuCount = 0;
+        for (int i = 0; i < line.length(); i++) {
+
+            if (m->control_pressed) { return lookup; }
+
+            //look for opening [ to indicate data is starting
+            if ((line[i] == '[') && (!dataStart)) { dataStart = true; i++;  if (!(i < line.length())) { break; } }
+            else if ((line[i] == ']') && dataStart && (!inBrackets)) { done = true; break; } //we are done reading data
+
+            if (dataStart) {
+                if ((line[i] == '[') && (!inBrackets)) { inBrackets = true; i++;  if (!(i < line.length())) { break; } }
+                else if ((line[i] == ']') && (inBrackets)) {
+                    inBrackets = false;
+                    int temp;
+                    m->mothurConvert(num, temp);
+                    nums.push_back(temp);
+                    num = "";
+
+                    //save info to vectors
+                    if (matrixFormat == "dense") {
+
+                        //sanity checks - never index nums or the vectors unless the row has the expected size
+                        if (nums.size() != lookup.size()) { m->mothurOut("[ERROR]: trouble parsing OTU data.  OTU " + toString(otuCount) + " causing errors.\n"); m->control_pressed = true; }
+                        else if (otuCount >= numOTUs) { m->mothurOut("[ERROR]: trouble parsing OTU data.  Found more OTUs in data than the " + toString(numOTUs) + " named in rows.\n"); m->control_pressed = true; }
+                        else {
+                            //set abundances for this otu
+                            //nums contains [abundSample0, abundSample1, abundSample2, ...] for current OTU
+                            for (int j = 0; j < lookup.size(); j++) { lookup[j]->set(otuCount, nums[j], groupNames[j]); }
+                        }
+
+                        otuCount++;
+                    }else {
+                        //sanity checks - nums contains [otuNum, sampleNum, abundance]
+                        if (nums.size() != 3) { m->mothurOut("[ERROR]: trouble parsing OTU data.\n"); m->control_pressed = true; }
+                        else if ((nums[0] < 0) || (nums[0] >= numOTUs) || (nums[1] < 0) || (nums[1] >= numSamples)) {
+                            m->mothurOut("[ERROR]: trouble parsing OTU data.  OTU index " + toString(nums[0]) + " or sample index " + toString(nums[1]) + " is out of range.\n"); m->control_pressed = true;
+                        }
+                        else { lookup[nums[1]]->set(nums[0], nums[2], groupNames[nums[1]]); }
+                    }
+                    nums.clear();
+                }
+
+                if (inBrackets) {
+                    if (line[i] == ',') {
+                        int temp;
+                        m->mothurConvert(num, temp);
+                        nums.push_back(temp);
+                        num = "";
+                    }else { if (!isspace(line[i])) { num += line[i]; }  }
+                }
+            }
+        }
+
+        //same as above just reading from file.  Skipped when the matrix already ended on line, so that
+        //whatever follows it in the file is left for the caller.
+        while ((!done) && (!in.eof())) {
+
+            char c = in.get(); m->gobble(in);
+
+            if (m->control_pressed) { return lookup; }
+
+            //look for opening [ to indicate data is starting
+            if ((c == '[') && (!dataStart)) { dataStart = true; c = in.get();  if (in.eof()) { break; } }
+            else if ((c == ']') && dataStart && (!inBrackets)) { done = true; break; } //we are done reading data
+
+            if (dataStart) {
+                if ((c == '[') && (!inBrackets)) { inBrackets = true; c = in.get();  if (in.eof()) { break; }  }
+                else if ((c == ']') && (inBrackets)) {
+                    inBrackets = false;
+                    int temp;
+                    m->mothurConvert(num, temp);
+                    nums.push_back(temp);
+                    num = "";
+
+                    //save info to vectors
+                    if (matrixFormat == "dense") {
+
+                        //sanity checks - never index nums or the vectors unless the row has the expected size
+                        if (nums.size() != lookup.size()) { m->mothurOut("[ERROR]: trouble parsing OTU data.  OTU " + toString(otuCount) + " causing errors.\n"); m->control_pressed = true; }
+                        else if (otuCount >= numOTUs) { m->mothurOut("[ERROR]: trouble parsing OTU data.  Found more OTUs in data than the " + toString(numOTUs) + " named in rows.\n"); m->control_pressed = true; }
+                        else {
+                            //set abundances for this otu
+                            //nums contains [abundSample0, abundSample1, abundSample2, ...] for current OTU
+                            for (int j = 0; j < lookup.size(); j++) { lookup[j]->set(otuCount, nums[j], groupNames[j]); }
+                        }
+
+                        otuCount++;
+                    }else {
+                        //sanity checks - nums contains [otuNum, sampleNum, abundance]
+                        if (nums.size() != 3) { m->mothurOut("[ERROR]: trouble parsing OTU data.\n"); m->control_pressed = true; }
+                        else if ((nums[0] < 0) || (nums[0] >= numOTUs) || (nums[1] < 0) || (nums[1] >= numSamples)) {
+                            m->mothurOut("[ERROR]: trouble parsing OTU data.  OTU index " + toString(nums[0]) + " or sample index " + toString(nums[1]) + " is out of range.\n"); m->control_pressed = true;
+                        }
+                        else { lookup[nums[1]]->set(nums[0], nums[2], groupNames[nums[1]]); }
+                    }
+                    nums.clear();
+                }
+
+                if (inBrackets) {
+                    if (c == ',') {
+                        int temp;
+                        m->mothurConvert(num, temp);
+                        nums.push_back(temp);
+                        num = "";
+                    }else { if (!isspace(c)) { num += c; }  }
+                }
+            }
+        }
+
+        SharedUtil util;
+
+        bool remove = false;
+        for (int i = 0; i < lookup.size(); i++) {
+            //if this sharedrabund is not from a group the user wants then delete it.
+            if (util.isValidGroup(lookup[i]->getGroup(), m->getGroups()) == false) {
+                remove = true;
+                delete lookup[i]; lookup[i] = NULL;
+                lookup.erase(lookup.begin()+i);
+                i--;
+            }
+        }
+
+        //dropping samples can leave OTUs that are empty in every remaining sample.  eliminateZeroOTUS reads
+        //thislookup[0], so it is only called when at least one sample is left.
+        if (remove && (lookup.size() != 0)) { eliminateZeroOTUS(lookup); }
+
+
+        return lookup;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "readData");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Removes every bin whose abundance is 0 in all of the vectors of thislookup.
+//thislookup is replaced by freshly allocated vectors holding only the surviving bins (the old vectors are
+//deleted) and m->currentBinLabels is replaced by the labels of the surviving bins.  A surviving bin keeps
+//its entry of m->currentBinLabels when there is one, otherwise it is named "Otu" + its zero padded
+//1-based position.
+//If m->control_pressed is seen, the new vectors are discarded and thislookup is left untouched.
+//An empty thislookup is left untouched as well.  Always returns 0.
+int SharedCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
+    try {
+
+        //nothing to filter, and thislookup[0] below would not exist
+        if (thislookup.size() == 0) { return 0; }
+
+        vector<SharedRAbundVector*> newLookup;
+        for (int i = 0; i < thislookup.size(); i++) {
+            SharedRAbundVector* temp = new SharedRAbundVector();
+            temp->setLabel(thislookup[i]->getLabel());
+            temp->setGroup(thislookup[i]->getGroup());
+            newLookup.push_back(temp);
+        }
+
+        //for each bin
+        vector<string> newBinLabels;
+        int numBins = thislookup[0]->getNumBins();
+        string snumBins = toString(numBins);
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
+
+            //look at each sharedRabund and make sure they are not all zero
+            bool allZero = true;
+            for (int j = 0; j < thislookup.size(); j++) {
+                if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
+            }
+
+            //if they are not all zero add this bin
+            if (!allZero) {
+                for (int j = 0; j < thislookup.size(); j++) {
+                    newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
+                }
+
+                //if there is a bin label use it otherwise make one
+                string binLabel = "Otu";
+                string sbinNumber = toString(i+1);
+                if (sbinNumber.length() < snumBins.length()) {
+                    //pad with zeros so every generated label has the same width
+                    int diff = snumBins.length() - sbinNumber.length();
+                    for (int h = 0; h < diff; h++) { binLabel += "0"; }
+                }
+                binLabel += sbinNumber;
+                if (i < m->currentBinLabels.size()) {  binLabel = m->currentBinLabels[i]; }
+
+                newBinLabels.push_back(binLabel);
+            }
+        }
+
+        for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
+
+        thislookup = newLookup;
+        m->currentBinLabels = newBinLabels;
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "eliminateZeroOTUS");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Reads the two numbers of a biom shape entry such as   "shape": [5, 6],
+//Nothing is looked at before the first '['.  After it, whitespace is skipped, every ',' converts the
+//characters gathered so far into shapeNumRows, and the closing ']' converts the characters gathered since
+//then into shapeNumCols and ends the scan.  A line without '[' leaves both outputs untouched.
+//Always returns 0.
+int SharedCommand::getDims(string line, int& shapeNumRows, int& shapeNumCols) {
+    try {
+        string digits = "";
+
+        string::size_type cursor = line.find('[');
+        if (cursor == string::npos) { return 0; }
+
+        //the character right behind the '[' is always taken as content, it can not close the brackets
+        bool closable = false;
+        for (cursor++; cursor < line.length(); cursor++) {
+            char ch = line[cursor];
+
+            if (closable && (ch == ']')) {
+                m->mothurConvert(digits, shapeNumCols);
+                break;
+            }
+            closable = true;
+
+            if (ch == ',') {
+                m->mothurConvert(digits, shapeNumRows);
+                digits = "";
+            }
+            else if (!isspace(ch)) { digits += ch; }
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "getDims");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Collects the ids of a biom "rows" or "columns" list.
+//  line    - what is left of the line that carried the tag; it is scanned first
+//  in      - the biom file; scanning carries on from here when the list did not end on line
+//  numRows - incremented once for every {...} entry that is completed
+//Square brackets are counted to find the end of the list and curly braces to find the end of each entry.
+//The id of an entry is the text between the first ':' and the first ',' of the entry, with any double
+//quotes removed.  An entry without such a pair reports an [ERROR], sets m->control_pressed and ends the
+//scan.  Returns the ids in file order.
+vector<string> SharedCommand::readRows(string line, ifstream& in, int& numRows) {
+    try {
+        /*"rows":[
+         {"id":"Otu01", "metadata":{"taxonomy":["Bacteria", "Bacteroidetes", "Bacteroidia", "Bacteroidales", "Porphyromonadaceae", "unclassified"], "bootstrap":[100, 100, 100, 100, 100, 100]}},
+         {"id":"Otu02", "metadata":{"taxonomy":["Bacteria", "Bacteroidetes", "Bacteroidia", "Bacteroidales", "Rikenellaceae", "Alistipes"], "bootstrap":[100, 100, 100, 100, 100, 100]}},
+         ...
+         ],*/
+        vector<string> names;
+        int countOpenBrace = 0;
+        int countClosedBrace = 0;
+        int openParen = 0;
+        int closeParen = 0;
+        string nextRow = "";
+        bool end = false;
+
+        for (int i = 0; i < line.length(); i++) {
+
+            if (m->control_pressed) { return names; }
+
+            if (line[i] == '[')         { countOpenBrace++;     }
+            else if (line[i] == ']')    { countClosedBrace++;   }
+            else if (line[i] == '{')    { openParen++;          }
+            else if (line[i] == '}')    { closeParen++;         }
+            else if (openParen != 0)    { nextRow += line[i];   }  //you are reading the row info
+
+            //you have reached the end of the rows info
+            if ((countOpenBrace == countClosedBrace) && (countClosedBrace != 0)) { end = true; break; }
+            if ((openParen == closeParen) && (closeParen != 0)) { //process row
+                numRows++;
+                vector<string> items;
+                m->splitAtChar(nextRow, items, ','); //parse by comma, will return junk for metadata but we aren't using that anyway
+
+                //split the part we want, the one containing the id.  Sizes are checked before indexing.
+                vector<string> idParts;
+                if (items.size() != 0) { m->splitAtChar(items[0], idParts, ':'); }
+                if (idParts.size() < 2) { m->mothurOut("[ERROR]: trouble parsing the id of entry " + toString(numRows) + ".\n"); m->control_pressed = true; return names; }
+
+                //remove "" if needed
+                string name = "";
+                for (int k = 0; k < idParts[1].length(); k++) {
+                    if (idParts[1][k] != '\"') { name += idParts[1][k]; }
+                }
+                names.push_back(name);
+                nextRow = "";
+                openParen = 0;
+                closeParen = 0;
+            }
+        }
+
+        //keep reading
+        if (!end) {
+            while (!in.eof()) {
+
+                if (m->control_pressed) { break; }
+
+                char c = in.get(); m->gobble(in);
+
+                if (c == '[')               { countOpenBrace++;     }
+                else if (c == ']')          { countClosedBrace++;   }
+                else if (c == '{')          { openParen++;          }
+                else if (c == '}')          { closeParen++;         }
+                else if (openParen != 0)    { nextRow += c;         }  //you are reading the row info
+
+
+                //you have reached the end of the rows info
+                if ((countOpenBrace == countClosedBrace) && (countClosedBrace != 0)) { end = true; break; }
+                if ((openParen == closeParen) && (closeParen != 0)) { //process row
+                    numRows++;
+                    vector<string> items;
+                    m->splitAtChar(nextRow, items, ','); //parse by comma, will return junk for metadata but we aren't using that anyway
+
+                    //split the part we want, the one containing the id.  Sizes are checked before indexing.
+                    vector<string> idParts;
+                    if (items.size() != 0) { m->splitAtChar(items[0], idParts, ':'); }
+                    if (idParts.size() < 2) { m->mothurOut("[ERROR]: trouble parsing the id of entry " + toString(numRows) + ".\n"); m->control_pressed = true; break; }
+
+                    //remove "" if needed
+                    string name = "";
+                    for (int k = 0; k < idParts[1].length(); k++) {
+                        if (idParts[1][k] != '\"') { name += idParts[1][k]; }
+                    }
+                    names.push_back(name);
+                    nextRow = "";
+                    openParen = 0;
+                    closeParen = 0;
+                }
+            }
+        }
+
+        return names;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "readRows");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//designed for things like "type": "OTU table"
+//Returns the text of the first double quoted token of line and cuts line down to whatever follows the
+//closing quote of that token.  Called twice on the example it returns type and then OTU table.
+//When line has no closing quote, line is left as it is and the characters gathered after the opening
+//quote are returned; that is the empty string when line has no quote at all.
+string SharedCommand::getTag(string& line) {
+    try {
+        bool inQuotes = false;
+        string tag = "";
+        char c = '\"';
+
+        for (int i = 0; i < line.length(); i++) {
+
+            //the first quote opens the token, the second one closes it and ends the search
+            if ((line[i] == c) && (!inQuotes)) {  inQuotes = true;  }
+            else if ((line[i] == c) && (inQuotes)) {
+                inQuotes= false;
+                line = line.substr(i+1);
+                return tag;
+            }
+
+            if (inQuotes) {  if (line[i] != c) { tag += line[i]; }  }
+        }
+
+        return tag;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "getTag");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Builds the shared file `filename` from the listfile and groupfile members.
+//Steps: read the group file, decide which groups to use (the Groups member, or every group of the group
+//file when Groups is empty), register one rabund output per group, check that list and group file agree,
+//write a reduced group file when the user picked groups, then print every requested label of the list file
+//(all of them when allLines is set).  A requested label that is missing from the list file is served by the
+//closest preceding label, as decided by m->anyLabelsToProcess.
+//Returns 0 in every case.  When the run is cancelled (m->control_pressed), the shared file and the per
+//group rabund files are removed before returning.
+int SharedCommand::createSharedFromListGroup(string filename) {
+    try {
+        ofstream out;
+        m->openOutputFile(filename, out);
+
+        GroupMap* groupMap = new GroupMap(groupfile);
+
+        int groupError = groupMap->readMap();
+        if (groupError == 1) { delete groupMap; return 0; }
+        vector<string> allGroups = groupMap->getNamesOfGroups();
+        m->setAllGroups(allGroups);
+
+        pickedGroups = false;
+
+        //if the user has not specified any groups then use them all
+        if (Groups.size() == 0) {
+            Groups = groupMap->getNamesOfGroups(); m->setGroups(Groups);
+        }else { pickedGroups = true; }
+
+        //fill filehandles with necessary ofstreams
+        int i;
+        ofstream* temp;
+        for (i=0; i<Groups.size(); i++) {
+            temp = new ofstream;
+            filehandles[Groups[i]] = temp;
+        }
+
+        //set fileroot
+        fileroot = outputDir + m->getRootName(m->getSimpleName(listfile));
+
+        //clears file before we start to write to it below
+        for (int i=0; i<Groups.size(); i++) {
+            m->mothurRemove((fileroot + Groups[i] + ".rabund"));
+            outputNames.push_back((fileroot + Groups[i] + ".rabund"));
+            outputTypes["rabund"].push_back((fileroot + Groups[i] + ".rabund"));
+        }
+
+        string errorOff = "no error";
+
+        //if user provided an order file containing the order the shared file should be in read it
+        //if (ordergroupfile != "") { readOrderFile(); }
+
+        InputData input(listfile, "shared");
+        SharedListVector* SharedList = input.getSharedListVector();
+
+        //the loops below treat NULL as "nothing left to read", so a list file without any list gives NULL
+        //right away and must not be dereferenced
+        if (SharedList == NULL) {
+            m->mothurOut("[ERROR]: unable to read a list from " + listfile + "."); m->mothurOutEndLine(); m->control_pressed = true;
+            delete groupMap;
+            for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+            out.close(); m->mothurRemove(filename);
+            for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));  }
+            return 0;
+        }
+
+        string lastLabel = SharedList->getLabel();
+        vector<SharedRAbundVector*> lookup;
+
+        if (m->control_pressed) {
+            delete SharedList; delete groupMap;
+            for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+            out.close(); m->mothurRemove(filename);
+            for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));  }
+            return 0;
+        }
+
+        //sanity check
+        vector<string> groupMapNamesSeqs = groupMap->getNamesSeqs();
+        int error = ListGroupSameSeqs(groupMapNamesSeqs, SharedList);
+
+        if ((!pickedGroups) && (SharedList->getNumSeqs() != groupMap->getNumSeqs())) {  //if the user has not specified any groups and their files don't match exit with error
+            m->mothurOut("Your group file contains " + toString(groupMap->getNumSeqs()) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); m->mothurOutEndLine();
+
+            out.close();
+            m->mothurRemove(filename); //remove blank shared file you made
+
+            createMisMatchFile(SharedList, groupMap);
+
+            //delete memory
+            for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+                delete it3->second;
+            }
+
+            delete SharedList; delete groupMap;
+
+            return 0;
+        }
+
+        //a mismatch found by ListGroupSameSeqs is handled like a cancel: the loop below cleans up and returns
+        if (error == 1) { m->control_pressed = true; }
+
+        //if user has specified groups make new groupfile for them
+        if (pickedGroups) { //make new group file
+            string groups = "";
+            if (m->getNumGroups() < 4) {
+                for (int i = 0; i < m->getNumGroups(); i++) {
+                    groups += (m->getGroups())[i] + ".";
+                }
+            }else { groups = "merge"; }
+
+            string newGroupFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + groups + "groups";
+            outputTypes["group"].push_back(newGroupFile);
+            outputNames.push_back(newGroupFile);
+            ofstream outGroups;
+            m->openOutputFile(newGroupFile, outGroups);
+
+            vector<string> names = groupMap->getNamesSeqs();
+            string groupName;
+            for (int i = 0; i < names.size(); i++) {
+                groupName = groupMap->getGroup(names[i]);
+                if (isValidGroup(groupName, m->getGroups())) {
+                    outGroups << names[i] << '\t' << groupName << endl;
+                }
+            }
+            outGroups.close();
+        }
+
+        //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+        set<string> processedLabels;
+        set<string> userLabels = labels;
+
+        while((SharedList != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+            if (m->control_pressed) {
+                delete SharedList; delete groupMap;
+                for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+                out.close(); m->mothurRemove(filename);
+                for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));  }
+                return 0;
+            }
+
+            if(allLines == 1 || labels.count(SharedList->getLabel()) == 1){
+
+                lookup = SharedList->getSharedRAbundVector();
+
+                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                if (pickedGroups) { //check for otus with no seqs in them
+                    eliminateZeroOTUS(lookup);
+                }
+
+                if (m->control_pressed) {
+                    delete SharedList; delete groupMap;
+                    for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                    for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+                    out.close(); m->mothurRemove(filename);
+                    for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));  }
+                    return 0;
+                }
+
+                if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
+                printSharedData(lookup, out); //prints info to the .shared file
+                for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+
+                processedLabels.insert(SharedList->getLabel());
+                userLabels.erase(SharedList->getLabel());
+            }
+
+            //a requested label was skipped: fall back to the label read just before this one
+            if ((m->anyLabelsToProcess(SharedList->getLabel(), userLabels, errorOff) == true) && (processedLabels.count(lastLabel) != 1)) {
+                string saveLabel = SharedList->getLabel();
+
+                delete SharedList;
+                SharedList = input.getSharedListVector(lastLabel); //get new list vector to process
+
+                lookup = SharedList->getSharedRAbundVector();
+                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                if (pickedGroups) { //check for otus with no seqs in them
+                    eliminateZeroOTUS(lookup);
+                }
+
+
+                if (m->control_pressed) {
+                    delete SharedList; delete groupMap;
+                    for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                    for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;  }
+                    out.close(); m->mothurRemove(filename);
+                    for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));  }
+                    return 0;
+                }
+
+                if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
+                printSharedData(lookup, out); //prints info to the .shared file
+                for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+
+                processedLabels.insert(SharedList->getLabel());
+                userLabels.erase(SharedList->getLabel());
+
+                //restore real lastlabel to save below
+                SharedList->setLabel(saveLabel);
+            }
+
+
+            lastLabel = SharedList->getLabel();
+
+            delete SharedList;
+            SharedList = input.getSharedListVector(); //get new list vector to process
+        }
+
+        //labels are still unprocessed and the last label read was never printed: use it for them
+        set<string>::iterator it;
+        bool needToRun = false;
+        for (it = userLabels.begin(); it != userLabels.end(); it++) {
+            if (processedLabels.count(lastLabel) != 1) {
+                needToRun = true;
+            }
+        }
+
+        //run last label if you need to
+        if (needToRun == true)  {
+            if (SharedList != NULL) {   delete SharedList;  }
+            SharedList = input.getSharedListVector(lastLabel); //get new list vector to process
+
+            lookup = SharedList->getSharedRAbundVector();
+            m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+            if (pickedGroups) { //check for otus with no seqs in them
+                eliminateZeroOTUS(lookup);
+            }
+
+            if (m->control_pressed) {
+                //release the list and its vectors too, like the cancel paths inside the loop above
+                delete SharedList; delete groupMap;
+                for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {  delete it3->second;   }
+                out.close(); m->mothurRemove(filename);
+                for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));  }
+                return 0;
+            }
+
+            if (!m->printedHeaders) { lookup[0]->printHeaders(out); }
+            printSharedData(lookup, out); //prints info to the .shared file
+            for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+            delete SharedList;
+        }
+
+        out.close();
+
+        for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+            delete it3->second;
+        }
+
+        delete groupMap;
+
+        if (m->control_pressed) {
+            m->mothurRemove(filename);
+            for (int i=0; i<Groups.size(); i++) {  m->mothurRemove((fileroot + Groups[i] + ".rabund"));  }
+            return 0;
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SharedCommand", "createSharedFromListGroup");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+void SharedCommand::printSharedData(vector<SharedRAbundVector*> thislookup, ofstream& out) {
try {
if (order.size() == 0) { //user has not specified an order so do aplabetically
}
}
//**********************************************************************************************************************
-int SharedCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
- try {
-
- vector<SharedRAbundVector*> newLookup;
- for (int i = 0; i < thislookup.size(); i++) {
- SharedRAbundVector* temp = new SharedRAbundVector();
- temp->setLabel(thislookup[i]->getLabel());
- temp->setGroup(thislookup[i]->getGroup());
- newLookup.push_back(temp);
- }
-
- //for each bin
- for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
- if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
-
- //look at each sharedRabund and make sure they are not all zero
- bool allZero = true;
- for (int j = 0; j < thislookup.size(); j++) {
- if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
- }
-
- //if they are not all zero add this bin
- if (!allZero) {
- for (int j = 0; j < thislookup.size(); j++) {
- newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
- }
- //if there is a bin label use it otherwise make one
- }
- //else{ cout << "bin # " << i << " is all zeros" << endl; }
- }
-
- for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
- thislookup = newLookup;
-
- return 0;
-
- }
- catch(exception& e) {
- m->errorOut(e, "SharedCommand", "eliminateZeroOTUS");
- exit(1);
- }
-}
-//**********************************************************************************************************************
-int SharedCommand::createMisMatchFile() {
+int SharedCommand::createMisMatchFile(SharedListVector* SharedList, GroupMap* groupMap) {
try {
ofstream outMisMatch;
string outputMisMatchName = outputDir + m->getRootName(m->getSimpleName(listfile));
}
}
//**********************************************************************************************************************
-int SharedCommand::ListGroupSameSeqs() {
+int SharedCommand::ListGroupSameSeqs(vector<string>& groupMapsSeqs, SharedListVector* SharedList) {
try {
-
int error = 0;
-
- vector<string> groupMapsSeqs = groupMap->getNamesSeqs();
set<string> groupNamesSeqs;
for(int i = 0; i < groupMapsSeqs.size(); i++) {
void help() { m->mothurOut(getHelpString()); }
private:
- void printSharedData(vector<SharedRAbundVector*>);
- int createMisMatchFile();
+ void printSharedData(vector<SharedRAbundVector*>, ofstream&);
+ int createMisMatchFile(SharedListVector*, GroupMap*);
int readOrderFile();
bool isValidGroup(string, vector<string>);
int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
- int ListGroupSameSeqs();
+ int ListGroupSameSeqs(vector<string>&, SharedListVector*);
+ int createSharedFromListGroup(string);
+ int createSharedFromBiom(string);
+ string getTag(string&);
+ vector<string> readRows(string, ifstream&, int&);
+ int getDims(string, int&, int&);
+ vector<SharedRAbundVector*> readData(string, string, ifstream&, vector<string>&, int);
- SharedListVector* SharedList;
- InputData* input;
- GroupMap* groupMap;
vector<string> Groups, outputNames, order;
set<string> labels;
- ofstream out;
- string filename, fileroot, outputDir, listfile, groupfile, ordergroupfile;
+ string fileroot, outputDir, listfile, groupfile, biomfile, ordergroupfile;
bool firsttime, pickedGroups, abort, allLines;
map<string, ofstream*> filehandles;
map<string, ofstream*>::iterator it3;
//are we at the beginning of the file??
if (m->saveNextLabel == "") {
f >> label;
-
+
//is this a shared file that has headers
if (label == "label") {
//gets "group"
if (m->control_pressed) { break; }
string temp;
iStringStream >> temp; m->gobble(iStringStream);
-
+
m->binLabelsInFile.push_back(temp);
}
- f >> label;
- }
- }else { label = m->saveNextLabel; }
+ f >> label >> groupN >> num;
+ }else {
+ //read in first row since you know there is at least 1 group.
+ f >> groupN >> num;
+
+ //make binlabels because we don't have any
+ string snumBins = toString(num);
+ m->binLabelsInFile.clear();
+ for (int i = 0; i < num; i++) {
+ //if there is a bin label use it otherwise make one
+ string binLabel = "Otu";
+ string sbinNumber = toString(i+1);
+ if (sbinNumber.length() < snumBins.length()) {
+ int diff = snumBins.length() - sbinNumber.length();
+ for (int h = 0; h < diff; h++) { binLabel += "0"; }
+ }
+ binLabel += sbinNumber;
+ m->binLabelsInFile.push_back(binLabel);
+ }
+ }
+ }else {
+ label = m->saveNextLabel;
+
+ //read in first row since you know there is at least 1 group.
+ f >> groupN >> num;
+ }
//reset labels, currentLabels may have gotten changed as otus were eliminated because of group choices or sampling
m->currentBinLabels = m->binLabelsInFile;
- //read in first row since you know there is at least 1 group.
- f >> groupN >> num;
-
holdLabel = label;
//add new vector to lookup
m->binLabelsInFile.push_back(temp);
}
- f >> label;
- }
- }else { label = m->saveNextLabel; }
+ f >> label >> groupN >> num;
+ }else {
+ //read in first row since you know there is at least 1 group.
+ f >> groupN >> num;
+
+ //make binlabels because we don't have any
+ string snumBins = toString(num);
+ m->binLabelsInFile.clear();
+ for (int i = 0; i < num; i++) {
+ //if there is a bin label use it otherwise make one
+ string binLabel = "Otu";
+ string sbinNumber = toString(i+1);
+ if (sbinNumber.length() < snumBins.length()) {
+ int diff = snumBins.length() - sbinNumber.length();
+ for (int h = 0; h < diff; h++) { binLabel += "0"; }
+ }
+ binLabel += sbinNumber;
+ m->binLabelsInFile.push_back(binLabel);
+ }
+ }
+ }else {
+ label = m->saveNextLabel;
+
+ //read in first row since you know there is at least 1 group.
+ f >> groupN >> num;
+ }
//reset labels, currentLabels may have gotten changed as otus were eliminated because of group choices or sampling
m->currentBinLabels = m->binLabelsInFile;
- //read in first row since you know there is at least 1 group.
- f >> groupN >> num;
-
holdLabel = label;
//add new vector to lookup
if (path == "") { parameters["file"] = inputDir + it->second; }
}
}
-
-
+
+ //if the user changes the output directory command factory will send this info to us in the output parameter
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
//check for required parameters
flowFileName = validParameter.validFile(parameters, "flow", true);
flowFilesFileName = validParameter.validFile(parameters, "file", true);
}
else{
ofstream temp;
-
+
+ string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
+
//flow.files = 9 character offset
compositeFASTAFileName = outputDir + flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
m->openOutputFile(compositeFASTAFileName, temp);
if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
}
else{
+ outputDir += m->hasPath(flowFileName);
flowFileVector.push_back(flowFileName);
}
-
-
- //if the user changes the output directory command factory will send this info to us in the output parameter
- outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
- outputDir = "";
- outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it
- }
-
-
+
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
string temp;
#include "subsample.h"
+//**********************************************************************************************************************
+//Creates a new tree holding a subsample of T.
+// T     - tree to sample from.
+// tmap  - treemap for T; sequences that are not sampled are removed from it, so pass a copy if you need the original.
+// whole - name map (dupName -> uniqueName), empty when no name file was given.
+// size  - number of sequences to draw from every group.
+//Returns the new tree, or NULL if the run was aborted while the treemap was being pruned.
+Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size) {
+    try {
+        Tree* sampledTree = NULL;
+
+        //draw the names, then collapse them to their unique representatives (keepers is rewritten by deconvolute when whole is not empty)
+        vector<string> keepers = getSample(tmap, size);
+        map<string, string> keeperNames = deconvolute(whole, keepers);
+
+        //prune the treemap down to the names we are keeping
+        int idx = 0;
+        while (idx < tmap->namesOfSeqs.size()) {
+            bool isKeeper = (std::find(keepers.begin(), keepers.end(), tmap->namesOfSeqs[idx]) != keepers.end());
+
+            if (m->control_pressed) { return sampledTree; }
+
+            if (isKeeper) { idx++; }
+            else {
+                //removeSeq removes the name from namesOfSeqs, so idx already refers to the next name
+                tmap->removeSeq(tmap->namesOfSeqs[idx]);
+            }
+        }
+
+        //number of leaves: the unique representatives if we have a name map, otherwise every sampled name
+        int numUniques = keeperNames.size();
+        if (keeperNames.empty()) { numUniques = keepers.size(); }
+
+        sampledTree = new Tree(numUniques, tmap); //numNodes, treemap
+        sampledTree->getSubTree(T, keepers, keeperNames);
+
+        return sampledTree;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "getSample-Tree");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//assumes whole maps dupName -> uniqueName
+//Builds the name map for a subsample.
+// whole  - complete name map, dupName -> uniqueName. Empty when the user gave no name file.
+// wanted - sampled names. When whole is not empty it is replaced by the list of unique representatives,
+//          in order of first appearance.
+//Returns uniqueName -> comma separated list of the sampled names it represents; empty if whole is empty.
+//A sampled name missing from whole is reported and sets m->control_pressed.
+map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
+    try {
+        map<string, string> repToDups;
+
+        //no name file, so there is nothing to collapse and wanted stays as it is
+        if (whole.empty()) { return repToDups; }
+
+        vector<string> reps;
+        for (vector<string>::iterator itWanted = wanted.begin(); itWanted != wanted.end(); itWanted++) {
+
+            if (m->control_pressed) { break; }
+
+            const string& dupName = *itWanted;
+
+            map<string, string>::iterator itWhole = whole.find(dupName);
+            if (itWhole == whole.end()) {
+                m->mothurOut("[ERROR]: "+dupName+" is not in your name file, please correct.\n"); m->control_pressed = true;
+                continue;
+            }
+
+            const string& repName = itWhole->second;
+
+            map<string, string>::iterator itRep = repToDups.find(repName);
+            if (itRep == repToDups.end()) { //first sighting of this representative
+                repToDups[repName] = dupName;
+                reps.push_back(repName);
+            }else { //already have it, extend its list of dups
+                (itRep->second) += "," + dupName;
+            }
+        }
+
+        wanted = reps;
+        return repToDups;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "deconvolute");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Randomly picks size sequence names from every group in tMap.
+//Returns the picked names, group after group. A group holding fewer than size sequences is reported,
+//contributes nothing and sets m->control_pressed.
+vector<string> SubSample::getSample(TreeMap* tMap, int size) {
+    try {
+        vector<string> picked;
+
+        vector<string> groupNames = tMap->getNamesOfGroups();
+        for (int g = 0; g < groupNames.size(); g++) {
+
+            if (m->control_pressed) { break; }
+
+            //fetch the members of this one group
+            vector<string> oneGroup(1, groupNames[g]);
+            vector<string> members = tMap->getNamesSeqs(oneGroup);
+            int memberCount = members.size();
+
+            if (memberCount < size) {
+                m->mothurOut("[ERROR]: You have selected a size that is larger than "+groupNames[g]+" number of sequences.\n"); m->control_pressed = true;
+                continue;
+            }
+
+            //shuffle, then keep the first size names
+            random_shuffle(members.begin(), members.end());
+
+            int taken = 0;
+            while (taken < size) { picked.push_back(members[taken]); taken++; }
+        }
+
+        return picked;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "getSample-TreeMap");
+        exit(1);
+    }
+}
//**********************************************************************************************************************
vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
try {
}
catch(exception& e) {
- m->errorOut(e, "SubSample", "getSample");
+ m->errorOut(e, "SubSample", "getSample-shared");
exit(1);
}
}
#include "mothurout.h"
#include "sharedrabundvector.h"
+#include "treemap.h"
+#include "tree.h"
//subsampling overwrites the sharedRabunds. If you need to reuse the original use the getSamplePreserve function.
~SubSample() {}
vector<string> getSample(vector<SharedRAbundVector*>&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first.
-
+
+ Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
private:
MothurOut* m;
int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
+
+ vector<string> getSample(TreeMap*, int); //returns map contains names of seqs in subsample -> group.
+ map<string, string> deconvolute(map<string, string> wholeSet, vector<string>& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap.
+
};
numLeaves = num;
numNodes = 2*numLeaves - 1;
-
+
tree.resize(numNodes);
}
catch(exception& e) {
Tree::Tree(string g) { //do not use tree generated by this its just to extract the treenames, its a chicken before the egg thing that needs to be revisited.
try {
m = MothurOut::getInstance();
-
- tmap = NULL;
-
parseTreeFile(); m->runParse = false;
}
catch(exception& e) {
exit(1);
}
}
-
+/*****************************************************************/
+//Builds a tree by repeatedly merging the two most similar entries of a similarity matrix.
+// t    - treemap that supplies the group of every leaf; the leaf index of each name is stored back into it.
+// sims - similarity matrix; row g is treated as leaf g, which is named m->Treenames[g]. The matrix is consumed:
+//        the merged row is averaged with its partner and retired entries are overwritten with -1000.
+//If the run is aborted (m->control_pressed) the tree is left incomplete and must be discarded by the caller.
+//If no pair is left to merge before numLeaves-1 merges are done (e.g. the matrix is smaller than the number of
+//tree names) an error is reported and m->control_pressed is set.
+Tree::Tree(TreeMap* t, vector< vector<double> >& sims) : tmap(t) {
+    try {
+        m = MothurOut::getInstance();
+
+        if (m->runParse == true) { parseTreeFile(); m->runParse = false; }
+        numLeaves = m->Treenames.size();
+        numNodes = 2*numLeaves - 1;
+
+        tree.resize(numNodes);
+
+        //initialize groupNodeInfo
+        vector<string> groupNames = tmap->getNamesOfGroups();
+        for (int i = 0; i < groupNames.size(); i++) {
+            groupNodeInfo[groupNames[i]].resize(0);
+        }
+
+        //initialize tree with correct number of nodes, name and group info.
+        for (int i = 0; i < numNodes; i++) {
+            //initialize leaf nodes
+            if (i < numLeaves) {
+                tree[i].setName(m->Treenames[i]);
+
+                //save group info
+                string group = tmap->getGroup(m->Treenames[i]);
+
+                vector<string> tempGroups; tempGroups.push_back(group);
+                tree[i].setGroup(tempGroups);
+                groupNodeInfo[group].push_back(i);
+
+                //set pcount and pGroup for groupname to 1.
+                tree[i].pcount[group] = 1;
+                tree[i].pGroups[group] = 1;
+
+                //Treemap knows name, group and index to speed up search
+                tmap->setIndex(m->Treenames[i], i);
+
+            //initialize non leaf nodes
+            }else {
+                tree[i].setName("");
+                vector<string> tempGroups;
+                tree[i].setGroup(tempGroups);
+            }
+        }
+
+        //build tree from matrix
+        //initialize indexes
+        map<int, int> indexes; //maps row in simMatrix to vector index in the tree
+        for (int g = 0; g < numLeaves; g++) { indexes[g] = g; }
+
+        //do merges and create tree structure by setting parents and children
+        //there are numLeaves - 1 merges to do
+        for (int i = 0; i < (numLeaves - 1); i++) {
+            float largest = -1000.0;
+
+            if (m->control_pressed) { break; }
+
+            int row = -1; int column = -1;
+            //find largest value in sims matrix by searching lower triangle
+            for (int j = 1; j < sims.size(); j++) {
+                for (int k = 0; k < j; k++) {
+                    if (sims[j][k] > largest) { largest = sims[j][k]; row = j; column = k; }
+                }
+            }
+
+            //every remaining entry is retired: there is nothing left to merge, so do not index with an unset row/column
+            if (row == -1) {
+                m->mothurOut("[ERROR]: unable to find two nodes to merge, your similarity matrix does not match the names in your tree.\n"); m->control_pressed = true;
+                break;
+            }
+
+            //set non-leaf node info and update leaves to know their parents
+            //non-leaf
+            tree[numLeaves + i].setChildren(indexes[row], indexes[column]);
+
+            //parents
+            tree[indexes[row]].setParent(numLeaves + i);
+            tree[indexes[column]].setParent(numLeaves + i);
+
+            //blength = distance / 2;
+            float blength = ((1.0 - largest) / 2);
+
+            //branchlengths
+            tree[indexes[row]].setBranchLength(blength - tree[indexes[row]].getLengthToLeaves());
+            tree[indexes[column]].setBranchLength(blength - tree[indexes[column]].getLengthToLeaves());
+
+            //set your length to leaves to your childs length plus branchlength
+            tree[numLeaves + i].setLengthToLeaves(tree[indexes[row]].getLengthToLeaves() + tree[indexes[row]].getBranchLength());
+
+            //update index
+            indexes[row] = numLeaves+i;
+            indexes[column] = numLeaves+i;
+
+            //remove highest value that caused the merge.
+            sims[row][column] = -1000.0;
+            sims[column][row] = -1000.0;
+
+            //merge values in simsMatrix
+            for (int n = 0; n < sims.size(); n++) {
+                //row becomes merge of 2 groups
+                sims[row][n] = (sims[row][n] + sims[column][n]) / 2;
+                sims[n][row] = sims[row][n];
+                //delete column
+                sims[column][n] = -1000.0;
+                sims[n][column] = -1000.0;
+            }
+        }
+
+        //an aborted build has no complete root path, leave the partial tree for the caller to discard
+        if (m->control_pressed) { return; }
+
+        //adjust tree to make sure root to tip length is .5
+        int root = findRoot();
+        tree[root].setBranchLength((0.5 - tree[root].getLengthToLeaves()));
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "Tree", "Tree");
+        exit(1);
+    }
+}
/*****************************************************************/
Tree::~Tree() {}
/*****************************************************************/
-void Tree::addNamesToCounts() {
+void Tree::addNamesToCounts(map<string, string> nameMap) {
try {
//ex. seq1 seq2,seq3,se4
// seq1 = pasture
string name = tree[i].getName();
- map<string, string>::iterator itNames = m->names.find(name);
+ map<string, string>::iterator itNames = nameMap.find(name);
- if (itNames == m->names.end()) { m->mothurOut(name + " is not in your name file, please correct."); m->mothurOutEndLine(); exit(1); }
+ if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file, please correct."); m->mothurOutEndLine(); exit(1); }
else {
vector<string> dupNames;
- m->splitAtComma(m->names[name], dupNames);
+ m->splitAtComma(nameMap[name], dupNames);
map<string, int>::iterator itCounts;
int maxPars = 1;
}
}
/*****************************************************************/
-int Tree::assembleTree() {
+int Tree::assembleTree(map<string, string> nameMap) {
try {
- //float A = clock();
+ //save for later
+ names = nameMap;
//if user has given a names file we want to include that info in the pgroups and pcount info.
- if(m->names.size() != 0) { addNamesToCounts(); }
+ if(nameMap.size() != 0) { addNamesToCounts(nameMap); }
//build the pGroups in non leaf nodes to be used in the parsimony calcs.
for (int i = numLeaves; i < numNodes; i++) {
tree[i].pGroups = (mergeGroups(i));
tree[i].pcount = (mergeGcounts(i));
}
- //float B = clock();
- //cout << "assembleTree\t" << (B-A) / CLOCKS_PER_SEC << endl;
+
return 0;
}
catch(exception& e) {
exit(1);
}
}
-/*****************************************************************/
+/*****************************************************************
int Tree::assembleTree(string n) {
try {
}
}
/*****************************************************************/
-void Tree::getSubTree(Tree* copy, vector<string> Groups) {
+//assumes leaf node names are in groups and no names file - used by indicator command
+void Tree::getSubTree(Tree* Ctree, vector<string> Groups) {
try {
-
+
+ //copy Tree since we are going to destroy it
+ Tree* copy = new Tree(tmap);
+ copy->getCopy(Ctree);
+ map<string, string> empty;
+ copy->assembleTree(empty);
+
//we want to select some of the leaf nodes to create the output tree
//go through the input Tree starting at parents of leaves
for (int i = 0; i < numNodes; i++) {
//you found the root
if (copy->tree[i].getParent() == -1) { root = i; break; }
}
-
+
int nextSpot = numLeaves;
populateNewTree(copy->tree, root, nextSpot);
+
+ delete copy;
}
catch(exception& e) {
- m->errorOut(e, "Tree", "getCopy");
+ m->errorOut(e, "Tree", "getSubTree");
+ exit(1);
+ }
+}
+/*****************************************************************/
+//assumes nameMap contains unique names as key or is empty.
+//assumes numLeaves defined in tree constructor equals size of seqsToInclude and seqsToInclude only contains unique seqs.
+//Fills this tree with the part of copy that contains seqsToInclude, then rebuilds the parsimony info.
+// copy          - tree to take the subtree from.
+// seqsToInclude - names to keep; its size must equal numLeaves.
+// nameMap       - names to add to the leaf counts, skipped when empty.
+//Returns 1 if the run was aborted while merging, otherwise 0. A size mismatch is reported, sets m->control_pressed and also returns 0.
+int Tree::getSubTree(Tree* copy, vector<string> seqsToInclude, map<string, string> nameMap) {
+    try {
+
+        if (seqsToInclude.size() != numLeaves) {
+            m->mothurOut("[ERROR]: numLeaves does not equal numUniques, cannot create subtree.\n");
+            m->control_pressed = true;
+            return 0;
+        }
+
+        getSubTree(copy, seqsToInclude);
+        if (!nameMap.empty()) { addNamesToCounts(nameMap); }
+
+        //fill pGroups and pcount of the non leaf nodes, they are used in the parsimony calcs.
+        for (int node = numLeaves; node < numNodes; node++) {
+            if (m->control_pressed) { return 1; }
+
+            tree[node].pGroups = mergeGroups(node);
+            tree[node].pcount = mergeGcounts(node);
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "Tree", "getSubTree");
exit(1);
}
}
}
}
/**************************************************************************************************/
-
void Tree::randomLabels(vector<string> g) {
try {
exit(1);
}
}
-/**************************************************************************************************
-
-void Tree::randomLabels(string groupA, string groupB) {
- try {
- int numSeqsA = globaldata->gTreemap->seqsPerGroup[groupA];
- int numSeqsB = globaldata->gTreemap->seqsPerGroup[groupB];
-
- vector<string> randomGroups(numSeqsA+numSeqsB, groupA);
- for(int i=numSeqsA;i<randomGroups.size();i++){
- randomGroups[i] = groupB;
- }
- random_shuffle(randomGroups.begin(), randomGroups.end());
-
- int randomCounter = 0;
- for(int i=0;i<numLeaves;i++){
- if(tree[i].getGroup() == groupA || tree[i].getGroup() == groupB){
- tree[i].setGroup(randomGroups[randomCounter]);
- tree[i].pcount.clear();
- tree[i].pcount[randomGroups[randomCounter]] = 1;
- tree[i].pGroups.clear();
- tree[i].pGroups[randomGroups[randomCounter]] = 1;
- randomCounter++;
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "Tree", "randomLabels");
- exit(1);
- }
-}
-**************************************************************************************************/
+/**************************************************************************************************/
void Tree::randomBlengths() {
try {
for(int i=numNodes-1;i>=0;i--){
/*************************************************************************************************/
void Tree::assembleRandomUnifracTree(vector<string> g) {
randomLabels(g);
- assembleTree("noNameCounts");
+ map<string, string> empty;
+ assembleTree(empty);
}
/*************************************************************************************************/
void Tree::assembleRandomUnifracTree(string groupA, string groupB) {
-
vector<string> temp; temp.push_back(groupA); temp.push_back(groupB);
randomLabels(temp);
- assembleTree("noNameCounts");
+ map<string, string> empty;
+ assembleTree(empty);
}
/*************************************************************************************************/
//for now it's just random topology but may become random labels as well later that why this is such a simple function now...
void Tree::assembleRandomTree() {
randomTopology();
- assembleTree();
+ map<string, string> empty;
+ assembleTree(empty);
}
/**************************************************************************************************/
}
}
/*****************************************************************/
+//Writes the whole tree to out starting at the root, with leaves handled according to nameMap (see printBranch),
+//followed by ";" and a newline.
+void Tree::print(ostream& out, map<string, string> nameMap) {
+    try {
+        printBranch(findRoot(), out, nameMap);
+        out << ";" << endl;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "Tree", "print");
+        exit(1);
+    }
+}
+/*****************************************************************/
void Tree::print(ostream& out, string mode) {
try {
int root = findRoot();
}
}
/*****************************************************************/
-void Tree::printBranch(int node, ostream& out, string mode) {
+//Recursively writes the subtree rooted at node to out as parenthesized text: "(left,right)" for an inner node,
+//the name for a leaf, each followed by ":length" when the node's branch length is not -1.
+// node  - index into tree of the node to print.
+// out   - stream that receives the text.
+// names - maps a leaf name to a comma separated list of names. A leaf whose list holds more than one name is
+//         written as a parenthesized group of those names (each with length 0.0 when the leaf has a length).
+//         A leaf that is missing from names is still printed, but an error is reported.
+//NOTE(review): names is passed by value, so every recursive call gets its own copy of the map.
+//NOTE(review): the parameter hides a Tree member that is also called names - confirm that is intended.
+void Tree::printBranch(int node, ostream& out, map<string, string> names) {
try {
// you are not a leaf
+ if (tree[node].getLChild() != -1) {
+ out << "(";
+ printBranch(tree[node].getLChild(), out, names);
+ out << ",";
+ printBranch(tree[node].getRChild(), out, names);
+ out << ")";
+
+ //if there is a branch length then print it
+ if (tree[node].getBranchLength() != -1) {
+ out << ":" << tree[node].getBranchLength();
+ }
+
+ }else { //you are a leaf
+ map<string, string>::iterator itNames = names.find(tree[node].getName());
+
+ //the leaf text is assembled here and written to out in one go below
+ string outputString = "";
+ if (itNames != names.end()) {
+
+ vector<string> dupNames;
+ m->splitAtComma((itNames->second), dupNames);
+
+ //only one name in the list: print the leaf under its own name
+ if (dupNames.size() == 1) {
+ outputString += tree[node].getName();
+ if (tree[node].getBranchLength() != -1) {
+ outputString += ":" + toString(tree[node].getBranchLength());
+ }
+ }else {
+ //several names: print them as a group, the group as a whole carries the leaf's branch length
+ outputString += "(";
+
+ //all names but the last are followed by a comma
+ for (int u = 0; u < dupNames.size()-1; u++) {
+ outputString += dupNames[u];
+
+ if (tree[node].getBranchLength() != -1) {
+ outputString += ":" + toString(0.0);
+ }
+ outputString += ",";
+ }
+
+ outputString += dupNames[dupNames.size()-1];
+ if (tree[node].getBranchLength() != -1) {
+ outputString += ":" + toString(0.0);
+ }
+
+ outputString += ")";
+ if (tree[node].getBranchLength() != -1) {
+ outputString += ":" + toString(tree[node].getBranchLength());
+ }
+ }
+ }else {
+ //leaf is not in names: print it as is and report the problem
+ outputString = tree[node].getName();
+ //if there is a branch length then print it
+ if (tree[node].getBranchLength() != -1) {
+ outputString += ":" + toString(tree[node].getBranchLength());
+ }
+
+ m->mothurOut("[ERROR]: " + tree[node].getName() + " is not in your namefile, please correct."); m->mothurOutEndLine();
+ }
+
+ out << outputString;
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Tree", "printBranch");
+ exit(1);
+ }
+}
+/*****************************************************************/
+void Tree::printBranch(int node, ostream& out, string mode) {
+ try {
+
+ // you are not a leaf
if (tree[node].getLChild() != -1) {
out << "(";
printBranch(tree[node].getLChild(), out, mode);
if (tree[node].getBranchLength() != -1) {
out << ":" << tree[node].getBranchLength();
}
- }else if (mode == "deunique") {
- //if there is a branch length then print it
- if (tree[node].getBranchLength() != -1) {
- out << ":" << tree[node].getBranchLength();
- }
}
}else { //you are a leaf
string leafGroup = tmap->getGroup(tree[node].getName());
if (tree[node].getBranchLength() != -1) {
out << ":" << tree[node].getBranchLength();
}
- }else if (mode == "deunique") {
- map<string, string>::iterator itNames = m->names.find(tree[node].getName());
-
- string outputString = "";
- if (itNames != m->names.end()) {
-
- vector<string> dupNames;
- m->splitAtComma((itNames->second), dupNames);
-
- if (dupNames.size() == 1) {
- outputString += tree[node].getName();
- if (tree[node].getBranchLength() != -1) {
- outputString += ":" + toString(tree[node].getBranchLength());
- }
- }else {
- outputString += "(";
-
- for (int u = 0; u < dupNames.size()-1; u++) {
- outputString += dupNames[u];
-
- if (tree[node].getBranchLength() != -1) {
- outputString += ":" + toString(0.0);
- }
- outputString += ",";
- }
-
- outputString += dupNames[dupNames.size()-1];
- if (tree[node].getBranchLength() != -1) {
- outputString += ":" + toString(0.0);
- }
-
- outputString += ")";
- if (tree[node].getBranchLength() != -1) {
- outputString += ":" + toString(tree[node].getBranchLength());
- }
- }
- }else {
- outputString = tree[node].getName();
- //if there is a branch length then print it
- if (tree[node].getBranchLength() != -1) {
- outputString += ":" + toString(tree[node].getBranchLength());
- }
-
- m->mothurOut("[ERROR]: " + tree[node].getName() + " is not in your namefile, please correct."); m->mothurOutEndLine();
- }
-
- out << outputString;
}
}
Tree(string); //do not use tree generated by this constructor its just to extract the treenames, its a chicken before the egg thing that needs to be revisited.
Tree(int, TreeMap*);
Tree(TreeMap*); //to generate a tree from a file
+ Tree(TreeMap*, vector< vector<double> >&); //create tree from sim matrix
~Tree();
+ TreeMap* getTreeMap() { return tmap; }
void getCopy(Tree*); //makes tree a copy of the one passed in.
void getSubTree(Tree*, vector<string>); //makes tree a that contains only the names passed in.
+ int getSubTree(Tree* originalToCopy, vector<string> seqToInclude, map<string, string> nameMap); //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided.
+
void assembleRandomTree();
void assembleRandomUnifracTree(vector<string>);
void assembleRandomUnifracTree(string, string);
+
void createNewickFile(string);
int getIndex(string);
void setIndex(string, int);
void printTree();
void print(ostream&);
void print(ostream&, string);
+ void print(ostream&, map<string, string>);
int findRoot(); //return index of root node
//this function takes the leaf info and populates the non leaf nodes
- int assembleTree();
- int assembleTree(string);
+ int assembleTree(map<string, string>);
vector<Node> tree; //the first n nodes are the leaves, where n is the number of sequences.
map< string, vector<int> > groupNodeInfo; //maps group to indexes of leaf nodes with that group, different groups may contain same node because of names file.
ofstream out;
string filename;
+ map<string, string> names;
map<string, int>::iterator it, it2;
map<string, int> mergeGroups(int); //returns a map with a groupname and the number of times that group was seen in the children
map<string,int> mergeGcounts(int);
- void addNamesToCounts();
+ void addNamesToCounts(map<string, string>);
void randomTopology();
void randomBlengths();
void randomLabels(vector<string>);
//void randomLabels(string, string);
- void printBranch(int, ostream&, string); //recursively print out tree
+ void printBranch(int, ostream&, map<string, string>); //recursively print out tree
+ void printBranch(int, ostream&, string);
void parseTreeFile(); //parses through tree file to find names of nodes and number of them
//this is required in case user has sequences in the names file that are
//not included in the tree.
CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson", "jclass-thetayc", "", "", "",true,false); parameters.push_back(pcalc);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput);
+//CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
Tree* TreeGroupCommand::createTree(vector< vector<double> >& simMatrix){
try {
//create tree
- t = new Tree(tmap);
+ t = new Tree(tmap, simMatrix);
- //initialize index
- map<int, int> index; //maps row in simMatrix to vector index in the tree
- for (int g = 0; g < numGroups; g++) { index[g] = g; }
+ if (m->control_pressed) { delete t; t = NULL; return t; }
- //do merges and create tree structure by setting parents and children
- //there are numGroups - 1 merges to do
- for (int i = 0; i < (numGroups - 1); i++) {
- float largest = -1000.0;
-
- if (m->control_pressed) { delete t; t = NULL; return t; }
-
- int row, column;
- //find largest value in sims matrix by searching lower triangle
- for (int j = 1; j < simMatrix.size(); j++) {
- for (int k = 0; k < j; k++) {
- if (simMatrix[j][k] > largest) { largest = simMatrix[j][k]; row = j; column = k; }
- }
- }
+ //assemble tree
+ map<string, string> empty;
+ t->assembleTree(empty);
- //set non-leaf node info and update leaves to know their parents
- //non-leaf
- t->tree[numGroups + i].setChildren(index[row], index[column]);
-
- //parents
- t->tree[index[row]].setParent(numGroups + i);
- t->tree[index[column]].setParent(numGroups + i);
-
- //blength = distance / 2;
- float blength = ((1.0 - largest) / 2);
-
- //branchlengths
- t->tree[index[row]].setBranchLength(blength - t->tree[index[row]].getLengthToLeaves());
- t->tree[index[column]].setBranchLength(blength - t->tree[index[column]].getLengthToLeaves());
-
- //set your length to leaves to your childs length plus branchlength
- t->tree[numGroups + i].setLengthToLeaves(t->tree[index[row]].getLengthToLeaves() + t->tree[index[row]].getBranchLength());
-
-
- //update index
- index[row] = numGroups+i;
- index[column] = numGroups+i;
-
- //remove highest value that caused the merge.
- simMatrix[row][column] = -1000.0;
- simMatrix[column][row] = -1000.0;
-
- //merge values in simsMatrix
- for (int n = 0; n < simMatrix.size(); n++) {
- //row becomes merge of 2 groups
- simMatrix[row][n] = (simMatrix[row][n] + simMatrix[column][n]) / 2;
- simMatrix[n][row] = simMatrix[row][n];
- //delete column
- simMatrix[column][n] = -1000.0;
- simMatrix[n][column] = -1000.0;
- }
- }
-
- //adjust tree to make sure root to tip length is .5
- int root = t->findRoot();
- t->tree[root].setBranchLength((0.5 - t->tree[root].getLengthToLeaves()));
-
- //assemble tree
- t->assembleTree();
-
- if (m->control_pressed) { delete t; t = NULL; return t; }
-
return t;
-
}
catch(exception& e) {
m->errorOut(e, "TreeGroupCommand", "createTree");
if (m->control_pressed) { for (int k = 0; k < trees.size(); k++) { delete trees[k]; } }
Consensus consensus;
- Tree* conTree = consensus.getTree(trees, tmap);
+ //clear old tree names if any
+ m->Treenames.clear(); m->Treenames = m->getGroups(); //may have changed if subsample eliminated groups
+ Tree* conTree = consensus.getTree(trees);
//create a new filename
string conFile = outputDir + m->getRootName(m->getSimpleName(inputfile)) + treeCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".cons.tre";
- outputNames.push_back(conFile); outputTypes["tree"].push_back(outputFile);
+ outputNames.push_back(conFile); outputTypes["tree"].push_back(conFile);
ofstream outTree;
m->openOutputFile(conFile, outTree);
/************************************************************/
TreeMap::~TreeMap(){}
+/************************************************************/
+//Reads a group file (sequence name in the first column, group in the second) into this treemap.
+// gf - name of the group file, remembered in groupFileName.
+//Returns 1 if the run was aborted or the file had problems (a repeated sequence name, or a last
+//sequence without a group), otherwise 0.
+int TreeMap::readMap(string gf) {
+
+    groupFileName = gf;
+    m->openInputFile(gf, fileHandle);
+
+    string seqName, seqGroup;
+    int error = 0;
+
+    while(fileHandle){
+        //start clean so a failed read can not be mistaken for the previous record
+        seqName = ""; seqGroup = "";
+
+        fileHandle >> seqName; m->gobble(fileHandle); //read from first column
+        fileHandle >> seqGroup; //read from second column
+
+        if (m->control_pressed) { fileHandle.close(); return 1; }
+
+        //nothing more to read (empty file or end of file), or the last record is cut short:
+        //stop here instead of storing an empty name or an empty group
+        if (fileHandle.fail()) {
+            if (seqName != "") { error = 1; m->mothurOut("[WARNING]: " + seqName + " does not have a group assigned in your groupfile, ignoring."); m->mothurOutEndLine(); }
+            break;
+        }
+
+        setNamesOfGroups(seqGroup);
+
+        map<string, GroupIndex>::iterator itCheck = treemap.find(seqName);
+        if (itCheck != treemap.end()) { error = 1; m->mothurOut("[WARNING]: Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
+        else {
+            namesOfSeqs.push_back(seqName);
+            treemap[seqName].groupname = seqGroup; //store data in map
+
+            it2 = seqsPerGroup.find(seqGroup);
+            if (it2 == seqsPerGroup.end()) { //if it's a new group
+                seqsPerGroup[seqGroup] = 1;
+            }else {//it's a group we already have
+                seqsPerGroup[seqGroup]++;
+            }
+        }
+
+        m->gobble(fileHandle);
+    }
+    fileHandle.close();
+
+    return error;
+}
/************************************************************/
int TreeMap::readMap() {
int error = 0;
while(fileHandle){
- fileHandle >> seqName; //read from first column
+ fileHandle >> seqName; m->gobble(fileHandle); //read from first column
fileHandle >> seqGroup; //read from second column
if (m->control_pressed) { fileHandle.close(); return 1; }
exit(1);
}
}
+/************************************************************/
+//Makes this treemap hold the same sequences and groups as copy. Always returns 0.
+int TreeMap::getCopy(TreeMap& copy){
+    try {
+        //sequence data is copied straight from the public members
+        treemap = copy.treemap;
+        namesOfSeqs = copy.namesOfSeqs;
+        seqsPerGroup = copy.seqsPerGroup;
+
+        //group data is private, so it comes through the accessors
+        numGroups = copy.getNumGroups();
+        namesOfGroups = copy.getNamesOfGroups();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TreeMap", "getCopy");
+        exit(1);
+    }
+}
+/************************************************************/
+//Returns the name of every sequence stored in treemap, in the map's key order.
+vector<string> TreeMap::getNamesSeqs(){
+    try {
+        vector<string> allNames;
+
+        it = treemap.begin();
+        while (it != treemap.end()) {
+            allNames.push_back(it->first);
+            it++;
+        }
+
+        return allNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TreeMap", "getNamesSeqs");
+        exit(1);
+    }
+}
+/************************************************************/
+//Returns the names of the sequences whose group is one of the groups in picked, in the map's key order.
+vector<string> TreeMap::getNamesSeqs(vector<string> picked){
+    try {
+        vector<string> matches;
+
+        it = treemap.begin();
+        while (it != treemap.end()) {
+            //keep the sequence if it belongs to one of the picked groups
+            bool inPicked = m->inUsersGroups(it->second.groupname, picked);
+            if (inPicked) { matches.push_back(it->first); }
+            it++;
+        }
+
+        return matches;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TreeMap", "getNamesSeqs");
+        exit(1);
+    }
+}
/************************************************************/
*/
#include "mothur.h"
-#include "groupmap.h"
#include "listvector.hpp"
/* This class is used by the read.tree command to build the tree container. */
int vectorIndex;
};
-class GroupMap;
-class ListVector;
-
class TreeMap {
public:
TreeMap() { m = MothurOut::getInstance(); }
TreeMap(string);
~TreeMap();
+
int readMap();
+ int readMap(string);
int getNumGroups();
int getNumSeqs();
void setIndex(string, int); //sequencename, index
sort(namesOfGroups.begin(), namesOfGroups.end());
return namesOfGroups;
}
- vector<string> namesOfSeqs;
- map<string,int> seqsPerGroup; //groupname, number of seqs in that group.
- map<string, GroupIndex> treemap; //sequence name and <groupname, vector index>
- void print(ostream&);
+
+ void print(ostream&);
void makeSim(vector<string>); //takes groupmap info and fills treemap for use by tree.shared command.
void makeSim(ListVector*); //takes listvector info and fills treemap for use by tree.shared command.
-
+ vector<string> getNamesSeqs();
+ vector<string> getNamesSeqs(vector<string>); //get names of seqs belonging to a group or set of groups
+ int getCopy(TreeMap&);
+
+ vector<string> namesOfSeqs;
+ map<string,int> seqsPerGroup; //groupname, number of seqs in that group.
+ map<string, GroupIndex> treemap; //sequence name and <groupname, vector index>
+
+
private:
vector<string> namesOfGroups;
ifstream fileHandle;
--- /dev/null
+//
+// treereader.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 4/11/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "treereader.h"
+#include "readtree.h"
+
+/***********************************************************************/
+
+//Tree file only: the group file and names file names are left empty.
+//readTrees() runs immediately; its return value is not checked here.
+TreeReader::TreeReader(string tf) : treefile(tf), groupfile(""), namefile("") {
+    try {
+        m = MothurOut::getInstance();
+        readTrees();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TreeReader", "TreeReader");
+        exit(1);
+    }
+}
+/***********************************************************************/
+
+//Tree file plus group file: the names file name is left empty.
+//readTrees() runs immediately; its return value is not checked here.
+TreeReader::TreeReader(string tf, string gf) : treefile(tf), groupfile(gf), namefile("") {
+    try {
+        m = MothurOut::getInstance();
+        readTrees();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TreeReader", "TreeReader");
+        exit(1);
+    }
+}
+/***********************************************************************/
+//Tree file, group file and names file.
+//readTrees() runs immediately; its return value is not checked here.
+TreeReader::TreeReader(string tf, string gf, string nf)
+    : treefile(tf),
+      groupfile(gf),
+      namefile(nf)
+{
+    try {
+        m = MothurOut::getInstance();
+        readTrees();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TreeReader", "TreeReader");
+        exit(1);
+    }
+}
+/***********************************************************************/
+bool TreeReader::readTrees() { //builds tmap, reads the optional names file, reads and assembles the trees, then drops from tmap any name found in neither the tree nor the names file
+ try {
+ //returns true on success; returns 0 (false) when the tree read fails or when m->control_pressed is seen during the name check
+ tmap = new TreeMap();
+ if (groupfile != "") { tmap->readMap(groupfile); }
+ else{ //fake out by putting everyone in one group
+ Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
+ for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
+ }
+
+ int numUniquesInName = 0;
+ if (namefile != "") { numUniquesInName = readNamesFile(); } //readNamesFile resets namefile to "" when it rejects the file
+
+ ReadTree* read = new ReadNewickTree(treefile);
+ int readOk = read->read(tmap);
+ //a nonzero result from read() is treated as a failed read
+ if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete read; m->control_pressed=true; return 0; }
+
+ read->AssembleTrees(names);
+ trees = read->getTrees();
+ delete read;
+
+ //make sure all files match
+ //with a namefile, use nameMap.size() as the name count as long as the number of namefile records equals the number of names in the tree; otherwise use the number of names in the tree.
+ int numNamesInTree;
+ if (namefile != "") {
+ if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
+ else { numNamesInTree = m->Treenames.size(); }
+ }else { numNamesInTree = m->Treenames.size(); }
+
+
+ //report and remove any names that are in the group file but not in the tree
+ if (numNamesInTree < tmap->getNumSeqs()) {
+ for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
+ //is that name in the tree?
+ int count = 0;
+ for (int j = 0; j < m->Treenames.size(); j++) {
+ if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
+ count++;
+ }
+ //count equals m->Treenames.size() only when the loop above never hit its break
+ if (m->control_pressed) { for (int i = 0; i < trees.size(); i++) { delete trees[i]; } return 0; } //NOTE(review): the inner i shadows the outer index, and trees still holds the deleted pointers after this return
+
+ //then you did not find it so report it
+ if (count == m->Treenames.size()) {
+ //if it is in your namefile then don't remove
+ map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
+
+ if (it == nameMap.end()) {
+ m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
+ tmap->removeSeq(tmap->namesOfSeqs[i]);
+ i--; //need this because removeSeq removes name from namesOfSeqs
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TreeReader", "readTrees");
+ exit(1);
+ }
+}
+/*****************************************************************/
+//Loads the names file into names and nameMap.
+//  Each record is two whitespace-separated columns: a sequence name, then a comma-separated list of names.
+//  names[first column]            = second column, unchanged.
+//  nameMap[each name in the list] = first column.
+//  With no group file, every list entry except the first is also added to tmap under "Group1".
+//Returns the number of records read. If a first-column name is already a key of nameMap, the names
+//file is disregarded: both maps are cleared, namefile is set to "" and 1 is returned.
+int TreeReader::readNamesFile() {
+    try {
+        nameMap.clear();
+        names.clear();
+        int numUniquesInName = 0;
+
+        ifstream in;
+        m->openInputFile(namefile, in);
+
+        string first, second;
+
+        //Extract in the loop condition so that a failed read ends the loop. Testing eof() before
+        //reading let an empty stream, or a last record with no second column, be processed with
+        //empty or left-over strings.
+        while (in >> first >> second) {
+            m->gobble(in);
+
+            numUniquesInName++;
+
+            if (nameMap.find(first) != nameMap.end()) {
+                m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine();
+                in.close();
+                nameMap.clear();
+                names.clear();
+                namefile = "";
+                return 1;
+            }
+
+            names[first] = second;
+
+            //readTrees looks names up in nameMap so that names listed in the names file are not removed from tmap
+            vector<string> dupNames;
+            m->splitAtComma(second, dupNames);
+
+            for (int i = 0; i < dupNames.size(); i++) {
+                nameMap[dupNames[i]] = first;
+                if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
+            }
+        }
+        in.close();
+
+        return numUniquesInName;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TreeReader", "readNamesFile");
+        exit(1);
+    }
+}
+/***********************************************************************/
+
+
--- /dev/null
+#ifndef Mothur_treereader_h
+#define Mothur_treereader_h
+
+//
+// treereader.h
+// Mothur
+//
+// Created by Sarah Westcott on 4/11/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "mothurout.h"
+#include "tree.h"
+
+//Reads the trees in a tree file, optionally using a group file and a names file.
+//All reading happens in the constructors, which call readTrees().
+class TreeReader {
+
+public:
+
+    TreeReader(string tf);                        //tree file only
+    TreeReader(string tf, string gf);             //tree file and group file
+    TreeReader(string tf, string gf, string nf);  //tree file, group file and names file
+
+    //NOTE(review): nothing is released here; who frees tmap and the Tree objects is not visible in this class - confirm
+    ~TreeReader() {}
+
+    //copy of the vector of tree pointers filled in by readTrees()
+    vector<Tree*> getTrees() {
+        return trees;
+    }
+
+    //dups -> unique
+    map<string, string> getNames() {
+        return nameMap;
+    }
+
+    //unique -> dups list
+    map<string, string> getNameMap() {
+        return names;
+    }
+
+
+private:
+    MothurOut* m;
+    vector<Tree*> trees;
+    TreeMap* tmap;
+    map<string, string> nameMap;  //dupName -> uniqueName
+    map<string, string> names;    //uniqueName -> comma-separated list read from the names file
+
+    //kept in this order: the constructors' initializer lists name them in the same order
+    string treefile;
+    string groupfile;
+    string namefile;
+
+    bool readTrees();
+    int readNamesFile();
+};
+
+
+
+#endif
//The sum_of_squares, havel_hakimi and calc_c_score algorithms have been adapted from I. Miklos and J. Podani. 2004. Randomization of presence-absence matrices: comments and new algorithms. Ecology 85:86-92.
-/**************************************************************************************************
-int TrialSwap2::intrand(int n){
- try {
- double z;
-
- z = (double)random() * (double)n / (double)RAND_MAX;
- if(z>=n)
- z=n-1;
- if(z<0)
- z=0;
- return((int)floor(z));
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "intrand");
- exit(1);
- }
-}
-/**************************************************************************************************/
-/* completely random matrix, all column and row totals are variable, matrix size is the same
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim1(vector<vector<int> > &co_matrix){
- try {
- vector<int> randRow;
- vector<vector<int> > tmpmatrix;
- int nrows = co_matrix.size();
- int ncols = co_matrix[0].size();
-
- //clear co_matrix
- // for(i=0;i<nrows;i++)
- // {
- // co_matrix.clear();
- // }
-
- //cout << "building matrix" << endl;
- for(int i=0;i<nrows;i++){
- if (m->control_pressed) { break; }
-
- for(int j=0;j<ncols;j++){
- double randNum = rand() / double(RAND_MAX);
- //cout << randNum << endl;
-
- if(randNum > 0.5) {
- randRow.push_back(1);
- }else{
- randRow.push_back(0);
- }
- }
- tmpmatrix.push_back(randRow);
- randRow.clear();
- //cout << endl;
- }
- co_matrix = tmpmatrix;
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim1");
- exit(1);
- }
-}
-/**************************************************************************************************/
-/*
- *row sums fixed, columns equiprobable
- */
-void TrialSwap2::sim2(vector<vector<int> > &co_matrix)
-{
- try {
-
- for(int i=0;i<co_matrix.size();i++)
- {
- if (m->control_pressed) { break; }
- random_shuffle( co_matrix[i].begin(), co_matrix[i].end() );
- }
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim2");
- exit(1);
- }
-}
-/**************************************************************************************************/
-int TrialSwap2::sim2plus(vector<int> rowtotal, vector<vector<int> > &co_matrix)
-{
- try {
- int nrows = co_matrix.size();
- int ncols = co_matrix[0].size();
- double cellprob = 1.0/ncols;
- vector<double> cellprobvec;
- vector<int> tmprow;
- vector<vector<int> > tmpmatrix;
- //double randNum;
-
- double start = 0.0;
-
- for(int i=0; i<ncols; i++)
- {
- if (m->control_pressed) { return 0; }
- cellprobvec.push_back(start + cellprob);
- start = cellprobvec[i];
- }
-
- for(int i=0; i<nrows; i++)
- {
- tmprow.assign(ncols, 0);
-
- while( accumulate( tmprow.begin(), tmprow.end(), 0 ) < rowtotal[i])
- {
- if (m->control_pressed) { return 0; }
- double randNum = rand() / double(RAND_MAX);
- //cout << randNum << endl;
- if(randNum <= cellprobvec[0])
- {
- tmprow[0] = 1;
- continue;
- }
- for(int j=1;j<ncols;j++)
- {
- //cout << range[j] << endl;
- if(randNum <= cellprobvec[j] && randNum > cellprobvec[j-1] && tmprow[j] != 1)
- {
- tmprow[j] = 1;
- }
- }
- }
- tmpmatrix.push_back(tmprow);
- tmprow.clear();
- }
- co_matrix = tmpmatrix;
- tmpmatrix.clear();
- cellprobvec.clear();
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim2plus");
- exit(1);
- }
-}
-/**************************************************************************************************/
-/*
- * same as sim2 but using initmatrix which is the initial co-occurrence matrix before transposition
- * may have to be changed depending on what matrix 'seed' is used. One way to use is to transpose
- * every null matrix before using an index and use the random matrix as a seed for the next null.
- */
-/**************************************************************************************************/
-void TrialSwap2::sim3(vector<vector<int> > &initmatrix)
-{
- try {
- for(int i=0;i<initmatrix.size();i++)
- {
- if (m->control_pressed) { break; }
- random_shuffle( initmatrix[i].begin(), initmatrix[i].end() );
- }
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim3");
- exit(1);
- }
-}
-/**************************************************************************************************/
-/*
- *
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim4(vector<int> columntotal, vector<int> rowtotal, vector<vector<int> > &co_matrix)
-{
- try {
- vector<double> colProb;
- vector<int> tmprow;//(ncols, 7);
- vector<vector<int> > tmpmatrix;
- vector<double> range;
- vector<double> randNums;
- int ncols = columntotal.size();
- int nrows = rowtotal.size();
- tmprow.clear();
-
- double colSum = accumulate( columntotal.begin(), columntotal.end(), 0 );
- //cout << "col sum: " << colSum << endl;
- for(int i=0;i<ncols;i++)
- {
- if (m->control_pressed) { return 0; }
- colProb.push_back(columntotal[i]/colSum);
- }
-
- double start = 0.0;
-
- for(int i=0;i<ncols;i++)
- {
- if (m->control_pressed) { return 0; }
- range.push_back(start + colProb[i]);
- start = range[i];
- }
-
- for(int i=0;i<nrows;i++)
- {
- tmprow.assign(ncols, 0);
- if (m->control_pressed) { return 0; }
-
- while ( accumulate( tmprow.begin(), tmprow.end(), 0 ) < rowtotal[i])
- {
- if (m->control_pressed) { return 0; }
-
- double randNum = rand() / double(RAND_MAX);
- if(randNum <= range[0])
- {
- tmprow[0] = 1;
- continue;
- }
- for(int j=1;j<ncols;j++)
- {
- if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
- {
- tmprow[j] = 1;
- }
-
- }
- }
- tmpmatrix.push_back(tmprow);
- tmprow.clear();
- }
-
- co_matrix = tmpmatrix;
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim4");
- exit(1);
- }
-}
-/**************************************************************************************************/
-/*
- * inverse of sim4, MUST BE TRANSPOSED BEFORE CO-OCCURRENCE ANALYSIS
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim5(vector<int> initcolumntotal,vector<int> initrowtotal, vector<vector<int> > &initmatrix)
-{
- try {
- vector<double> colProb;
- vector<int> tmprow;//(ncols, 7);
- vector<vector<int> > tmpmatrix;
- vector<double> range;
- vector<double> randNums;
- int ncols = initcolumntotal.size();
- int nrows = initrowtotal.size();
-
- tmprow.clear();
-
- double colSum = accumulate( initcolumntotal.begin(), initcolumntotal.end(), 0 );
- //cout << "col sum: " << colSum << endl;
- for(int i=0;i<ncols;i++)
- {
- if (m->control_pressed) { return 0; }
- colProb.push_back(initcolumntotal[i]/colSum);
- }
-
- double start = 0.0;
-
- for(int i=0;i<ncols;i++)
- {
- if (m->control_pressed) { return 0; }
- range.push_back(start + colProb[i]);
- start = range[i];
- }
-
- for(int i=0;i<nrows;i++)
- {
- tmprow.assign(ncols, 0);
- if (m->control_pressed) { return 0; }
-
- while ( accumulate( tmprow.begin(), tmprow.end(), 0 ) < initrowtotal[i])
- {
- if (m->control_pressed) { return 0; }
-
- double randNum = rand() / double(RAND_MAX);
- if(randNum <= range[0])
- {
- tmprow[0] = 1;
- continue;
- }
- for(int j=1;j<ncols;j++)
- {
- if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
- {
- tmprow[j] = 1;
- }
-
- }
- }
- tmpmatrix.push_back(tmprow);
- tmprow.clear();
- }
-
- initmatrix = tmpmatrix;
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim5");
- exit(1);
- }
-}
-/**************************************************************************************************/
-/*
- *
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim6(vector<int> columntotal, vector<vector<int> > &co_matrix)
-{
- try {
- vector<vector<int> > tmpmatrix;
- vector<double> colProb;
- vector<int> tmprow;
- vector<double> range;
- int ncols = columntotal.size();
- int nrows = co_matrix.size();
-
- int colSum = accumulate( columntotal.begin(), columntotal.end(), 0 );
-
- for(int i=0;i<ncols;i++)
- {
- if (m->control_pressed) { return 0; }
- colProb.push_back(columntotal[i]/double (colSum));
- }
-
- double start = 0.0;
-
- for(int i=0;i<ncols;i++)
- {
- if (m->control_pressed) { return 0; }
- range.push_back(start + colProb[i]);
- start = range[i];
- }
-
- for(int i=0;i<nrows;i++)
- {
- if (m->control_pressed) { return 0; }
- tmprow.assign(ncols, 0);
- int tmprowtotal;
- tmprowtotal = (rand() / double (RAND_MAX)) * 10;
- while ( tmprowtotal > ncols) {
- if (m->control_pressed) { return 0; }
- tmprowtotal = (rand() / double (RAND_MAX)) * 10;
- }
- //cout << tmprowtotal << endl;
- //cout << accumulate( tmprow.begin(), tmprow.end(), 0 ) << endl;
-
- while ( accumulate( tmprow.begin(), tmprow.end(), 0 ) < tmprowtotal)
- {
- if (m->control_pressed) { return 0; }
- double randNum = rand() / double(RAND_MAX);
- //cout << randNum << endl;
- if(randNum <= range[0])
- {
- tmprow[0] = 1;
- continue;
- }
- for(int j=1;j<ncols;j++)
- {
- //cout << range[j] << endl;
- if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
- {
- tmprow[j] = 1;
- }
-
- }
-
-
- }
-
- tmpmatrix.push_back(tmprow);
- tmprow.clear();
- }
-
- co_matrix = tmpmatrix;
- tmpmatrix.clear();
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim6");
- exit(1);
- }
-}
-/**************************************************************************************************/
-/*
- * MUST BE TRANSPOSED BEFORE CO-OCCURRENCE ANALYSIS
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim7(vector<int> initrowtotal, vector<vector<int> > &co_matrix)
-{
- try {
- vector<vector<double> > probmatrix;
- vector<vector<int> > tmpmatrix;
- vector<double> colProb;
- vector<double> probrow;
- vector<int> tmprow;
- vector<double> range;
- double nc;
- int ncols = co_matrix[0].size(); int nrows = co_matrix.size();
-
- tmpmatrix.assign(nrows, vector<int>(ncols, 0.));
-
- int rowsum = accumulate( initrowtotal.begin(), initrowtotal.end(), 0 );
-
- nc = rowsum * ncols;
- //cout << nc << endl;
-
- //assign null matrix based on probabilities
-
- double start = 0.0; // don't reset start -- probs should be from 0-1 thoughout the entire matrix
-
- for(int i=0;i<nrows;i++)
- {
- if (m->control_pressed) { return 0; }
- //cout << initrowtotal[i]/double(nc) << endl;
- double cellprob = initrowtotal[i]/double(nc);
- //cout << cellprob << endl;
- for(int j=0;j<ncols;j++)
- {
-
- probrow.push_back(start + cellprob);
- //cout << probrow[j] << endl;
- //cout << start << endl;
- start = start + cellprob;
- }
- probmatrix.push_back(probrow);
- probrow.clear();
- }
-
-
- //while(tmprowsum < rowsum)
- //for(int k=0;k<rowsum;k++)
- int k = 0;
- while(k < rowsum)
- {
- if (m->control_pressed) { return 0; }
- done:
- //cout << k << endl;
- //tmprowsum = accumulate( tmprowtotal.begin(), tmprowtotal.end(), 0 );
- double randNum = rand() / double(RAND_MAX);
- //cout << randNum << "+" << endl;
- //special case for the first entry
- if(randNum <= probmatrix[0][0] && tmpmatrix[0][0] != 1)
- {
- tmpmatrix[0][0] = 1;
- k++;
- //cout << k << endl;
- continue;
- }
-
-
- for(int i=0;i<nrows;i++)
- {
- if (m->control_pressed) { return 0; }
- for(int j=0;j<ncols;j++)
- {
- //cout << probmatrix[i][j] << endl;
- if(randNum <= probmatrix[i][j] && randNum > probmatrix[i][j-1] && tmpmatrix[i][j] != 1)
- {
- tmpmatrix[i][j] = 1;
- k++;
- //cout << k << endl;
- goto done;
- }
- //else
- //k = k-1;
- }
-
- }
-
- }
-
- co_matrix = tmpmatrix;
- return 0;
- //build probibility matrix
- /* for(int i=0;i<nrows;i++)
- {
- for(int j=0;j<ncols;j++)
- {
- probrow.push_back(rowtotal[i]/nc);
- }
- probmatrix.pushback(probrow);
- probrow.clear;
- }
- */
-
- /* int colSum = accumulate( initcolumntotal.begin(), initcolumntotal.end(), 0 );
-
- for(int i=0;i<ncols;i++)
- {
- colProb.push_back(initcolumntotal[i]/double (colSum));
- }
-
- double start = 0.0;
-
- for(int i=0;i<ncols;i++)
- {
- range.push_back(start + colProb[i]);
- start = range[i];
- }
-
- for(int i=0;i<nrows;i++)
- {
- tmprow.assign(ncols, 0);
- int tmprowtotal;
- tmprowtotal = (rand() / double (RAND_MAX)) * 10;
- while ( tmprowtotal > ncols)
- tmprowtotal = (rand() / double (RAND_MAX)) * 10;
- //cout << tmprowtotal << endl;
- //cout << accumulate( tmprow.begin(), tmprow.end(), 0 ) << endl;
-
- while ( accumulate( tmprow.begin(), tmprow.end(), 0 ) < tmprowtotal)
- {
- double randNum = rand() / double(RAND_MAX);
- //cout << randNum << endl;
- if(randNum <= range[0])
- {
- tmprow[0] = 1;
- continue;
- }
- for(int j=1;j<ncols;j++)
- {
- //cout << range[j] << endl;
- if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
- {
- tmprow[j] = 1;
- }
- }
- }
-
- tmpmatrix.push_back(tmprow);
- tmprow.clear();
- }
-
- initmatrix = tmpmatrix;
- */
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim7");
- exit(1);
- }
-}
-/**************************************************************************************************/
-/*
- *
- *
- *
- */
-/**************************************************************************************************/
-int TrialSwap2::sim8(vector<int> columntotal, vector<int> rowtotal, vector<vector<int> > &co_matrix)
-{
- try {
- double prob;
- double start = 0.0;
- int ncols = columntotal.size(); int nrows = rowtotal.size();
- double probarray[nrows * ncols];
- double randnum;
- int grandtotal;
- int total = 0;
-
- //double colSum = accumulate( columntotal.begin(), columntotal.end(), 0 );
- double rowSum = accumulate( rowtotal.begin(), rowtotal.end(), 0 );
-
- if (m->control_pressed) { return 0; }
-
- //cout << "rowsum: " << rowSum << endl;
-
- grandtotal = rowSum;
-
- //create probability matrix with each site being between 0 and 1
- for (int i=0;i<nrows;i++) {
- if (m->control_pressed) { return 0; }
- for (int j=0;j<ncols;j++) {
- prob = (rowtotal[i] * columntotal[j])/(rowSum*rowSum);
- if (prob == 0.0)
- probarray[ncols * i + j] = -1;
- else
- probarray[ncols * i + j] = start + prob;
- //probmatrixrow.pushback(start + prob);
- start += prob;
- }
- }
- //cout << "prbarray" << endl;
- //for(int i=0;i<(nrows*ncols);i++)
- //cout << probarray[i] << " ";
- //cout << endl;
-
- //generate random muber between 0 and 1 and interate through probarray until found
- while (total < grandtotal) {
- if (m->control_pressed) { return 0; }
- randnum = rand() / double(RAND_MAX);
- //cout << "rand num: " << randnum << endl;
- if((randnum <= probarray[0]) && (probarray[0] != 2) ) {
- probarray[0] = 2;
- total++;
- continue;
- }
- for(int i=1;i<(nrows*ncols);i++) {
- if (m->control_pressed) { return 0; }
- if((randnum <= probarray[i]) && (randnum > probarray[i-1]) && (probarray[i] != 2) ) {
- probarray[i] = 2;
- total++;
- break;
- }
- else
- continue;
- }
- }
- //cout << "prbarray" << endl;
- //for(int i=0;i<(nrows*ncols);i++)
- //cout << probarray[i] << " ";
- //cout << endl;
- for(int i=0;i<nrows;i++) {
- if (m->control_pressed) { return 0; }
- for(int j=0;j<ncols;j++) {
- if(probarray[ncols * i + j] == 2)
- co_matrix[i][j] = 1;
- else
- co_matrix[i][j] = 0;
- }
- }
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "sim8");
- exit(1);
- }
-}
-/**************************************************************************************************/
-double TrialSwap2::calc_c_score (vector<vector<int> > &co_matrix,vector<int> rowtotal)
+double TrialSwap2::calc_c_score (vector<vector<int> > &co_matrix, vector<int> rowtotal, int ncols, int nrows)
{
try {
double cscore = 0.0;
double D;
double normcscore = 0.0;
int nonzeros = 0;
- int ncols = co_matrix[0].size(); int nrows = rowtotal.size();
+ //int ncols = co_matrix[0].size(); int nrows = rowtotal.size();
vector<vector<double> > s; s.resize(nrows);
for (int i = 0; i < nrows; i++) { s[i].resize(nrows,0.0); }//only fill half the matrix
-
+
for(int i=0;i<nrows-1;i++)
{
if(maxD != 0)
{
normcscore += D/maxD;
- nonzeros++;
- }
+ nonzeros++;
+ }
}
}
return cscore;
}
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "calc_c_score");
- exit(1);
- }
+ catch(exception& e) {
+ m->errorOut(e, "TrialSwap2", "calc_c_score");
+ exit(1);
+ }
}
/**************************************************************************************************/
-int TrialSwap2::calc_checker (vector<vector<int> > &co_matrix, vector<int> rowtotal)
+int TrialSwap2::calc_checker (vector<vector<int> > &co_matrix, vector<int> rowtotal, int ncols, int nrows)
{
try {
int cunits=0;
//int s[nrows][ncols];
- int ncols = co_matrix[0].size(); int nrows = rowtotal.size();
+ //int ncols = co_matrix[0].size(); int nrows = rowtotal.size();
vector<vector<int> > s; s.resize(nrows);
for (int i = 0; i < nrows; i++) { s[i].resize(nrows,0); }//only fill half the matrix
}
}
- return cunits;
+ return cunits;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrialSwap2", "calc_checker");
+ exit(1);
}
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "calc_checker");
- exit(1);
- }
}
/**************************************************************************************************/
-double TrialSwap2::calc_vratio (vector<int> rowtotal, vector<int> columntotal)
+double TrialSwap2::calc_vratio (int nrows, int ncols, vector<int> rowtotal, vector<int> columntotal)
{
try {
- int nrows = rowtotal.size();
- int ncols = columntotal.size();
+ //int nrows = rowtotal.size();
+ //int ncols = columntotal.size();
int sumCol = accumulate(columntotal.begin(), columntotal.end(), 0 );
- // int sumRow = accumulate(rowtotal.begin(), rowtotal.end(), 0 );
+ // int sumRow = accumulate(rowtotal.begin(), rowtotal.end(), 0 );
double colAvg = (double) sumCol / (double) ncols;
- // double rowAvg = (double) sumRow / (double) nrows;
+ // double rowAvg = (double) sumRow / (double) nrows;
double p = 0.0;
- // double totalRowVar = 0.0;
+ // double totalRowVar = 0.0;
double rowVar = 0.0;
double colVar = 0.0;
if (m->control_pressed) { return 0; }
p = (double) rowtotal[i]/(double) ncols;
rowVar += p * (1.0-p);
- }
+ }
for(int i=0;i<ncols;i++)
{
m->errorOut(e, "TrialSwap2", "calc_vratio");
exit(1);
}
-
+
}
/**************************************************************************************************/
-int TrialSwap2::calc_combo (vector<vector<int> > &initmatrix)
+int TrialSwap2::calc_combo (int nrows, int ncols, vector<vector<int> > &nullmatrix)
{
try {
- int initrows = initmatrix.size();
+ //need to transpose so we can compare rows (row-major order)
+ int tmpnrows = nrows;
+ vector<vector<int> > tmpmatrix;
+
+ vector<int> tmprow;
+ if(!tmpmatrix.empty())
+ tmpmatrix.clear();
+ for (int i=0;i<ncols;i++)
+ {
+ for (int j=0;j<nrows;j++)
+ {
+ tmprow.push_back(nullmatrix[j][i]);
+ }
+
+ tmpmatrix.push_back(tmprow);
+ tmprow.clear();
+ }
+
int unique = 0;
int match = 0;
- int matches = 0;
- for(int i=0;i<initrows;i++)
+ for(int j=0;j<ncols;j++)
{
match = 0;
- for(int j=i+1;j<=initrows;j++)
+ for(int i=j+1;i<=ncols;i++)
{
- if (m->control_pressed) { return 0; }
- if( (initmatrix[i] == initmatrix[j]))
+ //comparing matrix rows
+ if( (tmpmatrix[j] == tmpmatrix[i]))
{
match++;
- matches++;
break;
}
- }
+ }
//on the last iteration of a previously matched row it will add itself because it doesn't match any following rows, so that combination is counted
if (match == 0)
m->errorOut(e, "TrialSwap2", "calc_combo");
exit(1);
}
-}
+}
/**************************************************************************************************/
-int TrialSwap2::swap_checkerboards (vector<vector<int> > &co_matrix)
+int TrialSwap2::swap_checkerboards (vector<vector<int> > &co_matrix, int ncols, int nrows)
{
try {
- int ncols = co_matrix[0].size(); int nrows = co_matrix.size();
+ //int ncols = co_matrix[0].size(); int nrows = co_matrix.size();
int i, j, k, l;
i = m->getRandomIndex(nrows-1);
while((j = m->getRandomIndex(nrows-1) ) == i ) {;if (m->control_pressed) { return 0; }}
k = m->getRandomIndex(ncols-1);
while((l = m->getRandomIndex(ncols-1)) == k ) {;if (m->control_pressed) { return 0; }}
-
- //cout << co_matrix[i][k] << " " << co_matrix[j][l] << endl;
- //cout << co_matrix[i][l] << " " << co_matrix[j][k] << endl;
- //cout << co_matrix[i][l] << " " << co_matrix[j][k] << endl;
- //cout << co_matrix[i][l] << " " << co_matrix[j][k] << endl;
+
if((co_matrix[i][k]*co_matrix[j][l]==1 && co_matrix[i][l]+co_matrix[j][k]==0)||(co_matrix[i][k]+co_matrix[j][l]==0 && co_matrix[i][l]*co_matrix[j][k]==1)) //checking for checkerboard value and swap
{
co_matrix[i][k]=1-co_matrix[i][k];
co_matrix[i][l]=1-co_matrix[i][l];
co_matrix[j][k]=1-co_matrix[j][k];
co_matrix[j][l]=1-co_matrix[j][l];
- //cout << "swapped!" << endl;
+
}
- //cout << "i: " << i << " j: " << j << " k: " << " l: " << l << endl;
+
return 0;
}
catch(exception& e) {
m->mothurOut("nullMean: " + toString(nullMean)); m->mothurOutEndLine();
- m->mothurOut("sum: " + toString(sum)); m->mothurOutEndLine();
+ m->mothurOut("sum: " + toString(sum)); m->mothurOutEndLine();
sampleSD = sqrt( (1/runs) * sum );
- m->mothurOut("samplSD: " + toString(sampleSD)); m->mothurOutEndLine();
+ m->mothurOut("samplSD: " + toString(sampleSD)); m->mothurOutEndLine();
t = (nullMean - initialscore) / (sampleSD / sqrt(runs));
int TrialSwap2::print_matrix(vector<vector<int> > &matrix, int nrows, int ncols)
{
try {
- m->mothurOut("matrix:"); m->mothurOutEndLine();
+ m->mothurOut("matrix:"); m->mothurOutEndLine();
for (int i = 0; i < nrows; i++)
{
if (m->control_pressed) { return 0; }
for (int j = 0; j < ncols; j++)
{
- m->mothurOut(toString(matrix[i][j]));
- }
+ m->mothurOut(toString(matrix[i][j]));
+ }
m->mothurOutEndLine();
}
return 0;
}
}
/**************************************************************************************************/
-int TrialSwap2::transpose_matrix (vector<vector<int> > &initmatrix, vector<vector<int> > &co_matrix)//, int nrows, int nocols)
-{
- try {
- int ncols = initmatrix.size(); int nrows = initmatrix[0].size();
- int tmpnrows = nrows;
- //vector<vector<int> > tmpvec;
- vector<int> tmprow;
- if(!co_matrix.empty())
- co_matrix.clear();
- for (int i=0;i<nrows;i++)
- {
- if (m->control_pressed) { return 0; }
- for (int j=0;j<ncols;j++)
- {
- tmprow.push_back(initmatrix[j][i]);
- }
- /*if (accumulate( tmprow.begin(), tmprow.end(), 0 ) == 0)
- {
- tmpnrows--;
- }
- else */
- co_matrix.push_back(tmprow);
- tmprow.clear();
- }
- nrows = tmpnrows;
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "transpose_matrix");
- exit(1);
- }
-}
-/**************************************************************************************************/
-int TrialSwap2::update_row_col_totals(vector<vector<int> > &co_matrix, vector<int> &rowtotal, vector<int> &columntotal)
-{
- try {
- //rowtotal.clear();
- //columntotal.clear();
- //generate (rowtotal.begin(), rowtotal.end(), 0);
- //generate (columntotal.begin(), columntotal.end(), 0);
- int nrows = co_matrix.size();
- int ncols = co_matrix[0].size();
- vector<int> tmpcolumntotal; tmpcolumntotal.resize(ncols, 0);
- vector<int> tmprowtotal; tmprowtotal.resize(nrows, 0);
-
- int rowcount = 0;
-
- for (int i = 0; i < nrows; i++)
- {
- if (m->control_pressed) { return 0; }
- for (int j = 0; j < ncols; j++)
- {
- if (co_matrix[i][j] == 1)
- {
- rowcount++;
- tmpcolumntotal[j]++;
- }
- }
- tmprowtotal[i] = rowcount;
- rowcount = 0;
- }
- columntotal = tmpcolumntotal;
- rowtotal = tmprowtotal;
- /*cout << "rowtotal: ";
- for(int i = 0; i<nrows; i++) { cout << rowtotal[i]; }
- cout << " ";
- cout << " coltotal: ";
- for(int i = 0; i<ncols; i++) { cout << columntotal[i]; }
- cout << endl;*/
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "TrialSwap2", "update_row_col_totals");
- exit(1);
- }
-}
-/**************************************************************************************************/
class TrialSwap2 {
public:
- TrialSwap2(){ m = MothurOut::getInstance(); };
+ TrialSwap2(){ m = MothurOut::getInstance(); };
~TrialSwap2(){};
double calc_pvalue_lessthan (vector<double>, double);
double calc_pvalue_greaterthan (vector<double>, double);
- int swap_checkerboards (vector<vector<int> > &);
- int calc_combo (vector<vector<int> > &);
- double calc_vratio (vector<int>, vector<int>);
- int calc_checker (vector<vector<int> > &,vector<int>);
- double calc_c_score (vector<vector<int> > &,vector<int>);
-
- int sim1 (vector<vector<int> > &);
- void sim2(vector<vector<int> >&);
- int sim2plus(vector<int>, vector<vector<int> > &);
- void sim3(vector<vector<int> > &);
- int sim4(vector<int>, vector<int>, vector<vector<int> > &);
- int sim5(vector<int>, vector<int>, vector<vector<int> > &);
- int sim6(vector<int>, vector<vector<int> > &);
- int sim7(vector<int>, vector<vector<int> > &);
- int sim8(vector<int>, vector<int>, vector<vector<int> > &);
- int transpose_matrix (vector<vector<int> > &, vector<vector<int> > &);
- int update_row_col_totals(vector<vector<int> > &, vector<int>&, vector<int>&);
-
+ int swap_checkerboards (vector<vector<int> > &, int, int); //(co_matrix, ncols, nrows)
+ int calc_combo (int, int, vector<vector<int> > &); //(nrows, ncols, nullmatrix)
+ double calc_vratio (int, int, vector<int>, vector<int>); //(nrows, ncols, rowtotal, columntotal)
+ int calc_checker (vector<vector<int> > &, vector<int>, int, int); //(co_matrix, rowtotal, ncols, nrows)
+ double calc_c_score (vector<vector<int> > &, vector<int>, int, int); //(co_matrix, rowtotal, ncols, nrows)
+ //NOTE: the two int dimensions are (nrows, ncols) in calc_combo and calc_vratio but (ncols, nrows) in the other three; both are plain ints, so a swapped call still compiles
private:
MothurOut* m;
int print_matrix(vector<vector<int> > &, int, int);
-
+
};
-
#endif
Sequence currSeq(in); m->gobble(in);
out << currSeq.getName() << '\t' << it->second << endl;
+
+ if (nameFile != "") {
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) {
+ vector<string> thisSeqsNames;
+ m->splitAtChar(itName->second, thisSeqsNames, ',');
+ for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+ out << thisSeqsNames[k] << '\t' << it->second << endl;
+ }
+ }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ }
}
in.close();
out.close();
cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
}
-
- if(qfilename == "") { qLines = lines; } //files with duds
}
+ if(qfilename == "") { qLines = lines; } //files with duds
return 1;
#endif
--- /dev/null
+#if UCHIMES\r
+\r
+#include "myutils.h"\r
+#include "chime.h"\r
+#include "ultra.h"\r
+#include <set>\r
+\r
+const float MAX_WORD_COUNT_DROP = 1;\r
+\r
+void SortDescending(const vector<float> &Values, vector<unsigned> &Order);\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path);\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path);\r
+void USort(const SeqData &Query, const SeqDB &DB, vector<float> &WordCounts,\r
+ vector<unsigned> &Order);\r
+\r
+// Inserts into TargetIndexes the index of every DB sequence whose word count\r
+// (as computed by USort for Query) is within MAX_WORD_COUNT_DROP of the count\r
+// of the first sequence in USort's ordering. Does nothing when DB is empty.\r
+// NOTE(review): presumably USort returns Order sorted by descending word\r
+// count, which is what makes stopping at the first large drop valid -- confirm.\r
+void AddTargets(SeqDB &DB, const SeqData &Query, set<unsigned> &TargetIndexes)\r
+	{\r
+	const unsigned SeqCount = DB.GetSeqCount();\r
+	if (SeqCount == 0)\r
+		return;\r
+\r
+	vector<float> Counts;\r
+	vector<unsigned> Order;\r
+	USort(Query, DB, Counts, Order);\r
+	asserta(SIZE(Order) == SeqCount);\r
+\r
+// Reference count: that of the first-ranked sequence.\r
+	const float BestCount = Counts[Order[0]];\r
+	for (unsigned Rank = 0; Rank < SeqCount; ++Rank)\r
+		{\r
+		const unsigned Candidate = Order[Rank];\r
+		const float Drop = BestCount - Counts[Candidate];\r
+		if (Drop > MAX_WORD_COUNT_DROP)\r
+			break;\r
+		TargetIndexes.insert(Candidate);\r
+		}\r
+	}\r
+\r
+#endif\r
--- /dev/null
+#include "myutils.h"\r
+#include "seq.h"\r
+#include "chime.h"\r
+#include "dp.h"\r
+\r
+#define TRACE 0\r
+#define TRACE_BS 0\r
+\r
+void Make3Way(const SeqData &SDQ, const SeqData &SDA, const SeqData &SDB,\r
+ const string &PathQA, const string &PathQB,\r
+ string &Q3, string &A3, string &B3);\r
+\r
+void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3,\r
+ const string &QLabel, const string &ALabel, const string &BLabel,\r
+ ChimeHit2 &Hit);\r
+\r
+// Vote-based score: Y / (opt_xn*(N + opt_dn) + opt_xa*A), where Y, N and A\r
+// are the counts passed by the caller (yes, no and abstain votes).\r
+double GetScore2(double Y, double N, double A)\r
+	{\r
+	const double Denominator = opt_xn*(N + opt_dn) + opt_xa*A;\r
+	return Y/Denominator;\r
+	}\r
+\r
+// Scores a three-way alignment (query Q3, candidate parents A3 and B3, all of\r
+// equal length) for a chimeric crossover and stores the result in Hit.\r
+//\r
+// Steps:\r
+//   1. Clear Hit and set its query label.\r
+//   2. Find ColLo..ColHi, the first and last columns (ignoring two columns at\r
+//      each end) in which all three sequences satisfy isacgt().\r
+//   3. Build running per-column counts of identities and votes.\r
+//   4. For every candidate crossover column compute GetScore2-based scores for\r
+//      the models "A then B" and "B then A"; keep the best-scoring column range.\r
+//   5. Within that range use the longest run of columns identical in all three\r
+//      sequences as the crossover region (default: the midpoint column).\r
+//   6. Fill Hit (crossover coordinates, identities, labels, CS_* vote counts,\r
+//      Score) and, if g_fUChimeAlns is open and Hit.Div > 0, write the hit.\r
+//\r
+// Returns early, leaving Hit cleared apart from QLabel, when no all-acgt column\r
+// exists or no crossover scores above zero.\r
+void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3,\r
+  const string &QLabel, const string &ALabel, const string &BLabel,\r
+  ChimeHit2 &Hit)\r
+	{\r
+	Hit.Clear();\r
+	Hit.QLabel = QLabel;\r
+\r
+	const byte *Q3Seq = (const byte *) Q3.c_str();\r
+	const byte *A3Seq = (const byte *) A3.c_str();\r
+	const byte *B3Seq = (const byte *) B3.c_str();\r
+\r
+	const unsigned ColCount = SIZE(Q3);\r
+	asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);\r
+\r
+#if	TRACE\r
+	Log("Q %5u %*.*s\n", ColCount, ColCount, ColCount, Q3Seq);\r
+	Log("A %5u %*.*s\n", ColCount, ColCount, ColCount, A3Seq);\r
+	Log("B %5u %*.*s\n", ColCount, ColCount, ColCount, B3Seq);\r
+#endif\r
+\r
+// Discard terminal gaps\r
+	unsigned ColLo = UINT_MAX;\r
+	unsigned ColHi = UINT_MAX;\r
+	for (unsigned Col = 2; Col + 2 < ColCount; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+\r
+		if (isacgt(q) && isacgt(a) && isacgt(b))\r
+			{\r
+			if (ColLo == UINT_MAX)\r
+				ColLo = Col;\r
+			ColHi = Col;\r
+			}\r
+		}\r
+\r
+	if (ColLo == UINT_MAX)\r
+		return;\r
+\r
+	unsigned QPos = 0;\r
+	unsigned APos = 0;\r
+	unsigned BPos = 0;\r
+	unsigned DiffCount = 0;\r
+\r
+// The Accum* vectors are indexed by column. Entries below ColLo are padded\r
+// with UINT_MAX so that the push_backs in the loop below land at index Col.\r
+	vector<unsigned> ColToQPos(ColLo, UINT_MAX);\r
+	vector<unsigned> AccumCount(ColLo, UINT_MAX);\r
+	vector<unsigned> AccumSameA(ColLo, UINT_MAX);\r
+	vector<unsigned> AccumSameB(ColLo, UINT_MAX);\r
+	vector<unsigned> AccumForA(ColLo, UINT_MAX);\r
+	vector<unsigned> AccumForB(ColLo, UINT_MAX);\r
+	vector<unsigned> AccumAbstain(ColLo, UINT_MAX);\r
+	vector<unsigned> AccumAgainst(ColLo, UINT_MAX);\r
+\r
+	unsigned SumSameA = 0;\r
+	unsigned SumSameB = 0;\r
+	unsigned SumSameAB = 0;\r
+	unsigned Sum = 0;\r
+	unsigned SumForA = 0;\r
+	unsigned SumForB = 0;\r
+	unsigned SumAbstain = 0;\r
+	unsigned SumAgainst = 0;\r
+	for (unsigned Col = ColLo; Col <= ColHi; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+\r
+		if (isacgt(q) && isacgt(a) && isacgt(b))\r
+			{\r
+			if (q == a)\r
+				++SumSameA;\r
+			if (q == b)\r
+				++SumSameB;\r
+			if (a == b)\r
+				++SumSameAB;\r
+			if (q == a && q != b)\r
+				++SumForA;\r
+			if (q == b && q != a)\r
+				++SumForB;\r
+			if (a == b && q != a)\r
+				++SumAgainst;\r
+			if (q != a && q != b)\r
+				++SumAbstain;\r
+			++Sum;\r
+			}\r
+\r
+		ColToQPos.push_back(QPos);\r
+		AccumSameA.push_back(SumSameA);\r
+		AccumSameB.push_back(SumSameB);\r
+		AccumCount.push_back(Sum);\r
+		AccumForA.push_back(SumForA);\r
+		AccumForB.push_back(SumForB);\r
+		AccumAbstain.push_back(SumAbstain);\r
+		AccumAgainst.push_back(SumAgainst);\r
+\r
+		if (q != '-')\r
+			++QPos;\r
+		if (a != '-')\r
+			++APos;\r
+		if (b != '-')\r
+			++BPos;\r
+		}\r
+\r
+	asserta(SIZE(ColToQPos) == ColHi+1);\r
+	asserta(SIZE(AccumSameA) == ColHi+1);\r
+	asserta(SIZE(AccumSameB) == ColHi+1);\r
+	asserta(SIZE(AccumAbstain) == ColHi+1);\r
+	asserta(SIZE(AccumAgainst) == ColHi+1);\r
+\r
+// Sum >= 1 here because ColLo is itself an all-acgt column.\r
+	double IdQA = double(SumSameA)/Sum;\r
+	double IdQB = double(SumSameB)/Sum;\r
+	double IdAB = double(SumSameAB)/Sum;\r
+	double MaxId = max(IdQA, IdQB);\r
+\r
+#if	TRACE\r
+	Log("IdQA=%.1f%% IdQB=%.1f%% IdAB=%.1f\n", IdQA*100.0, IdQB*100.0, IdAB*100.0);\r
+	Log("\n");\r
+	Log("    x  AQB   IdAL   IdBL   IdAR   IdBR   DivAB   DivBA    YAL    YBL    YAR    YBR    AbL    AbR  ScoreAB  ScoreAB    XLo    Xhi\n");\r
+	Log("-----  ---  -----  -----  -----  -----  ------  ------  -----  -----  -----  -----  -----  -----  -------  -------  -----  -----\n");\r
+#endif\r
+	unsigned BestXLo = UINT_MAX;\r
+	unsigned BestXHi = UINT_MAX;\r
+	double BestDiv = 0.0;\r
+	double BestIdQM = 0.0;\r
+	double BestScore = 0.0;\r
+\r
+// Find range of cols BestXLo..BestXHi that maximizes score\r
+	bool FirstA = false;\r
+\r
+// NOTE: Must be < ColHi not <= because use Col+1 below\r
+	for (unsigned Col = ColLo; Col < ColHi; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+\r
+		unsigned SameAL = AccumSameA[Col];\r
+		unsigned SameBL = AccumSameB[Col];\r
+		unsigned SameAR = SumSameA - AccumSameA[Col];\r
+		unsigned SameBR = SumSameB - AccumSameB[Col];\r
+\r
+// These shadow the whole-alignment IdAB above: here they are the identities of\r
+// the two chimeric models (A left + B right, and B left + A right).\r
+		double IdAB = double(SameAL + SameBR)/Sum;\r
+		double IdBA = double(SameBL + SameAR)/Sum;\r
+\r
+		unsigned ForAL = AccumForA[Col];\r
+		unsigned ForBL = AccumForB[Col];\r
+		unsigned ForAR = SumForA - AccumForA[Col+1];\r
+		unsigned ForBR = SumForB - AccumForB[Col+1];\r
+		unsigned AbL = AccumAbstain[Col];\r
+		unsigned AbR = SumAbstain - AccumAbstain[Col+1];\r
+\r
+		double ScoreAB = GetScore2(ForAL, ForBL, AbL)*GetScore2(ForBR, ForAR, AbR);\r
+		double ScoreBA = GetScore2(ForBL, ForAL, AbL)*GetScore2(ForAR, ForBR, AbR);\r
+	\r
+		double DivAB = IdAB/MaxId;\r
+		double DivBA = IdBA/MaxId;\r
+		double MaxDiv = max(DivAB, DivBA);\r
+\r
+		//if (MaxDiv > BestDiv)\r
+		//	{\r
+		//	BestDiv = MaxDiv;\r
+		//	BestXLo = Col;\r
+		//	BestXHi = Col;\r
+		//	FirstA = (DivAB > DivBA);\r
+		//	if (FirstA)\r
+		//		BestIdQM = IdAB;\r
+		//	else\r
+		//		BestIdQM = IdBA;\r
+		//	}\r
+		//else if (MaxDiv == BestDiv)\r
+		//	BestXHi = Col;\r
+\r
+		double MaxScore = max(ScoreAB, ScoreBA);\r
+		if (MaxScore > BestScore)\r
+			{\r
+			BestScore = MaxScore;\r
+			BestXLo = Col;\r
+			BestXHi = Col;\r
+			FirstA = (ScoreAB > ScoreBA);\r
+			if (FirstA)\r
+				BestIdQM = IdAB;\r
+			else\r
+				BestIdQM = IdBA;\r
+			if (MaxDiv > BestDiv)\r
+				BestDiv = MaxDiv;\r
+			}\r
+		else if (MaxScore == BestScore)\r
+			{\r
+			BestXHi = Col;\r
+			if (MaxDiv > BestDiv)\r
+				BestDiv = MaxDiv;\r
+			}\r
+\r
+#if	TRACE\r
+		{\r
+		Log("%5u", Col);\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+		Log("  %c%c%c", a, q, b);\r
+		Log("  %5u", SameAL);\r
+		Log("  %5u", SameBL);\r
+		Log("  %5u", SameAR);\r
+		Log("  %5u", SameBR);\r
+		Log("  %5.4f", DivAB);\r
+		Log("  %5.4f", DivBA);\r
+		Log("  %5u", ForAL);\r
+		Log("  %5u", ForBL);\r
+		Log("  %5u", ForAR);\r
+		Log("  %5u", ForBR);\r
+		Log("  %5u", AbL);\r
+		Log("  %5u", AbR);\r
+		Log("  %7.4f", ScoreAB);\r
+		Log("  %7.4f", ScoreBA);\r
+		if (BestXLo != UINT_MAX)\r
+			Log("  %5u", BestXLo);\r
+		if (BestXHi != UINT_MAX)\r
+			Log("  %5u", BestXHi);\r
+		Log("\n");\r
+		}\r
+#endif\r
+		}\r
+\r
+	if (BestXLo == UINT_MAX)\r
+		{\r
+#if	TRACE\r
+		Log("\n");\r
+		Log("No crossover found.\n");\r
+#endif\r
+		return;\r
+		}\r
+#if	TRACE\r
+	Log("BestX col %u - %u\n", BestXLo, BestXHi);\r
+#endif\r
+\r
+// Find maximum region of identity within BestXLo..BestXHi\r
+// (with no open segment, SegHi - SegLo + 1 evaluates to 1 via unsigned\r
+// wrap-around, so it never beats the one-column default).\r
+	unsigned ColXLo = (BestXLo + BestXHi)/2;\r
+	unsigned ColXHi = ColXLo;\r
+	unsigned SegLo = UINT_MAX;\r
+	unsigned SegHi = UINT_MAX;\r
+	for (unsigned Col = BestXLo; Col <= BestXHi; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+\r
+		if (q == a && q == b)\r
+			{\r
+			if (SegLo == UINT_MAX)\r
+				SegLo = Col;\r
+			SegHi = Col;\r
+			}\r
+		else\r
+			{\r
+			unsigned SegLength = SegHi - SegLo + 1;\r
+			unsigned BestSegLength = ColXHi - ColXLo + 1;\r
+			if (SegLength > BestSegLength)\r
+				{\r
+				ColXLo = SegLo;\r
+				ColXHi = SegHi;\r
+				}\r
+			SegLo = UINT_MAX;\r
+			SegHi = UINT_MAX;\r
+			}\r
+		}\r
+	unsigned SegLength = SegHi - SegLo + 1;\r
+	unsigned BestSegLength = ColXHi - ColXLo + 1;\r
+	if (SegLength > BestSegLength)\r
+		{\r
+		ColXLo = SegLo;\r
+		ColXHi = SegHi;\r
+		}\r
+\r
+// Convert the crossover columns to ungapped query positions.\r
+// The two tests are independent (not if / else if): when the crossover is a\r
+// single column (ColXLo == ColXHi) both coordinates are set on the same\r
+// iteration. With else-if, QXHi was never assigned in that case.\r
+	QPos = 0;\r
+	for (unsigned x = 0; x < ColCount; ++x)\r
+		{\r
+		if (x == ColXLo)\r
+			Hit.QXLo = QPos;\r
+		if (x == ColXHi)\r
+			{\r
+			Hit.QXHi = QPos;\r
+			break;\r
+			}\r
+		char q = Q3Seq[x];\r
+		if (q != '-')\r
+			++QPos;\r
+		}\r
+\r
+	Hit.ColXLo = ColXLo;\r
+	Hit.ColXHi = ColXHi;\r
+\r
+	//if (FirstA)\r
+	//	{\r
+	//	Hit.LY = AccumForA[ColXLo];\r
+	//	Hit.LN = AccumForB[ColXLo];\r
+\r
+	//	Hit.RY = SumForB - AccumForB[ColXHi];\r
+	//	Hit.RN = SumForA - AccumForA[ColXHi];\r
+	//	}\r
+	//else\r
+	//	{\r
+	//	Hit.LY = AccumForB[ColXLo];\r
+	//	Hit.LN = AccumForA[ColXLo];\r
+	//	Hit.RY = SumForA - AccumForA[ColXHi];\r
+	//	Hit.RN = SumForB - AccumForB[ColXHi];\r
+	//	}\r
+\r
+	//Hit.LA = AccumAgainst[ColXLo];\r
+	//Hit.LD = AccumAbstain[ColXLo];\r
+\r
+	//Hit.RA = SumAgainst - AccumAgainst[ColXHi];\r
+	//Hit.RD = SumAbstain - AccumAbstain[ColXHi];\r
+\r
+// IdAB here is the function-scope value (whole-alignment A/B identity).\r
+	Hit.PctIdAB = IdAB*100.0;\r
+	Hit.PctIdQM = BestIdQM*100.0;\r
+\r
+	Hit.Div = (BestDiv - 1.0)*100.0;\r
+\r
+	//Hit.QSD = QSD;\r
+	Hit.Q3 = Q3;\r
+	Hit.QLabel = QLabel;\r
+// Report the parents so that Hit's "A" is the left-hand parent of the model.\r
+	if (FirstA)\r
+		{\r
+		//Hit.ASD = ASD;\r
+		//Hit.BSD = BSD;\r
+		//Hit.PathQA = PathQA;\r
+		//Hit.PathQB = PathQB;\r
+		Hit.A3 = A3;\r
+		Hit.B3 = B3;\r
+		Hit.ALabel = ALabel;\r
+		Hit.BLabel = BLabel;\r
+		Hit.PctIdQA = IdQA*100.0;\r
+		Hit.PctIdQB = IdQB*100.0;\r
+		}\r
+	else\r
+		{\r
+		Hit.A3 = B3;\r
+		Hit.B3 = A3;\r
+		Hit.ALabel = BLabel;\r
+		Hit.BLabel = ALabel;\r
+		Hit.PctIdQA = IdQB*100.0;\r
+		Hit.PctIdQB = IdQA*100.0;\r
+		}\r
+\r
+// CS SNPs\r
+	Hit.CS_LY = 0;\r
+	Hit.CS_LN = 0;\r
+	Hit.CS_RY = 0;\r
+	Hit.CS_RN = 0;\r
+	Hit.CS_LA = 0;\r
+	Hit.CS_RA = 0;\r
+\r
+	//vector<float> Cons;\r
+	//for (unsigned Col = 0; Col < ColCount; ++Col)\r
+	//	{\r
+	//	char q = Q3Seq[Col];\r
+	//	char a = A3Seq[Col];\r
+	//	char b = B3Seq[Col];\r
+	//	if (q == a && q == b && a == b)\r
+	//		{\r
+	//		Cons.push_back(1.0f);\r
+	//		continue;\r
+	//		}\r
+\r
+	//	bool gapq = isgap(q);\r
+	//	bool gapa = isgap(a);\r
+	//	bool gapb = isgap(b);\r
+\r
+	//	if (!gapq && !gapa && !gapb)\r
+	//		{\r
+	//		if (q == a || q == b || a == b)\r
+	//			Cons.push_back(0.75);\r
+	//		else\r
+	//			Cons.push_back(0.5);\r
+	//		}\r
+	//	else\r
+	//		{\r
+	//		if (!gapa && (a == b || a == q))\r
+	//			Cons.push_back(0.5f);\r
+	//		else if (!gapb && b == q)\r
+	//			Cons.push_back(0.5f);\r
+	//		else\r
+	//			Cons.push_back(0.0f);\r
+	//		}\r
+	//	}\r
+\r
+	//float fLY = 0.0f;\r
+	//float fLN = 0.0f;\r
+	//float fLA = 0.0f;\r
+	//float fRY = 0.0f;\r
+	//float fRN = 0.0f;\r
+	//float fRA = 0.0f;\r
+// Count votes left and right of the crossover over the non-identical columns.\r
+	for (unsigned Col = ColLo; Col <= ColHi; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+		if (q == a && q == b && a == b)\r
+			continue;\r
+\r
+		unsigned ngaps = 0;\r
+		if (isgap(q))\r
+			++ngaps;\r
+		if (isgap(a))\r
+			++ngaps;\r
+		if (isgap(b))\r
+			++ngaps;\r
+\r
+		if (opt_skipgaps)\r
+			{\r
+			if (ngaps == 3)\r
+				continue;\r
+			}\r
+		else\r
+			{\r
+			if (ngaps == 2)\r
+				continue;\r
+			}\r
+\r
+// From here on 'a' is the left-hand parent of the chosen model.\r
+		if (!FirstA)\r
+			swap(a, b);\r
+\r
+		//float AvgCons = (Cons[Col-2] + Cons[Col-1] + Cons[Col+1] + Cons[Col+2])/4;\r
+		//if (Col < ColXLo)\r
+		//	{\r
+		//	if (q == a && q != b)\r
+		//		fLY += AvgCons;\r
+		//	else if (q == b && q != a)\r
+		//		fLN += AvgCons;\r
+		//	else\r
+		//		fLA += AvgCons;\r
+		//	}\r
+		//else if (Col > ColXHi)\r
+		//	{\r
+		//	if (q == b && q != a)\r
+		//		fRY += AvgCons;\r
+		//	else if (q == a && q != b)\r
+		//		fRN += AvgCons;\r
+		//	else\r
+		//		fRA += AvgCons;\r
+		//	}\r
+\r
+// With opt_skipgaps2, ignore columns adjacent to a gap in any sequence.\r
+		if (opt_skipgaps2)\r
+			{\r
+			if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))\r
+				continue;\r
+			if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))\r
+				continue;\r
+			}\r
+\r
+		//if (Col > 0 && isgap(Q3Seq[Col-1]))\r
+			//continue;\r
+		//if (Col + 1 < ColCount && isgap(Q3Seq[Col+1]))\r
+		//	continue;\r
+\r
+		if (Col < ColXLo)\r
+			{\r
+			if (q == a && q != b)\r
+				++Hit.CS_LY;\r
+			else if (q == b && q != a)\r
+				++Hit.CS_LN;\r
+			else\r
+				++Hit.CS_LA;\r
+			}\r
+		else if (Col > ColXHi)\r
+			{\r
+			if (q == b && q != a)\r
+				++Hit.CS_RY;\r
+			else if (q == a && q != b)\r
+				++Hit.CS_RN;\r
+			else\r
+				++Hit.CS_RA;\r
+			}\r
+		}\r
+\r
+	double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);\r
+	double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);\r
+	Hit.Score = ScoreL*ScoreR;\r
+\r
+	extern bool g_UchimeDeNovo;\r
+\r
+	//if (0)//g_UchimeDeNovo)\r
+	//	{\r
+	//	double AbQ = GetAbFromLabel(QLabel.c_str());\r
+	//	double AbA = GetAbFromLabel(ALabel.c_str());\r
+	//	double AbB = GetAbFromLabel(BLabel.c_str());\r
+	//	if (AbQ > 0.0 && AbA > 0.0 && AbB > 0.0)\r
+	//		{\r
+	//		double MinAb = min(AbA, AbB);\r
+	//		double Ratio = MinAb/AbQ;\r
+	//		double t = Ratio - opt_abx;\r
+	//	//	double Factor = 2.0/(1.0 + exp(-t));\r
+	//		double Factor = min(Ratio, opt_abx)/opt_abx;\r
+	//		if (opt_verbose)\r
+	//			Log("Score %.4f Ab factor %.4f >%s\n", Hit.Score, Factor, QLabel.c_str());\r
+	//		Hit.Score *= Factor;\r
+	//		}\r
+	//	}\r
+\r
+	extern FILE *g_fUChimeAlns;\r
+	if (g_fUChimeAlns != 0 && Hit.Div > 0.0)\r
+		{\r
+		void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit);\r
+		WriteChimeHitX(g_fUChimeAlns, Hit);\r
+		}\r
+	}\r
+\r
+// Dispatches a three-way alignment to the local scorer when opt_ucl is set,\r
+// otherwise to the global scorer. The result is written to Hit.\r
+void AlignChime3(const string &Q3, const string &A3, const string &B3,\r
+  const string &QLabel, const string &ALabel, const string &BLabel,\r
+  ChimeHit2 &Hit)\r
+	{\r
+	if (!opt_ucl)\r
+		{\r
+		AlignChimeGlobal3(Q3, A3, B3, QLabel, ALabel, BLabel, Hit);\r
+		return;\r
+		}\r
+	AlignChimeLocal3(Q3, A3, B3, QLabel, ALabel, BLabel, Hit);\r
+	}\r
+\r
+// Replaces s with the first L characters of Seq, omitting every character for\r
+// which isgap() is true.\r
+static void StripGaps(const byte *Seq, unsigned L, string &s)\r
+	{\r
+	s.clear();\r
+	const byte *End = Seq + L;\r
+	for (const byte *p = Seq; p != End; ++p)\r
+		{\r
+		char c = *p;\r
+		if (isgap(c))\r
+			continue;\r
+		s.push_back(c);\r
+		}\r
+	}\r
+\r
+// Copies SDIn to SDOut, then points SDOut.Seq at a newly allocated buffer that\r
+// holds the upper-cased, gap-free residues of SDIn, and sets SDOut.L to their\r
+// number. The buffer comes from myalloc; the caller must release it (myfree).\r
+static void StripGapsAlloc(const SeqData &SDIn, SeqData &SDOut)\r
+	{\r
+	SDOut = SDIn;\r
+	byte *Buffer = myalloc(byte, SDIn.L);\r
+	unsigned Kept = 0;\r
+	for (unsigned Pos = 0; Pos < SDIn.L; ++Pos)\r
+		{\r
+		char c = SDIn.Seq[Pos];\r
+		if (isgap(c))\r
+			continue;\r
+		Buffer[Kept] = toupper(c);\r
+		++Kept;\r
+		}\r
+	SDOut.Seq = Buffer;\r
+	SDOut.L = Kept;\r
+	}\r
+\r
+// Builds the three-way alignment of query QSD against parents ASD and BSD from\r
+// the two pairwise paths (Make3Way), then scores it with AlignChime3 using the\r
+// sequences' labels. The result is written to Hit.\r
+void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
+  const string &PathQA, const string &PathQB, ChimeHit2 &Hit)\r
+	{\r
+	//if (opt_ucl)\r
+	//	{\r
+	//	AlignChimeLocal(QSD, ASD, BSD, PathQA, PathQB, Hit);\r
+	//	return;\r
+	//	}\r
+\r
+	string Q3, A3, B3;\r
+	Make3Way(QSD, ASD, BSD, PathQA, PathQB, Q3, A3, B3);\r
+	AlignChime3(Q3, A3, B3, QSD.Label, ASD.Label, BSD.Label, Hit);\r
+	}\r
+\r
+// Re-aligns a pre-aligned triple from scratch: strips gaps from the three\r
+// input rows, globally aligns the query to each parent, and scores the result\r
+// with AlignChime. If either global alignment fails, Hit is cleared and only\r
+// its query label is set.\r
+//\r
+// The gap-stripped copies are allocated by StripGapsAlloc and are released on\r
+// every path. Previously the failure path returned without freeing them.\r
+void AlignChime3SDRealign(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,\r
+  ChimeHit2 &Hit)\r
+	{\r
+	SeqData QSD;\r
+	SeqData ASD;\r
+	SeqData BSD;\r
+	StripGapsAlloc(QSD3, QSD);\r
+	StripGapsAlloc(ASD3, ASD);\r
+	StripGapsAlloc(BSD3, BSD);\r
+\r
+	string PathQA;\r
+	string PathQB;\r
+	bool FoundQA = GlobalAlign(QSD, ASD, PathQA);\r
+	bool FoundQB = GlobalAlign(QSD, BSD, PathQB);\r
+	if (FoundQA && FoundQB)\r
+		AlignChime(QSD, ASD, BSD, PathQA, PathQB, Hit);\r
+	else\r
+		{\r
+		Hit.Clear();\r
+		Hit.QLabel = QSD3.Label;\r
+		}\r
+\r
+// Single cleanup point shared by the success and failure paths.\r
+	myfree((void *) QSD.Seq);\r
+	myfree((void *) ASD.Seq);\r
+	myfree((void *) BSD.Seq);\r
+	}\r
+\r
+// Scores a pre-aligned triple (three rows of equal length). With opt_realign\r
+// the rows are re-aligned via AlignChime3SDRealign. Otherwise the rows are\r
+// upper-cased, columns that are gaps in all three rows are dropped, and the\r
+// result is passed to AlignChime3.\r
+void AlignChime3SD(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,\r
+  ChimeHit2 &Hit)\r
+	{\r
+	if (opt_realign)\r
+		{\r
+		AlignChime3SDRealign(QSD3, ASD3, BSD3, Hit);\r
+		return;\r
+		}\r
+\r
+	const unsigned ColCount = QSD3.L;\r
+	asserta(ASD3.L == ColCount && BSD3.L == ColCount);\r
+\r
+	string Q3, A3, B3;\r
+	Q3.reserve(ColCount);\r
+	A3.reserve(ColCount);\r
+	B3.reserve(ColCount);\r
+\r
+	for (unsigned Col = 0; Col < ColCount; ++Col)\r
+		{\r
+		byte q = toupper(QSD3.Seq[Col]);\r
+		byte a = toupper(ASD3.Seq[Col]);\r
+		byte b = toupper(BSD3.Seq[Col]);\r
+\r
+// Keep the column unless it is a gap in every row.\r
+		const bool AllGaps = isgap(q) && isgap(a) && isgap(b);\r
+		if (!AllGaps)\r
+			{\r
+			Q3.push_back(q);\r
+			A3.push_back(a);\r
+			B3.push_back(b);\r
+			}\r
+		}\r
+\r
+	AlignChime3(Q3, A3, B3, QSD3.Label, ASD3.Label, BSD3.Label, Hit);\r
+	}\r
--- /dev/null
+#include "myutils.h"\r
+#include "seq.h"\r
+#include "chime.h"\r
+\r
+#define TRACE 0\r
+\r
+/***\r
+Let:\r
+ S[i] = Score of col i: 0=no SNP, +1 = Y, -3 = N or A.\r
+\r
+ V[k] = Best segment score from j, j+1 .. k for all possible j\r
+ max(j) Sum i=j..k S[i]\r
+\r
+Recursion relation:\r
+ V[k] = S[k] + max (V[k-1], 0)\r
+***/\r
+\r
+void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3,\r
+ const string &QLabel, const string &ALabel, const string &BLabel,\r
+ ChimeHit2 &Hit);\r
+\r
+void Make3Way(const SeqData &SDQ, const SeqData &SDA, const SeqData &SDB,\r
+ const string &PathQA, const string &PathQB,\r
+ string &Q3, string &A3, string &B3);\r
+\r
+double GetScore2(double Y, double N, double A);\r
+\r
+// Local variant of the chimera scorer. Using the per-column scores and the\r
+// recursion described in the comment at the top of this file, it:\r
+//   1. scores every column for parent A and for parent B (+1 for a vote for\r
+//      that parent, -opt_xn otherwise; columns that are identical, gapped, or\r
+//      adjacent to a gap score 0);\r
+//   2. computes best-segment scores ending at each column from the left\r
+//      (LVA/LVB) and starting at each column from the right (RVA/RVB);\r
+//   3. picks the crossover column ColX maximising left(A)+right(B) or\r
+//      left(B)+right(A);\r
+//   4. recovers the segment bounds ColLo..ColHi, optionally rejects the hit\r
+//      if the segment covers less than opt_queryfract of the query, and\r
+//   5. runs AlignChimeGlobal3 on the ColLo..ColHi sub-alignment.\r
+//\r
+// On every early return Hit is cleared except for its query label, matching\r
+// AlignChimeGlobal3.\r
+void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3,\r
+  const string &QLabel, const string &ALabel, const string &BLabel,\r
+  ChimeHit2 &Hit)\r
+	{\r
+	Hit.Clear();\r
+// Set the label up front so early returns still identify the query\r
+// (AlignChimeGlobal3 does the same).\r
+	Hit.QLabel = QLabel;\r
+\r
+	const byte *Q3Seq = (const byte *) Q3.c_str();\r
+	const byte *A3Seq = (const byte *) A3.c_str();\r
+	const byte *B3Seq = (const byte *) B3.c_str();\r
+\r
+	const unsigned ColCount = SIZE(Q3);\r
+	asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);\r
+\r
+// An empty alignment has no crossover. Returning here also keeps the [0] and\r
+// [ColCount-1] accesses and the unsigned "ColCount-1" loop bounds below valid.\r
+	if (ColCount == 0)\r
+		return;\r
+\r
+	vector<float> ColScoresA(ColCount, 0.0f);\r
+	vector<float> ColScoresB(ColCount, 0.0f);\r
+\r
+	float ScoreN = -(float) opt_xn;\r
+	unsigned QL = 0;\r
+	for (unsigned Col = 0; Col < ColCount; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+\r
+		if (!isgap(q))\r
+			++QL;\r
+\r
+		if (q == a && q == b && a == b)\r
+			continue;\r
+\r
+		if (isgap(q) || isgap(a) || isgap(b))\r
+			continue;\r
+\r
+		if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))\r
+			continue;\r
+\r
+		if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))\r
+			continue;\r
+\r
+		if (q == a && q != b)\r
+			ColScoresA[Col] = 1;\r
+		else\r
+			ColScoresA[Col] = ScoreN;\r
+\r
+		if (q == b && q != a)\r
+			ColScoresB[Col] = 1;\r
+		else\r
+			ColScoresB[Col] = ScoreN;\r
+		}\r
+\r
+// LV*[k]: best score of a segment ending at column k (left-to-right pass).\r
+	vector<float> LVA(ColCount, 0.0f);\r
+	vector<float> LVB(ColCount, 0.0f);\r
+\r
+	LVA[0] = ColScoresA[0];\r
+	LVB[0] = ColScoresB[0];\r
+	for (unsigned Col = 1; Col < ColCount; ++Col)\r
+		{\r
+		LVA[Col] = max(LVA[Col-1], 0.0f) + ColScoresA[Col];\r
+		LVB[Col] = max(LVB[Col-1], 0.0f) + ColScoresB[Col];\r
+		}\r
+\r
+// RV*[k]: best score of a segment starting at column k (right-to-left pass).\r
+	vector<float> RVA(ColCount, 0.0f);\r
+	vector<float> RVB(ColCount, 0.0f);\r
+\r
+	RVA[ColCount-1] = ColScoresA[ColCount-1];\r
+	RVB[ColCount-1] = ColScoresB[ColCount-1];\r
+	for (int Col = ColCount-2; Col >= 0; --Col)\r
+		{\r
+		RVA[Col] = max(RVA[Col+1], 0.0f) + ColScoresA[Col];\r
+		RVB[Col] = max(RVB[Col+1], 0.0f) + ColScoresB[Col];\r
+		}\r
+\r
+	bool FirstA = true;\r
+	float MaxSum = 0.0;\r
+	unsigned ColX = UINT_MAX;\r
+	for (unsigned Col = 1; Col < ColCount-1; ++Col)\r
+		{\r
+		float Sum = LVA[Col] + RVB[Col+1];\r
+		if (Sum > MaxSum)\r
+			{\r
+			FirstA = true;\r
+			MaxSum = Sum;\r
+			ColX = Col;\r
+			}\r
+		}\r
+\r
+	for (unsigned Col = 1; Col < ColCount-1; ++Col)\r
+		{\r
+		float Sum = LVB[Col] + RVA[Col+1];\r
+		if (Sum > MaxSum)\r
+			{\r
+			FirstA = false;\r
+			MaxSum = Sum;\r
+			ColX = Col;\r
+			}\r
+		}\r
+	if (ColX == UINT_MAX)\r
+		return;\r
+\r
+// Walk outwards from ColX to recover the segment bounds.\r
+// NOTE(review): the right-hand walks start at ColX+1 but compare against\r
+// RV*[ColX], whereas the crossover search above used RV*[ColX+1]. This looks\r
+// like a possible off-by-one; left unchanged pending confirmation.\r
+	unsigned ColLo = UINT_MAX;\r
+	unsigned ColHi = UINT_MAX;\r
+	if (FirstA)\r
+		{\r
+		float Sum = 0.0f;\r
+		for (int Col = ColX; Col >= 0; --Col)\r
+			{\r
+			Sum += ColScoresA[Col];\r
+			if (Sum >= LVA[ColX])\r
+				{\r
+				ColLo = Col;\r
+				break;\r
+				}\r
+			}\r
+		asserta(Sum >= LVA[ColX]);\r
+		Sum = 0.0f;\r
+		for (unsigned Col = ColX+1; Col < ColCount; ++Col)\r
+			{\r
+			Sum += ColScoresB[Col];\r
+			if (Sum >= RVB[ColX])\r
+				{\r
+				ColHi = Col;\r
+				break;\r
+				}\r
+			}\r
+		asserta(Sum >= RVB[ColX]);\r
+		}\r
+	else\r
+		{\r
+		float Sum = 0.0f;\r
+		for (int Col = ColX; Col >= 0; --Col)\r
+			{\r
+			Sum += ColScoresB[Col];\r
+			if (Sum >= LVB[ColX])\r
+				{\r
+				ColLo = Col;\r
+				break;\r
+				}\r
+			}\r
+		asserta(Sum >= LVB[ColX]);\r
+		Sum = 0.0f;\r
+		for (unsigned Col = ColX+1; Col < ColCount; ++Col)\r
+			{\r
+			Sum += ColScoresA[Col];\r
+			if (Sum >= RVA[ColX])\r
+				{\r
+				ColHi = Col;\r
+				break;\r
+				}\r
+			}\r
+		asserta(Sum >= RVA[ColX]);\r
+		}\r
+\r
+// Extend the crossover over neighbouring columns where all three sequences\r
+// agree on a non-gap character.\r
+	unsigned ColXHi = ColX;\r
+	for (unsigned Col = ColX + 1; Col < ColCount; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+		\r
+		if (q == a && q == b && !isgap(q))\r
+			ColXHi = Col;\r
+		else\r
+			break;\r
+		}\r
+\r
+	unsigned ColXLo = ColX;\r
+	for (int Col = (int) ColX - 1; Col >= 0; --Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+		\r
+		if (q == a && q == b && !isgap(q))\r
+			ColXLo = Col;\r
+		else\r
+			break;\r
+		}\r
+\r
+	unsigned IdQA = 0;\r
+	unsigned IdQB = 0;\r
+	unsigned IdAB = 0;\r
+	unsigned NQA = 0;\r
+	unsigned NQB = 0;\r
+	unsigned NAB = 0;\r
+	for (unsigned Col = 0; Col < ColCount; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+\r
+		if (!isgap(q) && !isgap(a))\r
+			{\r
+			++NQA;\r
+			if (q == a)\r
+				++IdQA;\r
+			}\r
+\r
+		if (!isgap(q) && !isgap(b))\r
+			{\r
+			++NQB;\r
+			if (q == b)\r
+				++IdQB;\r
+			}\r
+\r
+		if (!isgap(a) && !isgap(b))\r
+			{\r
+			++NAB;\r
+			if (a == b)\r
+				++IdAB;\r
+			}\r
+		}\r
+\r
+	Hit.PctIdQA = Pct(IdQA, NQA);\r
+	Hit.PctIdQB = Pct(IdQB, NQB);\r
+	Hit.PctIdAB = Pct(IdAB, NAB);\r
+\r
+	unsigned LIdQA = 0;\r
+	unsigned LIdQB = 0;\r
+	for (unsigned Col = ColLo; Col < ColXLo; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+\r
+		if (!isgap(q) && !isgap(a))\r
+			{\r
+			if (q == a)\r
+				++LIdQA;\r
+			}\r
+\r
+		if (!isgap(q) && !isgap(b))\r
+			{\r
+			if (q == b)\r
+				++LIdQB;\r
+			}\r
+		}\r
+\r
+	unsigned RIdQA = 0;\r
+	unsigned RIdQB = 0;\r
+	for (unsigned Col = ColXHi+1; Col <= ColHi; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+\r
+		if (!isgap(q) && !isgap(a))\r
+			{\r
+			if (q == a)\r
+				++RIdQA;\r
+			}\r
+\r
+		if (!isgap(q) && !isgap(b))\r
+			{\r
+			if (q == b)\r
+				++RIdQB;\r
+			}\r
+		}\r
+\r
+	unsigned IdDiffL = max(LIdQA, LIdQB) - min(LIdQA, LIdQB);\r
+	unsigned IdDiffR = max(RIdQA, RIdQB) - min(RIdQA, RIdQB);\r
+	unsigned MinIdDiff = min(IdDiffL, IdDiffR);\r
+	unsigned ColRange = ColHi - ColLo + 1;\r
+// Reject local hits that span too small a fraction of the ungapped query.\r
+	if (opt_queryfract > 0.0f && float(ColRange)/float(QL) < opt_queryfract)\r
+		return;\r
+\r
+//	double Div = Pct(MinIdDiff, QSD.L);\r
+\r
+#if	TRACE\r
+	{\r
+	Log("  Col  A Q B   ScoreA   ScoreB      LVA      LVB      RVA      RVB\n");\r
+	Log("-----  - - -  -------  -------  -------  -------  -------  -------\n");\r
+	for (unsigned Col = 0; Col < ColCount; ++Col)\r
+		{\r
+		if (ColScoresA[Col] == 0.0 && ColScoresB[Col] == 0.0)\r
+			continue;\r
+\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+		Log("%5u  %c %c %c", Col, a, q, b);\r
+\r
+		if (ColScoresA[Col] == 0.0)\r
+			Log("  %7.7s", "");\r
+		else\r
+			Log("  %7.1f", ColScoresA[Col]);\r
+\r
+		if (ColScoresB[Col] == 0.0)\r
+			Log("  %7.7s", "");\r
+		else\r
+			Log("  %7.1f", ColScoresB[Col]);\r
+\r
+		Log("  %7.1f  %7.1f  %7.1f  %7.1f", LVA[Col], LVB[Col], RVA[Col], RVB[Col]);\r
+\r
+		Log("\n");\r
+		}\r
+	Log("\n");\r
+	Log("MaxSum %.1f, ColLo %u, ColXLo %u, ColX %u, ColXHi %u, ColHi %u, AF %c\n",\r
+	  MaxSum, ColLo, ColXLo, ColX, ColXHi, ColHi, tof(FirstA));\r
+	Log("  LIdQA %u, LIdQB %u, RIdQA %u, RIdQB %u\n", LIdQA, LIdQB, RIdQA, RIdQB);\r
+	}\r
+#endif\r
+\r
+// Extract the ColLo..ColHi sub-alignment and score it globally.\r
+	string Q3L;\r
+	string A3L;\r
+	string B3L;\r
+	for (unsigned Col = ColLo; Col <= ColHi; ++Col)\r
+		{\r
+		char q = Q3[Col];\r
+		char a = A3[Col];\r
+		char b = B3[Col];\r
+\r
+		Q3L += q;\r
+		A3L += a;\r
+		B3L += b;\r
+		}\r
+\r
+	AlignChimeGlobal3(Q3L, A3L, B3L, QLabel, ALabel, BLabel, Hit);\r
+\r
+#if	0\r
+// CS SNPs\r
+	Hit.CS_LY = 0;\r
+	Hit.CS_LN = 0;\r
+	Hit.CS_RY = 0;\r
+	Hit.CS_RN = 0;\r
+	Hit.CS_LA = 0;\r
+	Hit.CS_RA = 0;\r
+	for (unsigned Col = ColLo; Col <= ColHi; ++Col)\r
+		{\r
+		char q = Q3Seq[Col];\r
+		char a = A3Seq[Col];\r
+		char b = B3Seq[Col];\r
+		if (q == a && q == b && a == b)\r
+			continue;\r
+		if (isgap(q) || isgap(a) || isgap(b))\r
+			continue;\r
+		if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))\r
+			continue;\r
+		if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))\r
+			continue;\r
+\r
+		if (!FirstA)\r
+			swap(a, b);\r
+\r
+		if (Col < ColXLo)\r
+			{\r
+			if (q == a && q != b)\r
+				++Hit.CS_LY;\r
+			else if (q == b && q != a)\r
+				++Hit.CS_LN;\r
+			else\r
+				++Hit.CS_LA;\r
+			}\r
+		else if (Col > ColXHi)\r
+			{\r
+			if (q == b && q != a)\r
+				++Hit.CS_RY;\r
+			else if (q == a && q != b)\r
+				++Hit.CS_RN;\r
+			else\r
+				++Hit.CS_RA;\r
+			}\r
+		}\r
+\r
+	double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);\r
+	double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);\r
+	Hit.Score = ScoreL*ScoreR;\r
+\r
+	//Hit.QSD = QSD;\r
+	//if (FirstA)\r
+	//	{\r
+	//	Hit.ASD = ASD;\r
+	//	Hit.BSD = BSD;\r
+	//	Hit.PathQA = PathQA;\r
+	//	Hit.PathQB = PathQB;\r
+	//	}\r
+	//else\r
+	//	{\r
+	//	Hit.ASD = BSD;\r
+	//	Hit.BSD = ASD;\r
+	//	}\r
+\r
+	//Hit.ColLo = ColLo;\r
+	//Hit.ColXLo = ColXLo;\r
+	//Hit.ColXHi = ColXHi;\r
+	//Hit.ColHi = ColHi;\r
+	//Hit.Div = Div;\r
+\r
+//	Hit.LogMe();\r
+#endif\r
+	}\r
--- /dev/null
+A(Alpha)\r
+A(Mx)\r
+A(ChainBrute)\r
+A(Chainer)\r
+A(Test)\r
+A(CompressPath)\r
+A(HSPFinder)\r
+A(Main)\r
+A(Clumps)\r
+A(Path)\r
+A(SeqDB)\r
+A(SFasta)\r
+A(SWUngapped)\r
+A(AllocBit)\r
+A(Ultra)\r
+A(UPGMA)\r
+A(Windex)\r
+A(XDropBwd)\r
+A(Xlat)\r
+A(MPath)\r
+A(ScoreCache)\r
+A(TargetHits)\r
+A(Out)\r
+A(Hashdex)\r
--- /dev/null
+#ifndef alnheuristics_h\r
+#define alnheuristics_h\r
+\r
+struct AlnParams;\r
+\r
+// Tunable heuristic settings for alignment. Only the declarations are visible\r
+// here; the constructor and the Init* members are defined elsewhere.\r
+struct AlnHeuristics\r
+	{\r
+	unsigned BandRadius;\r
+	unsigned HSPFinderWordLength;\r
+	float SeedT;\r
+\r
+	float XDropG;		// GappedBlast default\r
+	float XDropU;		// UngappedBlast default\r
+	float XDropUG;		// UngappedBlast called by GappedBlast\r
+\r
+	unsigned MinGlobalHSPLength;\r
+\r
+	AlnHeuristics();\r
+	void InitFromCmdLine(const AlnParams &AP);\r
+	void InitGlobalFull();\r
+\r
+	// True when both MinGlobalHSPLength and BandRadius are zero.\r
+	// NOTE(review): presumably this means "no HSP/banding shortcuts, do a full\r
+	// global alignment" -- confirm against InitGlobalFull().\r
+	bool IsGlobalFull() const\r
+		{\r
+		return MinGlobalHSPLength == 0 && BandRadius == 0;\r
+		}\r
+\r
+	};\r
+\r
+#endif // alnheuristics_h\r
--- /dev/null
+#include "myutils.h"\r
+#include <float.h> // for FLT_MAX\r
+#include "mx.h"\r
+#include "alnparams.h"\r
+#include "hsp.h"\r
+\r
+#define TEST 0\r
+\r
+void SetBLOSUM62();
+void SetNucSubstMx(double Match, double Mismatch);\r
+void ReadSubstMx(const string &FileName, Mx<float> &Mxf);\r
+
+extern Mx<float> g_SubstMxf;
+extern float **g_SubstMx;
+\r
+// Resets the parameters to an "unset" state: no matrix name, nucleotide flags\r
+// false, and every gap penalty set to the sentinel OBVIOUSLY_WRONG_PENALTY.\r
+// SubstMx itself is not touched.\r
+void AlnParams::Clear()\r
+	{\r
+	SubstMxName = 0;\r
+	Nucleo = false;\r
+	NucleoSet = false;\r
+\r
+// Local-alignment penalties.\r
+	LocalOpen = LocalExt = OBVIOUSLY_WRONG_PENALTY;\r
+\r
+// Internal gap penalties.\r
+	OpenA = OpenB = OBVIOUSLY_WRONG_PENALTY;\r
+	ExtA = ExtB = OBVIOUSLY_WRONG_PENALTY;\r
+\r
+// Left- and right-terminal gap penalties.\r
+	LOpenA = LOpenB = ROpenA = ROpenB = OBVIOUSLY_WRONG_PENALTY;\r
+	LExtA = LExtB = RExtA = RExtB = OBVIOUSLY_WRONG_PENALTY;\r
+	}\r
+\r
+// True when a single open penalty and a single extension penalty apply\r
+// everywhere, i.e. all six open values equal OpenA and all six extension\r
+// values equal ExtA.\r
+bool AlnParams::Is2() const\r
+	{\r
+	const float Open = OpenA;\r
+	const bool OneOpen = OpenB == Open && LOpenA == Open && LOpenB == Open\r
+	  && ROpenA == Open && ROpenB == Open;\r
+	if (!OneOpen)\r
+		return false;\r
+\r
+	const float Ext = ExtA;\r
+	const bool OneExt = ExtB == Ext && LExtA == Ext && LExtB == Ext\r
+	  && RExtA == Ext && RExtB == Ext;\r
+	return OneExt;\r
+	}\r
+\r
+// True when the penalties reduce to four values: internal open (OpenA),\r
+// terminal open (LOpenA), internal extension (ExtA) and terminal extension\r
+// (LExtA), each shared by both sequences and both ends.\r
+bool AlnParams::Is4() const\r
+	{\r
+	const float Open = OpenA;\r
+	const float TermOpen = LOpenA;\r
+	const bool OpensMatch = OpenB == Open\r
+	  && LOpenB == TermOpen && ROpenA == TermOpen && ROpenB == TermOpen;\r
+	if (!OpensMatch)\r
+		return false;\r
+\r
+	const float Ext = ExtA;\r
+	const float TermExt = LExtA;\r
+	const bool ExtsMatch = ExtB == Ext\r
+	  && LExtB == TermExt && RExtA == TermExt && RExtB == TermExt;\r
+	return ExtsMatch;\r
+	}\r
+\r
+// Returns "2", "4" or "12": the number of distinct gap penalties in use, as\r
+// classified by Is2() and Is4().\r
+const char *AlnParams::GetType() const\r
+	{\r
+	if (Is2())\r
+		return "2";\r
+	return Is4() ? "4" : "12";\r
+	}\r
+\r
+// Two-penalty setup: uses Mx as the substitution matrix and applies Open and\r
+// Ext to every internal and terminal gap of both sequences.\r
+void AlnParams::Init2(const float * const *Mx, float Open, float Ext)\r
+	{\r
+	SubstMx = Mx;\r
+\r
+	ROpenB = Open;\r
+	ROpenA = Open;\r
+	LOpenB = Open;\r
+	LOpenA = Open;\r
+	OpenB = Open;\r
+	OpenA = Open;\r
+\r
+	RExtB = Ext;\r
+	RExtA = Ext;\r
+	LExtB = Ext;\r
+	LExtA = Ext;\r
+	ExtB = Ext;\r
+	ExtA = Ext;\r
+	}\r
+\r
+// Stores the gap open and extension penalties used for local alignment.\r
+void AlnParams::SetLocal(float Open, float Ext)\r
+	{\r
+	LocalExt = Ext;\r
+	LocalOpen = Open;\r
+	}\r
+\r
+// Four-penalty setup: Open/Ext apply to internal gaps, TermOpen/TermExt to\r
+// left- and right-terminal gaps, identically for both sequences. Mx becomes\r
+// the substitution matrix.\r
+void AlnParams::Init4(const float * const *Mx, float Open, float Ext,\r
+  float TermOpen, float TermExt)\r
+	{\r
+	SubstMx = Mx;\r
+\r
+// Internal gaps.\r
+	OpenB = Open;\r
+	OpenA = Open;\r
+	ExtB = Ext;\r
+	ExtA = Ext;\r
+\r
+// Terminal gaps.\r
+	ROpenB = TermOpen;\r
+	ROpenA = TermOpen;\r
+	LOpenB = TermOpen;\r
+	LOpenA = TermOpen;\r
+	RExtB = TermExt;\r
+	RExtA = TermExt;\r
+	LExtB = TermExt;\r
+	LExtA = TermExt;\r
+	}\r
+\r
+// Derives parameters for aligning one HSP from the full-sequence parameters\r
+// AP. The matrix and internal penalties are copied. For each of the four\r
+// (end, sequence) combinations, AP's terminal penalty is used when the\r
+// corresponding HSP predicate (LeftA, LeftB, RightA(LA), RightB(LB)) is true,\r
+// otherwise AP's internal penalty is used.\r
+// NOTE(review): presumably those predicates report whether the HSP touches\r
+// that end of the sequence -- confirm against HSPData.\r
+void AlnParams::Init(const AlnParams &AP, const HSPData &HSP,\r
+  unsigned LA, unsigned LB)\r
+	{\r
+	SubstMx = AP.SubstMx;\r
+	OpenA = AP.OpenA;\r
+	OpenB = AP.OpenB;\r
+	ExtA = AP.ExtA;\r
+	ExtB = AP.ExtB;\r
+\r
+	const bool TermLeftA = HSP.LeftA();\r
+	LOpenA = TermLeftA ? AP.LOpenA : AP.OpenA;\r
+	LExtA = TermLeftA ? AP.LExtA : AP.ExtA;\r
+\r
+	const bool TermLeftB = HSP.LeftB();\r
+	LOpenB = TermLeftB ? AP.LOpenB : AP.OpenB;\r
+	LExtB = TermLeftB ? AP.LExtB : AP.ExtB;\r
+\r
+	const bool TermRightA = HSP.RightA(LA);\r
+	ROpenA = TermRightA ? AP.ROpenA : AP.OpenA;\r
+	RExtA = TermRightA ? AP.RExtA : AP.ExtA;\r
+\r
+	const bool TermRightB = HSP.RightB(LB);\r
+	ROpenB = TermRightB ? AP.ROpenB : AP.OpenB;\r
+	RExtB = TermRightB ? AP.RExtB : AP.ExtB;\r
+	}\r
+\r
+// Writes a one-line summary of the gap penalties to the log, prefixed by the\r
+// parameter type from GetType(). The "2" and "4" forms print negated stored\r
+// values; the 12-value form prints the stored values as they are.\r
+void AlnParams::LogMe() const\r
+	{\r
+	Log("AlnParams(%s)", GetType());\r
+	if (Is2())\r
+		Log(" g=%.1f e=%.1f", -OpenA, -ExtA);\r
+	else if (Is4())\r
+// Argument order must follow the format: g, tg, e, te. The terminal-open and\r
+// internal-extension arguments were previously swapped.\r
+		Log(" g=%.1f tg=%.1f e=%.1f te=%.1f", -OpenA, -LOpenA, -ExtA, -LExtA);\r
+	else\r
+		Log(\r
+" gA=%.1f gB=%.1f gAL=%.1f gBL=%.1f gAR=%.1f gBR=%.1f eA=%.1f eB=%.1f eAL=%.1f eBL=%.1f eAR=%.1f eBR=%.1f",\r
+		  OpenA, OpenB, LOpenA, LOpenB, ROpenA, ROpenB, ExtA, ExtB, LExtA, LExtB, RExtA, RExtB);\r
+	Log("\n");\r
+	}\r
+\r
+/***\r
+Open/Ext format string is one or more:\r
+ [<flag><flag>...]<value>\r
+\r
+Value is (positive) penalty or * (disabled).\r
+Flag is:\r
+ Q Query.\r
+ T Target sequence.\r
+  I		Internal gaps (default internal and terminal).\r
+ E End gaps (default internal and terminal).\r
+ L Left end.\r
+ R Right end.\r
+***/\r
+\r
+// Parses a gap-penalty string in the format described in the comment above\r
+// and stores the NEGATED values in the selected outputs:\r
+//   QI/QL/QR  query internal / left-terminal / right-terminal penalty\r
+//   TI/TL/TR  the same for the target\r
+// Fields are separated by '/'. Outputs not selected by any field are left\r
+// untouched; an empty string changes nothing. Malformed input calls Die().\r
+static void ParseGapStr(const string &s,\r
+  float &QI, float &QL, float &QR,\r
+  float &TI, float &TL, float &TR)\r
+	{\r
+	if (s.empty())\r
+		return;\r
+\r
+// Flags seen in the current field.\r
+	bool Q = false;\r
+	bool T = false;\r
+	bool I = false;\r
+	bool E = false;\r
+	bool L = false;\r
+	bool R = false;\r
+\r
+	const unsigned K = SIZE(s);\r
+// Dec == 0: still in the integer part; otherwise the divisor for the most\r
+// recently read decimal digit. Value == FLT_MAX means "no value seen yet".\r
+	unsigned Dec = 0;\r
+	float Value = FLT_MAX;\r
+// i runs to K inclusive so the terminating NUL closes the last field.\r
+	for (unsigned i = 0; i <= K; ++i)\r
+		{\r
+		char c = s.c_str()[i];\r
+		if (c == 0 || c == '/')\r
+			{\r
+// End of field: resolve default flags, then apply the value.\r
+			if (Value == FLT_MAX)\r
+				Die("Invalid gap penalty string, missing penalty '%s'", s.c_str());\r
+			if (!Q && !T && !I && !E && !L && !R)\r
+				{\r
+				Q = true;\r
+				T = true;\r
+				L = true;\r
+				R = true;\r
+				I = true;\r
+				}\r
+\r
+// No position flag given: apply to internal and both terminal gaps.\r
+			if (!E && !I && !L && !R)\r
+				{\r
+				E = false;\r
+				I = true;\r
+				L = true;\r
+				R = true;\r
+				}\r
+\r
+// E is shorthand for L and R together and may not be combined with either.\r
+			if (E)\r
+				{\r
+				if (L || R)\r
+					Die("Invalid gap penalty string (E and L or R) '%s'", s.c_str());\r
+				L = true;\r
+				R = true;\r
+				}\r
+\r
+// No sequence flag given: apply to both query and target.\r
+			if (!Q && !T)\r
+				{\r
+				Q = true;\r
+				T = true;\r
+				}\r
+\r
+			if (Q && L)\r
+				QL = -Value;\r
+			if (Q && R)\r
+				QR = -Value;\r
+			if (Q && I)\r
+				QI = -Value;\r
+			if (T && L)\r
+				TL = -Value;\r
+			if (T && R)\r
+				TR = -Value;\r
+			if (T && I)\r
+				TI = -Value;\r
+			\r
+// Reset the state for the next field.\r
+			Value = FLT_MAX;\r
+			Dec = 0;\r
+			Q = false;\r
+			T = false;\r
+			I = false;\r
+			E = false;\r
+			L = false;\r
+			R = false;\r
+			}\r
+		else if (c == '*')\r
+			{\r
+// '*' stands for -MINUS_INFINITY; it cannot follow digits.\r
+			if (Value != FLT_MAX)\r
+				Die("Invalid gap penalty (* in floating point number) '%s'", s.c_str());\r
+			Value = -MINUS_INFINITY;\r
+			}\r
+		else if (isdigit(c))\r
+			{\r
+			if (Value == -MINUS_INFINITY)\r
+				Die("Invalid gap penalty (* in floating point number) '%s'", s.c_str());\r
+			if (Value == FLT_MAX)\r
+				Value = 0.0;\r
+			if (Dec > 0)\r
+				{\r
+// Fractional digit: each one is worth a tenth of the previous.\r
+				Dec *= 10;\r
+				Value += float(c - '0')/Dec;\r
+				}\r
+			else\r
+				Value = Value*10 + (c - '0');\r
+			}\r
+		else if (c == '.')\r
+			{\r
+			if (Dec > 0)\r
+				Die("Invalid gap penalty (two decimal points) '%s'", s.c_str());\r
+			Dec = 1;\r
+			}\r
+		else\r
+			{\r
+// Anything else must be one of the flag letters.\r
+			switch (c)\r
+				{\r
+			case 'Q':\r
+				Q = true;\r
+				break;\r
+			case 'T':\r
+				T = true;\r
+				break;\r
+			case 'I':\r
+				I = true;\r
+				break;\r
+			case 'L':\r
+				L = true;\r
+				break;\r
+			case 'R':\r
+				R = true;\r
+				break;\r
+			case 'E':\r
+				E = true;\r
+				break;\r
+			default:\r
+				Die("Invalid char '%c' in gap penalty string '%s'", c, s.c_str());\r
+				}\r
+			}\r
+		}\r
+	}\r
+\r
+void AlnParams::SetPenalties(const string &OpenStr, const string &ExtStr) // Parse gap-open and gap-extend specs into the global gap scores.\r
+ {\r
+ ParseGapStr(OpenStr, OpenA, LOpenA, ROpenA, OpenB, LOpenB, ROpenB); // the A fields receive the query (Q*) outputs, the B fields the target (T*) outputs\r
+ ParseGapStr(ExtStr, ExtA, LExtA, RExtA, ExtB, LExtB, RExtB); // ParseGapStr returns early on an empty string, leaving the current values unchanged\r
+ }\r
+\r
+// Select the substitution matrix from the option globals and bind it to SubstMx.\r
+// IsNucleo: SetNucSubstMx(opt_match, opt_mismatch); otherwise BLOSUM62 unless opt_matrix is non-empty.\r
+void AlnParams::SetMxFromCmdLine(bool IsNucleo)\r
+	{\r
+	if (IsNucleo)\r
+		SetNucSubstMx(opt_match, opt_mismatch);\r
+	else if (opt_matrix == "")\r
+		{\r
+		SubstMxName = "BLOSUM62";\r
+		SetBLOSUM62();\r
+		}\r
+	else\r
+		{\r
+		ReadSubstMx(opt_matrix, g_SubstMxf);\r
+		g_SubstMx = g_SubstMxf.GetData();\r
+		g_SubstMxf.LogMe();\r
+		SubstMxName = opt_matrix.c_str();\r
+		}\r
+	// Whichever branch ran, g_SubstMx must be non-null by now (checked below).\r
+	SubstMx = g_SubstMx;\r
+	asserta(SubstMx != 0);\r
+	}\r
+\r
+// Calls Clear(), then sets the substitution matrix, the local gap scores and the\r
+// global gap scores from the command-line option globals (opt_* / optset_*).\r
+void AlnParams::InitFromCmdLine(bool IsNucleo)\r
+	{\r
+	Clear();\r
+	Nucleo = IsNucleo;\r
+	NucleoSet = true;\r
+\r
+	SetMxFromCmdLine(IsNucleo);\r
+\r
+// Local: opt_lopen/opt_lext are non-negative penalties and are stored negated.\r
+	if (!optset_lopen && !optset_lext)\r
+		{\r
+		// Defaults currently coincide for both alphabets; kept per-alphabet so they can diverge later.\r
+		const float DefaultOpen = IsNucleo ? -10.0f : -10.0f;\r
+		const float DefaultExt = IsNucleo ? -1.0f : -1.0f;\r
+		SetLocal(DefaultOpen, DefaultExt);\r
+		}\r
+	else\r
+		{\r
+		if (!(optset_lopen && optset_lext))\r
+			Die("Must set both --lopen and --lext");\r
+		if (opt_lopen < 0.0 || opt_lext < 0.0)\r
+			Die("Invalid --lopen/--lext, gap penalties must be >= 0");\r
+		SetLocal(float(-opt_lopen), float(-opt_lext));\r
+		}\r
+\r
+// Global: start from the 4-parameter defaults (only the open score depends on the alphabet),\r
+// then let the opt_gapopen/opt_gapext strings adjust individual scores.\r
+	const float GlobalOpen = IsNucleo ? -10.0f : -17.0f;\r
+	Init4(g_SubstMx, GlobalOpen, -1.0f, -0.5f, -0.5f);\r
+	SetPenalties(opt_gapopen, opt_gapext);\r
+	}\r
+\r
+float AlnParams::GetLocalOpen() const // Returns LocalOpen, the local-alignment gap-open score.\r
+ {\r
+ return LocalOpen;\r
+ }\r
+\r
+float AlnParams::GetLocalExt() const // Returns LocalExt, the local-alignment gap-extend score.\r
+ {\r
+ return LocalExt;\r
+ }\r
+\r
+bool AlnParams::GetIsNucleo() const // Returns Nucleo; valid only after Nucleo has been assigned.\r
+ {\r
+ asserta(NucleoSet); // NucleoSet is set to true alongside Nucleo in InitFromCmdLine\r
+ return Nucleo;\r
+ }\r
+\r
+// Word length for the word index: opt_w when optset_w is set,\r
+// otherwise 8 when Nucleo is true and 5 when it is false.\r
+unsigned GetWindexWordLength(bool Nucleo)\r
+	{\r
+	if (!optset_w)\r
+		// No explicit override: fall back to the alphabet-specific default.\r
+		return Nucleo ? 8 : 5;\r
+\r
+	return opt_w;\r
+	}\r
+\r
+#if TEST\r
+static void Test1(const string &os, const string &es) // Logs both spec strings, applies them with SetPenalties, then logs the parameters.\r
+ {\r
+ AlnParams AP; // NOTE(review): not Clear()ed; any field the specs do not set would be logged uninitialized -- confirm intended\r
+ Log("\n");\r
+ Log("OpenStr %s\n", os.c_str());\r
+ Log(" ExtStr %s\n", es.c_str());\r
+ AP.SetPenalties(os, es);\r
+ AP.LogMe();\r
+ }\r
+\r
+void TestGapStr() // Runs Test1 on sample open/ext spec strings; output goes to the log only, nothing is asserted.\r
+ {\r
+ Test1("17I/0.5E", "1I/0.5E"); // internal vs. end gaps\r
+ Test1("17I/0.5L/0.4R", "1Q/2T"); // separate left/right ends; query vs. target\r
+ Test1("1QL/2QR/3QI/4TL/5TR/6TI", ".1QL/.2QR/.3QI/.4TL/.5TR/.6TI"); // every query/target x position combination; leading-dot decimals\r
+ }\r
+#endif // TEST\r
--- /dev/null
+#ifndef alnparams_h\r
+#define alnparams_h\r
+\r
+struct HSPData;\r
+\r
+// Gap penalty scores are negative\r
+// (i.e., are scores, not penalties).\r
+struct AlnParams // Alignment scoring parameters: substitution matrix plus local and global gap scores.\r
+ {\r
+ const char *SubstMxName; // matrix name; SetMxFromCmdLine stores "BLOSUM62" or opt_matrix.c_str() (not copied)\r
+ const float * const *SubstMx; // substitution matrix; SetMxFromCmdLine copies the g_SubstMx pointer here\r
+\r
+ bool Nucleo; // alphabet flag as passed to InitFromCmdLine; read via GetIsNucleo()\r
+ bool NucleoSet; // true once Nucleo has been assigned; asserted by GetIsNucleo()\r
+\r
+// Local gaps\r
+ float LocalOpen; // returned by GetLocalOpen()\r
+ float LocalExt; // returned by GetLocalExt()\r
+\r
+// Global internal gaps (SetPenalties fills A from the query 'Q' outputs and B from the target 'T' outputs)\r
+ float OpenA;\r
+ float OpenB;\r
+\r
+ float ExtA;\r
+ float ExtB;\r
+\r
+// Global terminal gaps (L = left end, R = right end)\r
+ float LOpenA;\r
+ float LOpenB;\r
+ float ROpenA;\r
+ float ROpenB;\r
+\r
+ float LExtA;\r
+ float LExtB;\r
+ float RExtA;\r
+ float RExtB;\r
+\r
+ void Clear();\r
+ void SetLocal(float Open, float Ext);\r
+ void Init2(const float * const *Mx, float Open, float Ext);\r
+ void Init4(const float * const *Mx, float Open, float Ext, float TermOpen, float TermExt);\r
+ void Init(const AlnParams &AP, const HSPData &HSP, unsigned LA, unsigned LB);\r
+ void InitFromCmdLine(bool Nucleo); // Clear() + matrix + local and global gap scores from the opt_* globals\r
+ void SetMxFromCmdLine(bool Nucleo); // selects the matrix and sets SubstMx / SubstMxName\r
+ void SetPenalties(const string &OpenStr, const string &ExtStr); // parses gap spec strings into the global gap fields\r
+ float GetLocalOpen() const;\r
+ float GetLocalExt() const;\r
+ bool GetIsNucleo() const; // asserts NucleoSet\r
+\r
+ bool Is2() const; // selects the 2-value form in LogMe()\r
+ bool Is4() const; // selects the 4-value form in LogMe()\r
+ const char *GetType() const; // label printed by LogMe()\r
+\r
+ void LogMe() const;\r
+ };\r
+\r
+const float OBVIOUSLY_WRONG_PENALTY = 1000.0; // NOTE(review): no use visible in this part of the patch; presumably a sentinel for unset gap scores -- confirm\r
+\r
+#endif // alnparams_h\r
--- /dev/null
+// Generated by /p/py/alphac.py
+#include "alpha.h"
+
+unsigned g_CharToLetterAminoStop[256] =
+ {
+ INVALID_LETTER, // [ 0] 0x00
+ INVALID_LETTER, // [ 1] 0x01
+ INVALID_LETTER, // [ 2] 0x02
+ INVALID_LETTER, // [ 3] 0x03
+ INVALID_LETTER, // [ 4] 0x04
+ INVALID_LETTER, // [ 5] 0x05
+ INVALID_LETTER, // [ 6] 0x06
+ INVALID_LETTER, // [ 7] 0x07
+ INVALID_LETTER, // [ 8] 0x08
+ INVALID_LETTER, // [ 9] 0x09
+ INVALID_LETTER, // [ 10] 0x0a
+ INVALID_LETTER, // [ 11] 0x0b
+ INVALID_LETTER, // [ 12] 0x0c
+ INVALID_LETTER, // [ 13] 0x0d
+ INVALID_LETTER, // [ 14] 0x0e
+ INVALID_LETTER, // [ 15] 0x0f
+ INVALID_LETTER, // [ 16] 0x10
+ INVALID_LETTER, // [ 17] 0x11
+ INVALID_LETTER, // [ 18] 0x12
+ INVALID_LETTER, // [ 19] 0x13
+ INVALID_LETTER, // [ 20] 0x14
+ INVALID_LETTER, // [ 21] 0x15
+ INVALID_LETTER, // [ 22] 0x16
+ INVALID_LETTER, // [ 23] 0x17
+ INVALID_LETTER, // [ 24] 0x18
+ INVALID_LETTER, // [ 25] 0x19
+ INVALID_LETTER, // [ 26] 0x1a
+ INVALID_LETTER, // [ 27] 0x1b
+ INVALID_LETTER, // [ 28] 0x1c
+ INVALID_LETTER, // [ 29] 0x1d
+ INVALID_LETTER, // [ 30] 0x1e
+ INVALID_LETTER, // [ 31] 0x1f
+ INVALID_LETTER, // [ 32] ' '
+ INVALID_LETTER, // [ 33] '!'
+ INVALID_LETTER, // [ 34] '"'
+ INVALID_LETTER, // [ 35] '#'
+ INVALID_LETTER, // [ 36] '$'
+ INVALID_LETTER, // [ 37] '%'
+ INVALID_LETTER, // [ 38] '&'
+ INVALID_LETTER, // [ 39] '''
+ INVALID_LETTER, // [ 40] '('
+ INVALID_LETTER, // [ 41] ')'
+ 20 , // [ 42] '*' = STP
+ INVALID_LETTER, // [ 43] '+'
+ INVALID_LETTER, // [ 44] ','
+ INVALID_LETTER, // [ 45] '-'
+ INVALID_LETTER, // [ 46] '.'
+ INVALID_LETTER, // [ 47] '/'
+ INVALID_LETTER, // [ 48] '0'
+ INVALID_LETTER, // [ 49] '1'
+ INVALID_LETTER, // [ 50] '2'
+ INVALID_LETTER, // [ 51] '3'
+ INVALID_LETTER, // [ 52] '4'
+ INVALID_LETTER, // [ 53] '5'
+ INVALID_LETTER, // [ 54] '6'
+ INVALID_LETTER, // [ 55] '7'
+ INVALID_LETTER, // [ 56] '8'
+ INVALID_LETTER, // [ 57] '9'
+ INVALID_LETTER, // [ 58] ':'
+ INVALID_LETTER, // [ 59] ';'
+ INVALID_LETTER, // [ 60] '<'
+ INVALID_LETTER, // [ 61] '='
+ INVALID_LETTER, // [ 62] '>'
+ INVALID_LETTER, // [ 63] '?'
+ INVALID_LETTER, // [ 64] '@'
+ 0 , // [ 65] 'A' = Ala
+ INVALID_LETTER, // [ 66] 'B'
+ 1 , // [ 67] 'C' = Cys
+ 2 , // [ 68] 'D' = Asp
+ 3 , // [ 69] 'E' = Glu
+ 4 , // [ 70] 'F' = Phe
+ 5 , // [ 71] 'G' = Gly
+ 6 , // [ 72] 'H' = His
+ 7 , // [ 73] 'I' = Ile
+ INVALID_LETTER, // [ 74] 'J'
+ 8 , // [ 75] 'K' = Lys
+ 9 , // [ 76] 'L' = Leu
+ 10 , // [ 77] 'M' = Met
+ 11 , // [ 78] 'N' = Asn
+ INVALID_LETTER, // [ 79] 'O'
+ 12 , // [ 80] 'P' = Pro
+ 13 , // [ 81] 'Q' = Gln
+ 14 , // [ 82] 'R' = Arg
+ 15 , // [ 83] 'S' = Ser
+ 16 , // [ 84] 'T' = Thr
+ INVALID_LETTER, // [ 85] 'U'
+ 17 , // [ 86] 'V' = Val
+ 18 , // [ 87] 'W' = Trp
+ INVALID_LETTER, // [ 88] 'X'
+ 19 , // [ 89] 'Y' = Tyr
+ INVALID_LETTER, // [ 90] 'Z'
+ INVALID_LETTER, // [ 91] '['
+ INVALID_LETTER, // [ 92] '\'
+ INVALID_LETTER, // [ 93] ']'
+ INVALID_LETTER, // [ 94] '^'
+ INVALID_LETTER, // [ 95] '_'
+ INVALID_LETTER, // [ 96] '`'
+ 0 , // [ 97] 'a' = Ala
+ INVALID_LETTER, // [ 98] 'b'
+ 1 , // [ 99] 'c' = Cys
+ 2 , // [100] 'd' = Asp
+ 3 , // [101] 'e' = Glu
+ 4 , // [102] 'f' = Phe
+ 5 , // [103] 'g' = Gly
+ 6 , // [104] 'h' = His
+ 7 , // [105] 'i' = Ile
+ INVALID_LETTER, // [106] 'j'
+ 8 , // [107] 'k' = Lys
+ 9 , // [108] 'l' = Leu
+ 10 , // [109] 'm' = Met
+ 11 , // [110] 'n' = Asn
+ INVALID_LETTER, // [111] 'o'
+ 12 , // [112] 'p' = Pro
+ 13 , // [113] 'q' = Gln
+ 14 , // [114] 'r' = Arg
+ 15 , // [115] 's' = Ser
+ 16 , // [116] 't' = Thr
+ INVALID_LETTER, // [117] 'u'
+ 17 , // [118] 'v' = Val
+ 18 , // [119] 'w' = Trp
+ INVALID_LETTER, // [120] 'x'
+ 19 , // [121] 'y' = Tyr
+ INVALID_LETTER, // [122] 'z'
+ INVALID_LETTER, // [123] '{'
+ INVALID_LETTER, // [124] '|'
+ INVALID_LETTER, // [125] '}'
+ INVALID_LETTER, // [126] '~'
+ INVALID_LETTER, // [127] 0x7f
+ INVALID_LETTER, // [128] 0x80
+ INVALID_LETTER, // [129] 0x81
+ INVALID_LETTER, // [130] 0x82
+ INVALID_LETTER, // [131] 0x83
+ INVALID_LETTER, // [132] 0x84
+ INVALID_LETTER, // [133] 0x85
+ INVALID_LETTER, // [134] 0x86
+ INVALID_LETTER, // [135] 0x87
+ INVALID_LETTER, // [136] 0x88
+ INVALID_LETTER, // [137] 0x89
+ INVALID_LETTER, // [138] 0x8a
+ INVALID_LETTER, // [139] 0x8b
+ INVALID_LETTER, // [140] 0x8c
+ INVALID_LETTER, // [141] 0x8d
+ INVALID_LETTER, // [142] 0x8e
+ INVALID_LETTER, // [143] 0x8f
+ INVALID_LETTER, // [144] 0x90
+ INVALID_LETTER, // [145] 0x91
+ INVALID_LETTER, // [146] 0x92
+ INVALID_LETTER, // [147] 0x93
+ INVALID_LETTER, // [148] 0x94
+ INVALID_LETTER, // [149] 0x95
+ INVALID_LETTER, // [150] 0x96
+ INVALID_LETTER, // [151] 0x97
+ INVALID_LETTER, // [152] 0x98
+ INVALID_LETTER, // [153] 0x99
+ INVALID_LETTER, // [154] 0x9a
+ INVALID_LETTER, // [155] 0x9b
+ INVALID_LETTER, // [156] 0x9c
+ INVALID_LETTER, // [157] 0x9d
+ INVALID_LETTER, // [158] 0x9e
+ INVALID_LETTER, // [159] 0x9f
+ INVALID_LETTER, // [160] 0xa0
+ INVALID_LETTER, // [161] 0xa1
+ INVALID_LETTER, // [162] 0xa2
+ INVALID_LETTER, // [163] 0xa3
+ INVALID_LETTER, // [164] 0xa4
+ INVALID_LETTER, // [165] 0xa5
+ INVALID_LETTER, // [166] 0xa6
+ INVALID_LETTER, // [167] 0xa7
+ INVALID_LETTER, // [168] 0xa8
+ INVALID_LETTER, // [169] 0xa9
+ INVALID_LETTER, // [170] 0xaa
+ INVALID_LETTER, // [171] 0xab
+ INVALID_LETTER, // [172] 0xac
+ INVALID_LETTER, // [173] 0xad
+ INVALID_LETTER, // [174] 0xae
+ INVALID_LETTER, // [175] 0xaf
+ INVALID_LETTER, // [176] 0xb0
+ INVALID_LETTER, // [177] 0xb1
+ INVALID_LETTER, // [178] 0xb2
+ INVALID_LETTER, // [179] 0xb3
+ INVALID_LETTER, // [180] 0xb4
+ INVALID_LETTER, // [181] 0xb5
+ INVALID_LETTER, // [182] 0xb6
+ INVALID_LETTER, // [183] 0xb7
+ INVALID_LETTER, // [184] 0xb8
+ INVALID_LETTER, // [185] 0xb9
+ INVALID_LETTER, // [186] 0xba
+ INVALID_LETTER, // [187] 0xbb
+ INVALID_LETTER, // [188] 0xbc
+ INVALID_LETTER, // [189] 0xbd
+ INVALID_LETTER, // [190] 0xbe
+ INVALID_LETTER, // [191] 0xbf
+ INVALID_LETTER, // [192] 0xc0
+ INVALID_LETTER, // [193] 0xc1
+ INVALID_LETTER, // [194] 0xc2
+ INVALID_LETTER, // [195] 0xc3
+ INVALID_LETTER, // [196] 0xc4
+ INVALID_LETTER, // [197] 0xc5
+ INVALID_LETTER, // [198] 0xc6
+ INVALID_LETTER, // [199] 0xc7
+ INVALID_LETTER, // [200] 0xc8
+ INVALID_LETTER, // [201] 0xc9
+ INVALID_LETTER, // [202] 0xca
+ INVALID_LETTER, // [203] 0xcb
+ INVALID_LETTER, // [204] 0xcc
+ INVALID_LETTER, // [205] 0xcd
+ INVALID_LETTER, // [206] 0xce
+ INVALID_LETTER, // [207] 0xcf
+ INVALID_LETTER, // [208] 0xd0
+ INVALID_LETTER, // [209] 0xd1
+ INVALID_LETTER, // [210] 0xd2
+ INVALID_LETTER, // [211] 0xd3
+ INVALID_LETTER, // [212] 0xd4
+ INVALID_LETTER, // [213] 0xd5
+ INVALID_LETTER, // [214] 0xd6
+ INVALID_LETTER, // [215] 0xd7
+ INVALID_LETTER, // [216] 0xd8
+ INVALID_LETTER, // [217] 0xd9
+ INVALID_LETTER, // [218] 0xda
+ INVALID_LETTER, // [219] 0xdb
+ INVALID_LETTER, // [220] 0xdc
+ INVALID_LETTER, // [221] 0xdd
+ INVALID_LETTER, // [222] 0xde
+ INVALID_LETTER, // [223] 0xdf
+ INVALID_LETTER, // [224] 0xe0
+ INVALID_LETTER, // [225] 0xe1
+ INVALID_LETTER, // [226] 0xe2
+ INVALID_LETTER, // [227] 0xe3
+ INVALID_LETTER, // [228] 0xe4
+ INVALID_LETTER, // [229] 0xe5
+ INVALID_LETTER, // [230] 0xe6
+ INVALID_LETTER, // [231] 0xe7
+ INVALID_LETTER, // [232] 0xe8
+ INVALID_LETTER, // [233] 0xe9
+ INVALID_LETTER, // [234] 0xea
+ INVALID_LETTER, // [235] 0xeb
+ INVALID_LETTER, // [236] 0xec
+ INVALID_LETTER, // [237] 0xed
+ INVALID_LETTER, // [238] 0xee
+ INVALID_LETTER, // [239] 0xef
+ INVALID_LETTER, // [240] 0xf0
+ INVALID_LETTER, // [241] 0xf1
+ INVALID_LETTER, // [242] 0xf2
+ INVALID_LETTER, // [243] 0xf3
+ INVALID_LETTER, // [244] 0xf4
+ INVALID_LETTER, // [245] 0xf5
+ INVALID_LETTER, // [246] 0xf6
+ INVALID_LETTER, // [247] 0xf7
+ INVALID_LETTER, // [248] 0xf8
+ INVALID_LETTER, // [249] 0xf9
+ INVALID_LETTER, // [250] 0xfa
+ INVALID_LETTER, // [251] 0xfb
+ INVALID_LETTER, // [252] 0xfc
+ INVALID_LETTER, // [253] 0xfd
+ INVALID_LETTER, // [254] 0xfe
+ INVALID_LETTER, // [255] 0xff
+ };
+unsigned g_CharToLetterAmino[256] =
+ {
+ INVALID_LETTER, // [ 0] 0x00
+ INVALID_LETTER, // [ 1] 0x01
+ INVALID_LETTER, // [ 2] 0x02
+ INVALID_LETTER, // [ 3] 0x03
+ INVALID_LETTER, // [ 4] 0x04
+ INVALID_LETTER, // [ 5] 0x05
+ INVALID_LETTER, // [ 6] 0x06
+ INVALID_LETTER, // [ 7] 0x07
+ INVALID_LETTER, // [ 8] 0x08
+ INVALID_LETTER, // [ 9] 0x09
+ INVALID_LETTER, // [ 10] 0x0a
+ INVALID_LETTER, // [ 11] 0x0b
+ INVALID_LETTER, // [ 12] 0x0c
+ INVALID_LETTER, // [ 13] 0x0d
+ INVALID_LETTER, // [ 14] 0x0e
+ INVALID_LETTER, // [ 15] 0x0f
+ INVALID_LETTER, // [ 16] 0x10
+ INVALID_LETTER, // [ 17] 0x11
+ INVALID_LETTER, // [ 18] 0x12
+ INVALID_LETTER, // [ 19] 0x13
+ INVALID_LETTER, // [ 20] 0x14
+ INVALID_LETTER, // [ 21] 0x15
+ INVALID_LETTER, // [ 22] 0x16
+ INVALID_LETTER, // [ 23] 0x17
+ INVALID_LETTER, // [ 24] 0x18
+ INVALID_LETTER, // [ 25] 0x19
+ INVALID_LETTER, // [ 26] 0x1a
+ INVALID_LETTER, // [ 27] 0x1b
+ INVALID_LETTER, // [ 28] 0x1c
+ INVALID_LETTER, // [ 29] 0x1d
+ INVALID_LETTER, // [ 30] 0x1e
+ INVALID_LETTER, // [ 31] 0x1f
+ INVALID_LETTER, // [ 32] ' '
+ INVALID_LETTER, // [ 33] '!'
+ INVALID_LETTER, // [ 34] '"'
+ INVALID_LETTER, // [ 35] '#'
+ INVALID_LETTER, // [ 36] '$'
+ INVALID_LETTER, // [ 37] '%'
+ INVALID_LETTER, // [ 38] '&'
+ INVALID_LETTER, // [ 39] '''
+ INVALID_LETTER, // [ 40] '('
+ INVALID_LETTER, // [ 41] ')'
+ INVALID_LETTER, // [ 42] '*'
+ INVALID_LETTER, // [ 43] '+'
+ INVALID_LETTER, // [ 44] ','
+ INVALID_LETTER, // [ 45] '-'
+ INVALID_LETTER, // [ 46] '.'
+ INVALID_LETTER, // [ 47] '/'
+ INVALID_LETTER, // [ 48] '0'
+ INVALID_LETTER, // [ 49] '1'
+ INVALID_LETTER, // [ 50] '2'
+ INVALID_LETTER, // [ 51] '3'
+ INVALID_LETTER, // [ 52] '4'
+ INVALID_LETTER, // [ 53] '5'
+ INVALID_LETTER, // [ 54] '6'
+ INVALID_LETTER, // [ 55] '7'
+ INVALID_LETTER, // [ 56] '8'
+ INVALID_LETTER, // [ 57] '9'
+ INVALID_LETTER, // [ 58] ':'
+ INVALID_LETTER, // [ 59] ';'
+ INVALID_LETTER, // [ 60] '<'
+ INVALID_LETTER, // [ 61] '='
+ INVALID_LETTER, // [ 62] '>'
+ INVALID_LETTER, // [ 63] '?'
+ INVALID_LETTER, // [ 64] '@'
+ 0 , // [ 65] 'A' = Ala
+ INVALID_LETTER, // [ 66] 'B'
+ 1 , // [ 67] 'C' = Cys
+ 2 , // [ 68] 'D' = Asp
+ 3 , // [ 69] 'E' = Glu
+ 4 , // [ 70] 'F' = Phe
+ 5 , // [ 71] 'G' = Gly
+ 6 , // [ 72] 'H' = His
+ 7 , // [ 73] 'I' = Ile
+ INVALID_LETTER, // [ 74] 'J'
+ 8 , // [ 75] 'K' = Lys
+ 9 , // [ 76] 'L' = Leu
+ 10 , // [ 77] 'M' = Met
+ 11 , // [ 78] 'N' = Asn
+ INVALID_LETTER, // [ 79] 'O'
+ 12 , // [ 80] 'P' = Pro
+ 13 , // [ 81] 'Q' = Gln
+ 14 , // [ 82] 'R' = Arg
+ 15 , // [ 83] 'S' = Ser
+ 16 , // [ 84] 'T' = Thr
+ INVALID_LETTER, // [ 85] 'U'
+ 17 , // [ 86] 'V' = Val
+ 18 , // [ 87] 'W' = Trp
+ INVALID_LETTER, // [ 88] 'X'
+ 19 , // [ 89] 'Y' = Tyr
+ INVALID_LETTER, // [ 90] 'Z'
+ INVALID_LETTER, // [ 91] '['
+ INVALID_LETTER, // [ 92] '\'
+ INVALID_LETTER, // [ 93] ']'
+ INVALID_LETTER, // [ 94] '^'
+ INVALID_LETTER, // [ 95] '_'
+ INVALID_LETTER, // [ 96] '`'
+ 0 , // [ 97] 'a' = Ala
+ INVALID_LETTER, // [ 98] 'b'
+ 1 , // [ 99] 'c' = Cys
+ 2 , // [100] 'd' = Asp
+ 3 , // [101] 'e' = Glu
+ 4 , // [102] 'f' = Phe
+ 5 , // [103] 'g' = Gly
+ 6 , // [104] 'h' = His
+ 7 , // [105] 'i' = Ile
+ INVALID_LETTER, // [106] 'j'
+ 8 , // [107] 'k' = Lys
+ 9 , // [108] 'l' = Leu
+ 10 , // [109] 'm' = Met
+ 11 , // [110] 'n' = Asn
+ INVALID_LETTER, // [111] 'o'
+ 12 , // [112] 'p' = Pro
+ 13 , // [113] 'q' = Gln
+ 14 , // [114] 'r' = Arg
+ 15 , // [115] 's' = Ser
+ 16 , // [116] 't' = Thr
+ INVALID_LETTER, // [117] 'u'
+ 17 , // [118] 'v' = Val
+ 18 , // [119] 'w' = Trp
+ INVALID_LETTER, // [120] 'x'
+ 19 , // [121] 'y' = Tyr
+ INVALID_LETTER, // [122] 'z'
+ INVALID_LETTER, // [123] '{'
+ INVALID_LETTER, // [124] '|'
+ INVALID_LETTER, // [125] '}'
+ INVALID_LETTER, // [126] '~'
+ INVALID_LETTER, // [127] 0x7f
+ INVALID_LETTER, // [128] 0x80
+ INVALID_LETTER, // [129] 0x81
+ INVALID_LETTER, // [130] 0x82
+ INVALID_LETTER, // [131] 0x83
+ INVALID_LETTER, // [132] 0x84
+ INVALID_LETTER, // [133] 0x85
+ INVALID_LETTER, // [134] 0x86
+ INVALID_LETTER, // [135] 0x87
+ INVALID_LETTER, // [136] 0x88
+ INVALID_LETTER, // [137] 0x89
+ INVALID_LETTER, // [138] 0x8a
+ INVALID_LETTER, // [139] 0x8b
+ INVALID_LETTER, // [140] 0x8c
+ INVALID_LETTER, // [141] 0x8d
+ INVALID_LETTER, // [142] 0x8e
+ INVALID_LETTER, // [143] 0x8f
+ INVALID_LETTER, // [144] 0x90
+ INVALID_LETTER, // [145] 0x91
+ INVALID_LETTER, // [146] 0x92
+ INVALID_LETTER, // [147] 0x93
+ INVALID_LETTER, // [148] 0x94
+ INVALID_LETTER, // [149] 0x95
+ INVALID_LETTER, // [150] 0x96
+ INVALID_LETTER, // [151] 0x97
+ INVALID_LETTER, // [152] 0x98
+ INVALID_LETTER, // [153] 0x99
+ INVALID_LETTER, // [154] 0x9a
+ INVALID_LETTER, // [155] 0x9b
+ INVALID_LETTER, // [156] 0x9c
+ INVALID_LETTER, // [157] 0x9d
+ INVALID_LETTER, // [158] 0x9e
+ INVALID_LETTER, // [159] 0x9f
+ INVALID_LETTER, // [160] 0xa0
+ INVALID_LETTER, // [161] 0xa1
+ INVALID_LETTER, // [162] 0xa2
+ INVALID_LETTER, // [163] 0xa3
+ INVALID_LETTER, // [164] 0xa4
+ INVALID_LETTER, // [165] 0xa5
+ INVALID_LETTER, // [166] 0xa6
+ INVALID_LETTER, // [167] 0xa7
+ INVALID_LETTER, // [168] 0xa8
+ INVALID_LETTER, // [169] 0xa9
+ INVALID_LETTER, // [170] 0xaa
+ INVALID_LETTER, // [171] 0xab
+ INVALID_LETTER, // [172] 0xac
+ INVALID_LETTER, // [173] 0xad
+ INVALID_LETTER, // [174] 0xae
+ INVALID_LETTER, // [175] 0xaf
+ INVALID_LETTER, // [176] 0xb0
+ INVALID_LETTER, // [177] 0xb1
+ INVALID_LETTER, // [178] 0xb2
+ INVALID_LETTER, // [179] 0xb3
+ INVALID_LETTER, // [180] 0xb4
+ INVALID_LETTER, // [181] 0xb5
+ INVALID_LETTER, // [182] 0xb6
+ INVALID_LETTER, // [183] 0xb7
+ INVALID_LETTER, // [184] 0xb8
+ INVALID_LETTER, // [185] 0xb9
+ INVALID_LETTER, // [186] 0xba
+ INVALID_LETTER, // [187] 0xbb
+ INVALID_LETTER, // [188] 0xbc
+ INVALID_LETTER, // [189] 0xbd
+ INVALID_LETTER, // [190] 0xbe
+ INVALID_LETTER, // [191] 0xbf
+ INVALID_LETTER, // [192] 0xc0
+ INVALID_LETTER, // [193] 0xc1
+ INVALID_LETTER, // [194] 0xc2
+ INVALID_LETTER, // [195] 0xc3
+ INVALID_LETTER, // [196] 0xc4
+ INVALID_LETTER, // [197] 0xc5
+ INVALID_LETTER, // [198] 0xc6
+ INVALID_LETTER, // [199] 0xc7
+ INVALID_LETTER, // [200] 0xc8
+ INVALID_LETTER, // [201] 0xc9
+ INVALID_LETTER, // [202] 0xca
+ INVALID_LETTER, // [203] 0xcb
+ INVALID_LETTER, // [204] 0xcc
+ INVALID_LETTER, // [205] 0xcd
+ INVALID_LETTER, // [206] 0xce
+ INVALID_LETTER, // [207] 0xcf
+ INVALID_LETTER, // [208] 0xd0
+ INVALID_LETTER, // [209] 0xd1
+ INVALID_LETTER, // [210] 0xd2
+ INVALID_LETTER, // [211] 0xd3
+ INVALID_LETTER, // [212] 0xd4
+ INVALID_LETTER, // [213] 0xd5
+ INVALID_LETTER, // [214] 0xd6
+ INVALID_LETTER, // [215] 0xd7
+ INVALID_LETTER, // [216] 0xd8
+ INVALID_LETTER, // [217] 0xd9
+ INVALID_LETTER, // [218] 0xda
+ INVALID_LETTER, // [219] 0xdb
+ INVALID_LETTER, // [220] 0xdc
+ INVALID_LETTER, // [221] 0xdd
+ INVALID_LETTER, // [222] 0xde
+ INVALID_LETTER, // [223] 0xdf
+ INVALID_LETTER, // [224] 0xe0
+ INVALID_LETTER, // [225] 0xe1
+ INVALID_LETTER, // [226] 0xe2
+ INVALID_LETTER, // [227] 0xe3
+ INVALID_LETTER, // [228] 0xe4
+ INVALID_LETTER, // [229] 0xe5
+ INVALID_LETTER, // [230] 0xe6
+ INVALID_LETTER, // [231] 0xe7
+ INVALID_LETTER, // [232] 0xe8
+ INVALID_LETTER, // [233] 0xe9
+ INVALID_LETTER, // [234] 0xea
+ INVALID_LETTER, // [235] 0xeb
+ INVALID_LETTER, // [236] 0xec
+ INVALID_LETTER, // [237] 0xed
+ INVALID_LETTER, // [238] 0xee
+ INVALID_LETTER, // [239] 0xef
+ INVALID_LETTER, // [240] 0xf0
+ INVALID_LETTER, // [241] 0xf1
+ INVALID_LETTER, // [242] 0xf2
+ INVALID_LETTER, // [243] 0xf3
+ INVALID_LETTER, // [244] 0xf4
+ INVALID_LETTER, // [245] 0xf5
+ INVALID_LETTER, // [246] 0xf6
+ INVALID_LETTER, // [247] 0xf7
+ INVALID_LETTER, // [248] 0xf8
+ INVALID_LETTER, // [249] 0xf9
+ INVALID_LETTER, // [250] 0xfa
+ INVALID_LETTER, // [251] 0xfb
+ INVALID_LETTER, // [252] 0xfc
+ INVALID_LETTER, // [253] 0xfd
+ INVALID_LETTER, // [254] 0xfe
+ INVALID_LETTER, // [255] 0xff
+ };
+
+unsigned char g_LetterToCharAmino[256] =
+ {
+ 'A', // [0]
+ 'C', // [1]
+ 'D', // [2]
+ 'E', // [3]
+ 'F', // [4]
+ 'G', // [5]
+ 'H', // [6]
+ 'I', // [7]
+ 'K', // [8]
+ 'L', // [9]
+ 'M', // [10]
+ 'N', // [11]
+ 'P', // [12]
+ 'Q', // [13]
+ 'R', // [14]
+ 'S', // [15]
+ 'T', // [16]
+ 'V', // [17]
+ 'W', // [18]
+ 'Y', // [19]
+ '*', // [20]
+ INVALID_CHAR, // [21]
+ INVALID_CHAR, // [22]
+ INVALID_CHAR, // [23]
+ INVALID_CHAR, // [24]
+ INVALID_CHAR, // [25]
+ INVALID_CHAR, // [26]
+ INVALID_CHAR, // [27]
+ INVALID_CHAR, // [28]
+ INVALID_CHAR, // [29]
+ INVALID_CHAR, // [30]
+ INVALID_CHAR, // [31]
+ INVALID_CHAR, // [32]
+ INVALID_CHAR, // [33]
+ INVALID_CHAR, // [34]
+ INVALID_CHAR, // [35]
+ INVALID_CHAR, // [36]
+ INVALID_CHAR, // [37]
+ INVALID_CHAR, // [38]
+ INVALID_CHAR, // [39]
+ INVALID_CHAR, // [40]
+ INVALID_CHAR, // [41]
+ INVALID_CHAR, // [42]
+ INVALID_CHAR, // [43]
+ INVALID_CHAR, // [44]
+ INVALID_CHAR, // [45]
+ INVALID_CHAR, // [46]
+ INVALID_CHAR, // [47]
+ INVALID_CHAR, // [48]
+ INVALID_CHAR, // [49]
+ INVALID_CHAR, // [50]
+ INVALID_CHAR, // [51]
+ INVALID_CHAR, // [52]
+ INVALID_CHAR, // [53]
+ INVALID_CHAR, // [54]
+ INVALID_CHAR, // [55]
+ INVALID_CHAR, // [56]
+ INVALID_CHAR, // [57]
+ INVALID_CHAR, // [58]
+ INVALID_CHAR, // [59]
+ INVALID_CHAR, // [60]
+ INVALID_CHAR, // [61]
+ INVALID_CHAR, // [62]
+ INVALID_CHAR, // [63]
+ INVALID_CHAR, // [64]
+ INVALID_CHAR, // [65]
+ INVALID_CHAR, // [66]
+ INVALID_CHAR, // [67]
+ INVALID_CHAR, // [68]
+ INVALID_CHAR, // [69]
+ INVALID_CHAR, // [70]
+ INVALID_CHAR, // [71]
+ INVALID_CHAR, // [72]
+ INVALID_CHAR, // [73]
+ INVALID_CHAR, // [74]
+ INVALID_CHAR, // [75]
+ INVALID_CHAR, // [76]
+ INVALID_CHAR, // [77]
+ INVALID_CHAR, // [78]
+ INVALID_CHAR, // [79]
+ INVALID_CHAR, // [80]
+ INVALID_CHAR, // [81]
+ INVALID_CHAR, // [82]
+ INVALID_CHAR, // [83]
+ INVALID_CHAR, // [84]
+ INVALID_CHAR, // [85]
+ INVALID_CHAR, // [86]
+ INVALID_CHAR, // [87]
+ INVALID_CHAR, // [88]
+ INVALID_CHAR, // [89]
+ INVALID_CHAR, // [90]
+ INVALID_CHAR, // [91]
+ INVALID_CHAR, // [92]
+ INVALID_CHAR, // [93]
+ INVALID_CHAR, // [94]
+ INVALID_CHAR, // [95]
+ INVALID_CHAR, // [96]
+ INVALID_CHAR, // [97]
+ INVALID_CHAR, // [98]
+ INVALID_CHAR, // [99]
+ INVALID_CHAR, // [100]
+ INVALID_CHAR, // [101]
+ INVALID_CHAR, // [102]
+ INVALID_CHAR, // [103]
+ INVALID_CHAR, // [104]
+ INVALID_CHAR, // [105]
+ INVALID_CHAR, // [106]
+ INVALID_CHAR, // [107]
+ INVALID_CHAR, // [108]
+ INVALID_CHAR, // [109]
+ INVALID_CHAR, // [110]
+ INVALID_CHAR, // [111]
+ INVALID_CHAR, // [112]
+ INVALID_CHAR, // [113]
+ INVALID_CHAR, // [114]
+ INVALID_CHAR, // [115]
+ INVALID_CHAR, // [116]
+ INVALID_CHAR, // [117]
+ INVALID_CHAR, // [118]
+ INVALID_CHAR, // [119]
+ INVALID_CHAR, // [120]
+ INVALID_CHAR, // [121]
+ INVALID_CHAR, // [122]
+ INVALID_CHAR, // [123]
+ INVALID_CHAR, // [124]
+ INVALID_CHAR, // [125]
+ INVALID_CHAR, // [126]
+ INVALID_CHAR, // [127]
+ INVALID_CHAR, // [128]
+ INVALID_CHAR, // [129]
+ INVALID_CHAR, // [130]
+ INVALID_CHAR, // [131]
+ INVALID_CHAR, // [132]
+ INVALID_CHAR, // [133]
+ INVALID_CHAR, // [134]
+ INVALID_CHAR, // [135]
+ INVALID_CHAR, // [136]
+ INVALID_CHAR, // [137]
+ INVALID_CHAR, // [138]
+ INVALID_CHAR, // [139]
+ INVALID_CHAR, // [140]
+ INVALID_CHAR, // [141]
+ INVALID_CHAR, // [142]
+ INVALID_CHAR, // [143]
+ INVALID_CHAR, // [144]
+ INVALID_CHAR, // [145]
+ INVALID_CHAR, // [146]
+ INVALID_CHAR, // [147]
+ INVALID_CHAR, // [148]
+ INVALID_CHAR, // [149]
+ INVALID_CHAR, // [150]
+ INVALID_CHAR, // [151]
+ INVALID_CHAR, // [152]
+ INVALID_CHAR, // [153]
+ INVALID_CHAR, // [154]
+ INVALID_CHAR, // [155]
+ INVALID_CHAR, // [156]
+ INVALID_CHAR, // [157]
+ INVALID_CHAR, // [158]
+ INVALID_CHAR, // [159]
+ INVALID_CHAR, // [160]
+ INVALID_CHAR, // [161]
+ INVALID_CHAR, // [162]
+ INVALID_CHAR, // [163]
+ INVALID_CHAR, // [164]
+ INVALID_CHAR, // [165]
+ INVALID_CHAR, // [166]
+ INVALID_CHAR, // [167]
+ INVALID_CHAR, // [168]
+ INVALID_CHAR, // [169]
+ INVALID_CHAR, // [170]
+ INVALID_CHAR, // [171]
+ INVALID_CHAR, // [172]
+ INVALID_CHAR, // [173]
+ INVALID_CHAR, // [174]
+ INVALID_CHAR, // [175]
+ INVALID_CHAR, // [176]
+ INVALID_CHAR, // [177]
+ INVALID_CHAR, // [178]
+ INVALID_CHAR, // [179]
+ INVALID_CHAR, // [180]
+ INVALID_CHAR, // [181]
+ INVALID_CHAR, // [182]
+ INVALID_CHAR, // [183]
+ INVALID_CHAR, // [184]
+ INVALID_CHAR, // [185]
+ INVALID_CHAR, // [186]
+ INVALID_CHAR, // [187]
+ INVALID_CHAR, // [188]
+ INVALID_CHAR, // [189]
+ INVALID_CHAR, // [190]
+ INVALID_CHAR, // [191]
+ INVALID_CHAR, // [192]
+ INVALID_CHAR, // [193]
+ INVALID_CHAR, // [194]
+ INVALID_CHAR, // [195]
+ INVALID_CHAR, // [196]
+ INVALID_CHAR, // [197]
+ INVALID_CHAR, // [198]
+ INVALID_CHAR, // [199]
+ INVALID_CHAR, // [200]
+ INVALID_CHAR, // [201]
+ INVALID_CHAR, // [202]
+ INVALID_CHAR, // [203]
+ INVALID_CHAR, // [204]
+ INVALID_CHAR, // [205]
+ INVALID_CHAR, // [206]
+ INVALID_CHAR, // [207]
+ INVALID_CHAR, // [208]
+ INVALID_CHAR, // [209]
+ INVALID_CHAR, // [210]
+ INVALID_CHAR, // [211]
+ INVALID_CHAR, // [212]
+ INVALID_CHAR, // [213]
+ INVALID_CHAR, // [214]
+ INVALID_CHAR, // [215]
+ INVALID_CHAR, // [216]
+ INVALID_CHAR, // [217]
+ INVALID_CHAR, // [218]
+ INVALID_CHAR, // [219]
+ INVALID_CHAR, // [220]
+ INVALID_CHAR, // [221]
+ INVALID_CHAR, // [222]
+ INVALID_CHAR, // [223]
+ INVALID_CHAR, // [224]
+ INVALID_CHAR, // [225]
+ INVALID_CHAR, // [226]
+ INVALID_CHAR, // [227]
+ INVALID_CHAR, // [228]
+ INVALID_CHAR, // [229]
+ INVALID_CHAR, // [230]
+ INVALID_CHAR, // [231]
+ INVALID_CHAR, // [232]
+ INVALID_CHAR, // [233]
+ INVALID_CHAR, // [234]
+ INVALID_CHAR, // [235]
+ INVALID_CHAR, // [236]
+ INVALID_CHAR, // [237]
+ INVALID_CHAR, // [238]
+ INVALID_CHAR, // [239]
+ INVALID_CHAR, // [240]
+ INVALID_CHAR, // [241]
+ INVALID_CHAR, // [242]
+ INVALID_CHAR, // [243]
+ INVALID_CHAR, // [244]
+ INVALID_CHAR, // [245]
+ INVALID_CHAR, // [246]
+ INVALID_CHAR, // [247]
+ INVALID_CHAR, // [248]
+ INVALID_CHAR, // [249]
+ INVALID_CHAR, // [250]
+ INVALID_CHAR, // [251]
+ INVALID_CHAR, // [252]
+ INVALID_CHAR, // [253]
+ INVALID_CHAR, // [254]
+ INVALID_CHAR, // [255]
+ };
+
+unsigned g_CharToLetterNucleo[256] =
+ {
+ INVALID_LETTER, // [ 0] = 0x00
+ INVALID_LETTER, // [ 1] = 0x01
+ INVALID_LETTER, // [ 2] = 0x02
+ INVALID_LETTER, // [ 3] = 0x03
+ INVALID_LETTER, // [ 4] = 0x04
+ INVALID_LETTER, // [ 5] = 0x05
+ INVALID_LETTER, // [ 6] = 0x06
+ INVALID_LETTER, // [ 7] = 0x07
+ INVALID_LETTER, // [ 8] = 0x08
+ INVALID_LETTER, // [ 9] = 0x09
+ INVALID_LETTER, // [ 10] = 0x0a
+ INVALID_LETTER, // [ 11] = 0x0b
+ INVALID_LETTER, // [ 12] = 0x0c
+ INVALID_LETTER, // [ 13] = 0x0d
+ INVALID_LETTER, // [ 14] = 0x0e
+ INVALID_LETTER, // [ 15] = 0x0f
+ INVALID_LETTER, // [ 16] = 0x10
+ INVALID_LETTER, // [ 17] = 0x11
+ INVALID_LETTER, // [ 18] = 0x12
+ INVALID_LETTER, // [ 19] = 0x13
+ INVALID_LETTER, // [ 20] = 0x14
+ INVALID_LETTER, // [ 21] = 0x15
+ INVALID_LETTER, // [ 22] = 0x16
+ INVALID_LETTER, // [ 23] = 0x17
+ INVALID_LETTER, // [ 24] = 0x18
+ INVALID_LETTER, // [ 25] = 0x19
+ INVALID_LETTER, // [ 26] = 0x1a
+ INVALID_LETTER, // [ 27] = 0x1b
+ INVALID_LETTER, // [ 28] = 0x1c
+ INVALID_LETTER, // [ 29] = 0x1d
+ INVALID_LETTER, // [ 30] = 0x1e
+ INVALID_LETTER, // [ 31] = 0x1f
+ INVALID_LETTER, // [ 32] = 32
+ INVALID_LETTER, // [ 33] = 33
+ INVALID_LETTER, // [ 34] = 34
+ INVALID_LETTER, // [ 35] = 35
+ INVALID_LETTER, // [ 36] = 36
+ INVALID_LETTER, // [ 37] = 37
+ INVALID_LETTER, // [ 38] = 38
+ INVALID_LETTER, // [ 39] = 39
+ INVALID_LETTER, // [ 40] = 40
+ INVALID_LETTER, // [ 41] = 41
+ INVALID_LETTER, // [ 42] = 42
+ INVALID_LETTER, // [ 43] = 43
+ INVALID_LETTER, // [ 44] = 44
+ INVALID_LETTER, // [ 45] = 45
+ INVALID_LETTER, // [ 46] = 46
+ INVALID_LETTER, // [ 47] = 47
+ INVALID_LETTER, // [ 48] = 48
+ INVALID_LETTER, // [ 49] = 49
+ INVALID_LETTER, // [ 50] = 50
+ INVALID_LETTER, // [ 51] = 51
+ INVALID_LETTER, // [ 52] = 52
+ INVALID_LETTER, // [ 53] = 53
+ INVALID_LETTER, // [ 54] = 54
+ INVALID_LETTER, // [ 55] = 55
+ INVALID_LETTER, // [ 56] = 56
+ INVALID_LETTER, // [ 57] = 57
+ INVALID_LETTER, // [ 58] = 58
+ INVALID_LETTER, // [ 59] = 59
+ INVALID_LETTER, // [ 60] = 60
+ INVALID_LETTER, // [ 61] = 61
+ INVALID_LETTER, // [ 62] = 62
+ INVALID_LETTER, // [ 63] = 63
+ INVALID_LETTER, // [ 64] = 64
+ 0 , // [ 65] = A (Nucleotide)
+ INVALID_LETTER, // [ 66] = 66
+ 1 , // [ 67] = C (Nucleotide)
+ INVALID_LETTER, // [ 68] = 68
+ INVALID_LETTER, // [ 69] = 69
+ INVALID_LETTER, // [ 70] = 70
+ 2 , // [ 71] = G (Nucleotide)
+ INVALID_LETTER, // [ 72] = 72
+ INVALID_LETTER, // [ 73] = 73
+ INVALID_LETTER, // [ 74] = 74
+ INVALID_LETTER, // [ 75] = 75
+ INVALID_LETTER, // [ 76] = 76
+ INVALID_LETTER, // [ 77] = 77
+ INVALID_LETTER, // [ 78] = 78
+ INVALID_LETTER, // [ 79] = 79
+ INVALID_LETTER, // [ 80] = 80
+ INVALID_LETTER, // [ 81] = 81
+ INVALID_LETTER, // [ 82] = 82
+ INVALID_LETTER, // [ 83] = 83
+ 3 , // [ 84] = T (Nucleotide)
+ 3 , // [ 85] = U (Nucleotide)
+ INVALID_LETTER, // [ 86] = 86
+ INVALID_LETTER, // [ 87] = 87
+ INVALID_LETTER, // [ 88] = 88
+ INVALID_LETTER, // [ 89] = 89
+ INVALID_LETTER, // [ 90] = 90
+ INVALID_LETTER, // [ 91] = 91
+ INVALID_LETTER, // [ 92] = 92
+ INVALID_LETTER, // [ 93] = 93
+ INVALID_LETTER, // [ 94] = 94
+ INVALID_LETTER, // [ 95] = 95
+ INVALID_LETTER, // [ 96] = 96
+ 0 , // [ 97] = a (Nucleotide)
+ INVALID_LETTER, // [ 98] = 98
+ 1 , // [ 99] = c (Nucleotide)
+ INVALID_LETTER, // [100] = 100
+ INVALID_LETTER, // [101] = 101
+ INVALID_LETTER, // [102] = 102
+ 2 , // [103] = g (Nucleotide)
+ INVALID_LETTER, // [104] = 104
+ INVALID_LETTER, // [105] = 105
+ INVALID_LETTER, // [106] = 106
+ INVALID_LETTER, // [107] = 107
+ INVALID_LETTER, // [108] = 108
+ INVALID_LETTER, // [109] = 109
+ INVALID_LETTER, // [110] = 110
+ INVALID_LETTER, // [111] = 111
+ INVALID_LETTER, // [112] = 112
+ INVALID_LETTER, // [113] = 113
+ INVALID_LETTER, // [114] = 114
+ INVALID_LETTER, // [115] = 115
+ 3 , // [116] = t (Nucleotide)
+ 3 , // [117] = u (Nucleotide)
+ INVALID_LETTER, // [118] = 118
+ INVALID_LETTER, // [119] = 119
+ INVALID_LETTER, // [120] = 120
+ INVALID_LETTER, // [121] = 121
+ INVALID_LETTER, // [122] = 122
+ INVALID_LETTER, // [123] = 123
+ INVALID_LETTER, // [124] = 124
+ INVALID_LETTER, // [125] = 125
+ INVALID_LETTER, // [126] = 126
+ INVALID_LETTER, // [127] = 0x7f
+ INVALID_LETTER, // [128] = 0x80
+ INVALID_LETTER, // [129] = 0x81
+ INVALID_LETTER, // [130] = 0x82
+ INVALID_LETTER, // [131] = 0x83
+ INVALID_LETTER, // [132] = 0x84
+ INVALID_LETTER, // [133] = 0x85
+ INVALID_LETTER, // [134] = 0x86
+ INVALID_LETTER, // [135] = 0x87
+ INVALID_LETTER, // [136] = 0x88
+ INVALID_LETTER, // [137] = 0x89
+ INVALID_LETTER, // [138] = 0x8a
+ INVALID_LETTER, // [139] = 0x8b
+ INVALID_LETTER, // [140] = 0x8c
+ INVALID_LETTER, // [141] = 0x8d
+ INVALID_LETTER, // [142] = 0x8e
+ INVALID_LETTER, // [143] = 0x8f
+ INVALID_LETTER, // [144] = 0x90
+ INVALID_LETTER, // [145] = 0x91
+ INVALID_LETTER, // [146] = 0x92
+ INVALID_LETTER, // [147] = 0x93
+ INVALID_LETTER, // [148] = 0x94
+ INVALID_LETTER, // [149] = 0x95
+ INVALID_LETTER, // [150] = 0x96
+ INVALID_LETTER, // [151] = 0x97
+ INVALID_LETTER, // [152] = 0x98
+ INVALID_LETTER, // [153] = 0x99
+ INVALID_LETTER, // [154] = 0x9a
+ INVALID_LETTER, // [155] = 0x9b
+ INVALID_LETTER, // [156] = 0x9c
+ INVALID_LETTER, // [157] = 0x9d
+ INVALID_LETTER, // [158] = 0x9e
+ INVALID_LETTER, // [159] = 0x9f
+ INVALID_LETTER, // [160] = 0xa0
+ INVALID_LETTER, // [161] = 0xa1
+ INVALID_LETTER, // [162] = 0xa2
+ INVALID_LETTER, // [163] = 0xa3
+ INVALID_LETTER, // [164] = 0xa4
+ INVALID_LETTER, // [165] = 0xa5
+ INVALID_LETTER, // [166] = 0xa6
+ INVALID_LETTER, // [167] = 0xa7
+ INVALID_LETTER, // [168] = 0xa8
+ INVALID_LETTER, // [169] = 0xa9
+ INVALID_LETTER, // [170] = 0xaa
+ INVALID_LETTER, // [171] = 0xab
+ INVALID_LETTER, // [172] = 0xac
+ INVALID_LETTER, // [173] = 0xad
+ INVALID_LETTER, // [174] = 0xae
+ INVALID_LETTER, // [175] = 0xaf
+ INVALID_LETTER, // [176] = 0xb0
+ INVALID_LETTER, // [177] = 0xb1
+ INVALID_LETTER, // [178] = 0xb2
+ INVALID_LETTER, // [179] = 0xb3
+ INVALID_LETTER, // [180] = 0xb4
+ INVALID_LETTER, // [181] = 0xb5
+ INVALID_LETTER, // [182] = 0xb6
+ INVALID_LETTER, // [183] = 0xb7
+ INVALID_LETTER, // [184] = 0xb8
+ INVALID_LETTER, // [185] = 0xb9
+ INVALID_LETTER, // [186] = 0xba
+ INVALID_LETTER, // [187] = 0xbb
+ INVALID_LETTER, // [188] = 0xbc
+ INVALID_LETTER, // [189] = 0xbd
+ INVALID_LETTER, // [190] = 0xbe
+ INVALID_LETTER, // [191] = 0xbf
+ INVALID_LETTER, // [192] = 0xc0
+ INVALID_LETTER, // [193] = 0xc1
+ INVALID_LETTER, // [194] = 0xc2
+ INVALID_LETTER, // [195] = 0xc3
+ INVALID_LETTER, // [196] = 0xc4
+ INVALID_LETTER, // [197] = 0xc5
+ INVALID_LETTER, // [198] = 0xc6
+ INVALID_LETTER, // [199] = 0xc7
+ INVALID_LETTER, // [200] = 0xc8
+ INVALID_LETTER, // [201] = 0xc9
+ INVALID_LETTER, // [202] = 0xca
+ INVALID_LETTER, // [203] = 0xcb
+ INVALID_LETTER, // [204] = 0xcc
+ INVALID_LETTER, // [205] = 0xcd
+ INVALID_LETTER, // [206] = 0xce
+ INVALID_LETTER, // [207] = 0xcf
+ INVALID_LETTER, // [208] = 0xd0
+ INVALID_LETTER, // [209] = 0xd1
+ INVALID_LETTER, // [210] = 0xd2
+ INVALID_LETTER, // [211] = 0xd3
+ INVALID_LETTER, // [212] = 0xd4
+ INVALID_LETTER, // [213] = 0xd5
+ INVALID_LETTER, // [214] = 0xd6
+ INVALID_LETTER, // [215] = 0xd7
+ INVALID_LETTER, // [216] = 0xd8
+ INVALID_LETTER, // [217] = 0xd9
+ INVALID_LETTER, // [218] = 0xda
+ INVALID_LETTER, // [219] = 0xdb
+ INVALID_LETTER, // [220] = 0xdc
+ INVALID_LETTER, // [221] = 0xdd
+ INVALID_LETTER, // [222] = 0xde
+ INVALID_LETTER, // [223] = 0xdf
+ INVALID_LETTER, // [224] = 0xe0
+ INVALID_LETTER, // [225] = 0xe1
+ INVALID_LETTER, // [226] = 0xe2
+ INVALID_LETTER, // [227] = 0xe3
+ INVALID_LETTER, // [228] = 0xe4
+ INVALID_LETTER, // [229] = 0xe5
+ INVALID_LETTER, // [230] = 0xe6
+ INVALID_LETTER, // [231] = 0xe7
+ INVALID_LETTER, // [232] = 0xe8
+ INVALID_LETTER, // [233] = 0xe9
+ INVALID_LETTER, // [234] = 0xea
+ INVALID_LETTER, // [235] = 0xeb
+ INVALID_LETTER, // [236] = 0xec
+ INVALID_LETTER, // [237] = 0xed
+ INVALID_LETTER, // [238] = 0xee
+ INVALID_LETTER, // [239] = 0xef
+ INVALID_LETTER, // [240] = 0xf0
+ INVALID_LETTER, // [241] = 0xf1
+ INVALID_LETTER, // [242] = 0xf2
+ INVALID_LETTER, // [243] = 0xf3
+ INVALID_LETTER, // [244] = 0xf4
+ INVALID_LETTER, // [245] = 0xf5
+ INVALID_LETTER, // [246] = 0xf6
+ INVALID_LETTER, // [247] = 0xf7
+ INVALID_LETTER, // [248] = 0xf8
+ INVALID_LETTER, // [249] = 0xf9
+ INVALID_LETTER, // [250] = 0xfa
+ INVALID_LETTER, // [251] = 0xfb
+ INVALID_LETTER, // [252] = 0xfc
+ INVALID_LETTER, // [253] = 0xfd
+ INVALID_LETTER, // [254] = 0xfe
+ INVALID_LETTER, // [255] = 0xff
+ };
+
+unsigned char g_LetterToCharNucleo[256] = // Decodes a nucleotide letter code (the array index) to its character.
+ { // Only codes 0..3 are valid ('A', 'C', 'G', 'T'); every other index holds INVALID_CHAR.
+ 'A', // [0]
+ 'C', // [1]
+ 'G', // [2]
+ 'T', // [3] (code 3 always decodes to 'T'; this table never yields 'U')
+ INVALID_CHAR, // [4]
+ INVALID_CHAR, // [5]
+ INVALID_CHAR, // [6]
+ INVALID_CHAR, // [7]
+ INVALID_CHAR, // [8]
+ INVALID_CHAR, // [9]
+ INVALID_CHAR, // [10]
+ INVALID_CHAR, // [11]
+ INVALID_CHAR, // [12]
+ INVALID_CHAR, // [13]
+ INVALID_CHAR, // [14]
+ INVALID_CHAR, // [15]
+ INVALID_CHAR, // [16]
+ INVALID_CHAR, // [17]
+ INVALID_CHAR, // [18]
+ INVALID_CHAR, // [19]
+ INVALID_CHAR, // [20]
+ INVALID_CHAR, // [21]
+ INVALID_CHAR, // [22]
+ INVALID_CHAR, // [23]
+ INVALID_CHAR, // [24]
+ INVALID_CHAR, // [25]
+ INVALID_CHAR, // [26]
+ INVALID_CHAR, // [27]
+ INVALID_CHAR, // [28]
+ INVALID_CHAR, // [29]
+ INVALID_CHAR, // [30]
+ INVALID_CHAR, // [31]
+ INVALID_CHAR, // [32]
+ INVALID_CHAR, // [33]
+ INVALID_CHAR, // [34]
+ INVALID_CHAR, // [35]
+ INVALID_CHAR, // [36]
+ INVALID_CHAR, // [37]
+ INVALID_CHAR, // [38]
+ INVALID_CHAR, // [39]
+ INVALID_CHAR, // [40]
+ INVALID_CHAR, // [41]
+ INVALID_CHAR, // [42]
+ INVALID_CHAR, // [43]
+ INVALID_CHAR, // [44]
+ INVALID_CHAR, // [45]
+ INVALID_CHAR, // [46]
+ INVALID_CHAR, // [47]
+ INVALID_CHAR, // [48]
+ INVALID_CHAR, // [49]
+ INVALID_CHAR, // [50]
+ INVALID_CHAR, // [51]
+ INVALID_CHAR, // [52]
+ INVALID_CHAR, // [53]
+ INVALID_CHAR, // [54]
+ INVALID_CHAR, // [55]
+ INVALID_CHAR, // [56]
+ INVALID_CHAR, // [57]
+ INVALID_CHAR, // [58]
+ INVALID_CHAR, // [59]
+ INVALID_CHAR, // [60]
+ INVALID_CHAR, // [61]
+ INVALID_CHAR, // [62]
+ INVALID_CHAR, // [63]
+ INVALID_CHAR, // [64]
+ INVALID_CHAR, // [65]
+ INVALID_CHAR, // [66]
+ INVALID_CHAR, // [67]
+ INVALID_CHAR, // [68]
+ INVALID_CHAR, // [69]
+ INVALID_CHAR, // [70]
+ INVALID_CHAR, // [71]
+ INVALID_CHAR, // [72]
+ INVALID_CHAR, // [73]
+ INVALID_CHAR, // [74]
+ INVALID_CHAR, // [75]
+ INVALID_CHAR, // [76]
+ INVALID_CHAR, // [77]
+ INVALID_CHAR, // [78]
+ INVALID_CHAR, // [79]
+ INVALID_CHAR, // [80]
+ INVALID_CHAR, // [81]
+ INVALID_CHAR, // [82]
+ INVALID_CHAR, // [83]
+ INVALID_CHAR, // [84]
+ INVALID_CHAR, // [85]
+ INVALID_CHAR, // [86]
+ INVALID_CHAR, // [87]
+ INVALID_CHAR, // [88]
+ INVALID_CHAR, // [89]
+ INVALID_CHAR, // [90]
+ INVALID_CHAR, // [91]
+ INVALID_CHAR, // [92]
+ INVALID_CHAR, // [93]
+ INVALID_CHAR, // [94]
+ INVALID_CHAR, // [95]
+ INVALID_CHAR, // [96]
+ INVALID_CHAR, // [97]
+ INVALID_CHAR, // [98]
+ INVALID_CHAR, // [99]
+ INVALID_CHAR, // [100]
+ INVALID_CHAR, // [101]
+ INVALID_CHAR, // [102]
+ INVALID_CHAR, // [103]
+ INVALID_CHAR, // [104]
+ INVALID_CHAR, // [105]
+ INVALID_CHAR, // [106]
+ INVALID_CHAR, // [107]
+ INVALID_CHAR, // [108]
+ INVALID_CHAR, // [109]
+ INVALID_CHAR, // [110]
+ INVALID_CHAR, // [111]
+ INVALID_CHAR, // [112]
+ INVALID_CHAR, // [113]
+ INVALID_CHAR, // [114]
+ INVALID_CHAR, // [115]
+ INVALID_CHAR, // [116]
+ INVALID_CHAR, // [117]
+ INVALID_CHAR, // [118]
+ INVALID_CHAR, // [119]
+ INVALID_CHAR, // [120]
+ INVALID_CHAR, // [121]
+ INVALID_CHAR, // [122]
+ INVALID_CHAR, // [123]
+ INVALID_CHAR, // [124]
+ INVALID_CHAR, // [125]
+ INVALID_CHAR, // [126]
+ INVALID_CHAR, // [127]
+ INVALID_CHAR, // [128]
+ INVALID_CHAR, // [129]
+ INVALID_CHAR, // [130]
+ INVALID_CHAR, // [131]
+ INVALID_CHAR, // [132]
+ INVALID_CHAR, // [133]
+ INVALID_CHAR, // [134]
+ INVALID_CHAR, // [135]
+ INVALID_CHAR, // [136]
+ INVALID_CHAR, // [137]
+ INVALID_CHAR, // [138]
+ INVALID_CHAR, // [139]
+ INVALID_CHAR, // [140]
+ INVALID_CHAR, // [141]
+ INVALID_CHAR, // [142]
+ INVALID_CHAR, // [143]
+ INVALID_CHAR, // [144]
+ INVALID_CHAR, // [145]
+ INVALID_CHAR, // [146]
+ INVALID_CHAR, // [147]
+ INVALID_CHAR, // [148]
+ INVALID_CHAR, // [149]
+ INVALID_CHAR, // [150]
+ INVALID_CHAR, // [151]
+ INVALID_CHAR, // [152]
+ INVALID_CHAR, // [153]
+ INVALID_CHAR, // [154]
+ INVALID_CHAR, // [155]
+ INVALID_CHAR, // [156]
+ INVALID_CHAR, // [157]
+ INVALID_CHAR, // [158]
+ INVALID_CHAR, // [159]
+ INVALID_CHAR, // [160]
+ INVALID_CHAR, // [161]
+ INVALID_CHAR, // [162]
+ INVALID_CHAR, // [163]
+ INVALID_CHAR, // [164]
+ INVALID_CHAR, // [165]
+ INVALID_CHAR, // [166]
+ INVALID_CHAR, // [167]
+ INVALID_CHAR, // [168]
+ INVALID_CHAR, // [169]
+ INVALID_CHAR, // [170]
+ INVALID_CHAR, // [171]
+ INVALID_CHAR, // [172]
+ INVALID_CHAR, // [173]
+ INVALID_CHAR, // [174]
+ INVALID_CHAR, // [175]
+ INVALID_CHAR, // [176]
+ INVALID_CHAR, // [177]
+ INVALID_CHAR, // [178]
+ INVALID_CHAR, // [179]
+ INVALID_CHAR, // [180]
+ INVALID_CHAR, // [181]
+ INVALID_CHAR, // [182]
+ INVALID_CHAR, // [183]
+ INVALID_CHAR, // [184]
+ INVALID_CHAR, // [185]
+ INVALID_CHAR, // [186]
+ INVALID_CHAR, // [187]
+ INVALID_CHAR, // [188]
+ INVALID_CHAR, // [189]
+ INVALID_CHAR, // [190]
+ INVALID_CHAR, // [191]
+ INVALID_CHAR, // [192]
+ INVALID_CHAR, // [193]
+ INVALID_CHAR, // [194]
+ INVALID_CHAR, // [195]
+ INVALID_CHAR, // [196]
+ INVALID_CHAR, // [197]
+ INVALID_CHAR, // [198]
+ INVALID_CHAR, // [199]
+ INVALID_CHAR, // [200]
+ INVALID_CHAR, // [201]
+ INVALID_CHAR, // [202]
+ INVALID_CHAR, // [203]
+ INVALID_CHAR, // [204]
+ INVALID_CHAR, // [205]
+ INVALID_CHAR, // [206]
+ INVALID_CHAR, // [207]
+ INVALID_CHAR, // [208]
+ INVALID_CHAR, // [209]
+ INVALID_CHAR, // [210]
+ INVALID_CHAR, // [211]
+ INVALID_CHAR, // [212]
+ INVALID_CHAR, // [213]
+ INVALID_CHAR, // [214]
+ INVALID_CHAR, // [215]
+ INVALID_CHAR, // [216]
+ INVALID_CHAR, // [217]
+ INVALID_CHAR, // [218]
+ INVALID_CHAR, // [219]
+ INVALID_CHAR, // [220]
+ INVALID_CHAR, // [221]
+ INVALID_CHAR, // [222]
+ INVALID_CHAR, // [223]
+ INVALID_CHAR, // [224]
+ INVALID_CHAR, // [225]
+ INVALID_CHAR, // [226]
+ INVALID_CHAR, // [227]
+ INVALID_CHAR, // [228]
+ INVALID_CHAR, // [229]
+ INVALID_CHAR, // [230]
+ INVALID_CHAR, // [231]
+ INVALID_CHAR, // [232]
+ INVALID_CHAR, // [233]
+ INVALID_CHAR, // [234]
+ INVALID_CHAR, // [235]
+ INVALID_CHAR, // [236]
+ INVALID_CHAR, // [237]
+ INVALID_CHAR, // [238]
+ INVALID_CHAR, // [239]
+ INVALID_CHAR, // [240]
+ INVALID_CHAR, // [241]
+ INVALID_CHAR, // [242]
+ INVALID_CHAR, // [243]
+ INVALID_CHAR, // [244]
+ INVALID_CHAR, // [245]
+ INVALID_CHAR, // [246]
+ INVALID_CHAR, // [247]
+ INVALID_CHAR, // [248]
+ INVALID_CHAR, // [249]
+ INVALID_CHAR, // [250]
+ INVALID_CHAR, // [251]
+ INVALID_CHAR, // [252]
+ INVALID_CHAR, // [253]
+ INVALID_CHAR, // [254]
+ INVALID_CHAR, // [255]
+ };
+
+unsigned g_CodonWordToAminoLetter[4*4*4] = // Codon word -> amino-acid letter code; index = 16*base1 + 4*base2 + base3 with A=0, C=1, G=2, T=3.
+ { // Values 0..19 follow the alphabetical order of the one-letter codes (A=0 ... Y=19); 20 marks a stop codon.
+ 8 , // [ 0] = AAA K (Lys)
+ 11, // [ 1] = AAC N (Asn)
+ 8 , // [ 2] = AAG K (Lys)
+ 11, // [ 3] = AAT N (Asn)
+ 16, // [ 4] = ACA T (Thr)
+ 16, // [ 5] = ACC T (Thr)
+ 16, // [ 6] = ACG T (Thr)
+ 16, // [ 7] = ACT T (Thr)
+ 14, // [ 8] = AGA R (Arg)
+ 15, // [ 9] = AGC S (Ser)
+ 14, // [10] = AGG R (Arg)
+ 15, // [11] = AGT S (Ser)
+ 7 , // [12] = ATA I (Ile)
+ 7 , // [13] = ATC I (Ile)
+ 10, // [14] = ATG M (Met)
+ 7 , // [15] = ATT I (Ile)
+ 13, // [16] = CAA Q (Gln)
+ 6 , // [17] = CAC H (His)
+ 13, // [18] = CAG Q (Gln)
+ 6 , // [19] = CAT H (His)
+ 12, // [20] = CCA P (Pro)
+ 12, // [21] = CCC P (Pro)
+ 12, // [22] = CCG P (Pro)
+ 12, // [23] = CCT P (Pro)
+ 14, // [24] = CGA R (Arg)
+ 14, // [25] = CGC R (Arg)
+ 14, // [26] = CGG R (Arg)
+ 14, // [27] = CGT R (Arg)
+ 9 , // [28] = CTA L (Leu)
+ 9 , // [29] = CTC L (Leu)
+ 9 , // [30] = CTG L (Leu)
+ 9 , // [31] = CTT L (Leu)
+ 3 , // [32] = GAA E (Glu)
+ 2 , // [33] = GAC D (Asp)
+ 3 , // [34] = GAG E (Glu)
+ 2 , // [35] = GAT D (Asp)
+ 0 , // [36] = GCA A (Ala)
+ 0 , // [37] = GCC A (Ala)
+ 0 , // [38] = GCG A (Ala)
+ 0 , // [39] = GCT A (Ala)
+ 5 , // [40] = GGA G (Gly)
+ 5 , // [41] = GGC G (Gly)
+ 5 , // [42] = GGG G (Gly)
+ 5 , // [43] = GGT G (Gly)
+ 17, // [44] = GTA V (Val)
+ 17, // [45] = GTC V (Val)
+ 17, // [46] = GTG V (Val)
+ 17, // [47] = GTT V (Val)
+ 20, // [48] = TAA * (STP)
+ 19, // [49] = TAC Y (Tyr)
+ 20, // [50] = TAG * (STP)
+ 19, // [51] = TAT Y (Tyr)
+ 15, // [52] = TCA S (Ser)
+ 15, // [53] = TCC S (Ser)
+ 15, // [54] = TCG S (Ser)
+ 15, // [55] = TCT S (Ser)
+ 20, // [56] = TGA * (STP)
+ 1 , // [57] = TGC C (Cys)
+ 18, // [58] = TGG W (Trp)
+ 1 , // [59] = TGT C (Cys)
+ 9 , // [60] = TTA L (Leu)
+ 4 , // [61] = TTC F (Phe)
+ 9 , // [62] = TTG L (Leu)
+ 4 , // [63] = TTT F (Phe)
+ };
+
+char g_CodonWordToAminoChar[4*4*4] = // Codon word -> one-letter amino-acid character; index = 16*base1 + 4*base2 + base3 with A=0, C=1, G=2, T=3.
+ { // Stop codons (TAA, TAG, TGA) translate to '*'.
+ 'K', // [ 0] = AAA (Lys)
+ 'N', // [ 1] = AAC (Asn)
+ 'K', // [ 2] = AAG (Lys)
+ 'N', // [ 3] = AAT (Asn)
+ 'T', // [ 4] = ACA (Thr)
+ 'T', // [ 5] = ACC (Thr)
+ 'T', // [ 6] = ACG (Thr)
+ 'T', // [ 7] = ACT (Thr)
+ 'R', // [ 8] = AGA (Arg)
+ 'S', // [ 9] = AGC (Ser)
+ 'R', // [10] = AGG (Arg)
+ 'S', // [11] = AGT (Ser)
+ 'I', // [12] = ATA (Ile)
+ 'I', // [13] = ATC (Ile)
+ 'M', // [14] = ATG (Met)
+ 'I', // [15] = ATT (Ile)
+ 'Q', // [16] = CAA (Gln)
+ 'H', // [17] = CAC (His)
+ 'Q', // [18] = CAG (Gln)
+ 'H', // [19] = CAT (His)
+ 'P', // [20] = CCA (Pro)
+ 'P', // [21] = CCC (Pro)
+ 'P', // [22] = CCG (Pro)
+ 'P', // [23] = CCT (Pro)
+ 'R', // [24] = CGA (Arg)
+ 'R', // [25] = CGC (Arg)
+ 'R', // [26] = CGG (Arg)
+ 'R', // [27] = CGT (Arg)
+ 'L', // [28] = CTA (Leu)
+ 'L', // [29] = CTC (Leu)
+ 'L', // [30] = CTG (Leu)
+ 'L', // [31] = CTT (Leu)
+ 'E', // [32] = GAA (Glu)
+ 'D', // [33] = GAC (Asp)
+ 'E', // [34] = GAG (Glu)
+ 'D', // [35] = GAT (Asp)
+ 'A', // [36] = GCA (Ala)
+ 'A', // [37] = GCC (Ala)
+ 'A', // [38] = GCG (Ala)
+ 'A', // [39] = GCT (Ala)
+ 'G', // [40] = GGA (Gly)
+ 'G', // [41] = GGC (Gly)
+ 'G', // [42] = GGG (Gly)
+ 'G', // [43] = GGT (Gly)
+ 'V', // [44] = GTA (Val)
+ 'V', // [45] = GTC (Val)
+ 'V', // [46] = GTG (Val)
+ 'V', // [47] = GTT (Val)
+ '*', // [48] = TAA (STP)
+ 'Y', // [49] = TAC (Tyr)
+ '*', // [50] = TAG (STP)
+ 'Y', // [51] = TAT (Tyr)
+ 'S', // [52] = TCA (Ser)
+ 'S', // [53] = TCC (Ser)
+ 'S', // [54] = TCG (Ser)
+ 'S', // [55] = TCT (Ser)
+ '*', // [56] = TGA (STP)
+ 'C', // [57] = TGC (Cys)
+ 'W', // [58] = TGG (Trp)
+ 'C', // [59] = TGT (Cys)
+ 'L', // [60] = TTA (Leu)
+ 'F', // [61] = TTC (Phe)
+ 'L', // [62] = TTG (Leu)
+ 'F', // [63] = TTT (Phe)
+ };
+
+unsigned char g_CharToCompChar[256] = // Character value (the array index) -> complementary nucleotide character.
+ { // A/C/G/T/U in either case map to an uppercase complement (T and U both give 'A'); all other indices hold INVALID_CHAR.
+ INVALID_CHAR, // [ 0]
+ INVALID_CHAR, // [ 1]
+ INVALID_CHAR, // [ 2]
+ INVALID_CHAR, // [ 3]
+ INVALID_CHAR, // [ 4]
+ INVALID_CHAR, // [ 5]
+ INVALID_CHAR, // [ 6]
+ INVALID_CHAR, // [ 7]
+ INVALID_CHAR, // [ 8]
+ INVALID_CHAR, // [ 9]
+ INVALID_CHAR, // [ 10]
+ INVALID_CHAR, // [ 11]
+ INVALID_CHAR, // [ 12]
+ INVALID_CHAR, // [ 13]
+ INVALID_CHAR, // [ 14]
+ INVALID_CHAR, // [ 15]
+ INVALID_CHAR, // [ 16]
+ INVALID_CHAR, // [ 17]
+ INVALID_CHAR, // [ 18]
+ INVALID_CHAR, // [ 19]
+ INVALID_CHAR, // [ 20]
+ INVALID_CHAR, // [ 21]
+ INVALID_CHAR, // [ 22]
+ INVALID_CHAR, // [ 23]
+ INVALID_CHAR, // [ 24]
+ INVALID_CHAR, // [ 25]
+ INVALID_CHAR, // [ 26]
+ INVALID_CHAR, // [ 27]
+ INVALID_CHAR, // [ 28]
+ INVALID_CHAR, // [ 29]
+ INVALID_CHAR, // [ 30]
+ INVALID_CHAR, // [ 31]
+ INVALID_CHAR, // [ 32]
+ INVALID_CHAR, // [ 33]
+ INVALID_CHAR, // [ 34]
+ INVALID_CHAR, // [ 35]
+ INVALID_CHAR, // [ 36]
+ INVALID_CHAR, // [ 37]
+ INVALID_CHAR, // [ 38]
+ INVALID_CHAR, // [ 39]
+ INVALID_CHAR, // [ 40]
+ INVALID_CHAR, // [ 41]
+ INVALID_CHAR, // [ 42]
+ INVALID_CHAR, // [ 43]
+ INVALID_CHAR, // [ 44]
+ INVALID_CHAR, // [ 45]
+ INVALID_CHAR, // [ 46]
+ INVALID_CHAR, // [ 47]
+ INVALID_CHAR, // [ 48]
+ INVALID_CHAR, // [ 49]
+ INVALID_CHAR, // [ 50]
+ INVALID_CHAR, // [ 51]
+ INVALID_CHAR, // [ 52]
+ INVALID_CHAR, // [ 53]
+ INVALID_CHAR, // [ 54]
+ INVALID_CHAR, // [ 55]
+ INVALID_CHAR, // [ 56]
+ INVALID_CHAR, // [ 57]
+ INVALID_CHAR, // [ 58]
+ INVALID_CHAR, // [ 59]
+ INVALID_CHAR, // [ 60]
+ INVALID_CHAR, // [ 61]
+ INVALID_CHAR, // [ 62]
+ INVALID_CHAR, // [ 63]
+ INVALID_CHAR, // [ 64]
+ 'T', // [ 65] A -> T
+ INVALID_CHAR, // [ 66]
+ 'G', // [ 67] C -> G
+ INVALID_CHAR, // [ 68]
+ INVALID_CHAR, // [ 69]
+ INVALID_CHAR, // [ 70]
+ 'C', // [ 71] G -> C
+ INVALID_CHAR, // [ 72]
+ INVALID_CHAR, // [ 73]
+ INVALID_CHAR, // [ 74]
+ INVALID_CHAR, // [ 75]
+ INVALID_CHAR, // [ 76]
+ INVALID_CHAR, // [ 77]
+ INVALID_CHAR, // [ 78]
+ INVALID_CHAR, // [ 79]
+ INVALID_CHAR, // [ 80]
+ INVALID_CHAR, // [ 81]
+ INVALID_CHAR, // [ 82]
+ INVALID_CHAR, // [ 83]
+ 'A', // [ 84] T -> A
+ 'A', // [ 85] U -> A
+ INVALID_CHAR, // [ 86]
+ INVALID_CHAR, // [ 87]
+ INVALID_CHAR, // [ 88]
+ INVALID_CHAR, // [ 89]
+ INVALID_CHAR, // [ 90]
+ INVALID_CHAR, // [ 91]
+ INVALID_CHAR, // [ 92]
+ INVALID_CHAR, // [ 93]
+ INVALID_CHAR, // [ 94]
+ INVALID_CHAR, // [ 95]
+ INVALID_CHAR, // [ 96]
+ 'T', // [ 97] a -> T
+ INVALID_CHAR, // [ 98]
+ 'G', // [ 99] c -> G
+ INVALID_CHAR, // [100]
+ INVALID_CHAR, // [101]
+ INVALID_CHAR, // [102]
+ 'C', // [103] g -> C
+ INVALID_CHAR, // [104]
+ INVALID_CHAR, // [105]
+ INVALID_CHAR, // [106]
+ INVALID_CHAR, // [107]
+ INVALID_CHAR, // [108]
+ INVALID_CHAR, // [109]
+ INVALID_CHAR, // [110]
+ INVALID_CHAR, // [111]
+ INVALID_CHAR, // [112]
+ INVALID_CHAR, // [113]
+ INVALID_CHAR, // [114]
+ INVALID_CHAR, // [115]
+ 'A', // [116] t -> A
+ 'A', // [117] u -> A
+ INVALID_CHAR, // [118]
+ INVALID_CHAR, // [119]
+ INVALID_CHAR, // [120]
+ INVALID_CHAR, // [121]
+ INVALID_CHAR, // [122]
+ INVALID_CHAR, // [123]
+ INVALID_CHAR, // [124]
+ INVALID_CHAR, // [125]
+ INVALID_CHAR, // [126]
+ INVALID_CHAR, // [127]
+ INVALID_CHAR, // [128]
+ INVALID_CHAR, // [129]
+ INVALID_CHAR, // [130]
+ INVALID_CHAR, // [131]
+ INVALID_CHAR, // [132]
+ INVALID_CHAR, // [133]
+ INVALID_CHAR, // [134]
+ INVALID_CHAR, // [135]
+ INVALID_CHAR, // [136]
+ INVALID_CHAR, // [137]
+ INVALID_CHAR, // [138]
+ INVALID_CHAR, // [139]
+ INVALID_CHAR, // [140]
+ INVALID_CHAR, // [141]
+ INVALID_CHAR, // [142]
+ INVALID_CHAR, // [143]
+ INVALID_CHAR, // [144]
+ INVALID_CHAR, // [145]
+ INVALID_CHAR, // [146]
+ INVALID_CHAR, // [147]
+ INVALID_CHAR, // [148]
+ INVALID_CHAR, // [149]
+ INVALID_CHAR, // [150]
+ INVALID_CHAR, // [151]
+ INVALID_CHAR, // [152]
+ INVALID_CHAR, // [153]
+ INVALID_CHAR, // [154]
+ INVALID_CHAR, // [155]
+ INVALID_CHAR, // [156]
+ INVALID_CHAR, // [157]
+ INVALID_CHAR, // [158]
+ INVALID_CHAR, // [159]
+ INVALID_CHAR, // [160]
+ INVALID_CHAR, // [161]
+ INVALID_CHAR, // [162]
+ INVALID_CHAR, // [163]
+ INVALID_CHAR, // [164]
+ INVALID_CHAR, // [165]
+ INVALID_CHAR, // [166]
+ INVALID_CHAR, // [167]
+ INVALID_CHAR, // [168]
+ INVALID_CHAR, // [169]
+ INVALID_CHAR, // [170]
+ INVALID_CHAR, // [171]
+ INVALID_CHAR, // [172]
+ INVALID_CHAR, // [173]
+ INVALID_CHAR, // [174]
+ INVALID_CHAR, // [175]
+ INVALID_CHAR, // [176]
+ INVALID_CHAR, // [177]
+ INVALID_CHAR, // [178]
+ INVALID_CHAR, // [179]
+ INVALID_CHAR, // [180]
+ INVALID_CHAR, // [181]
+ INVALID_CHAR, // [182]
+ INVALID_CHAR, // [183]
+ INVALID_CHAR, // [184]
+ INVALID_CHAR, // [185]
+ INVALID_CHAR, // [186]
+ INVALID_CHAR, // [187]
+ INVALID_CHAR, // [188]
+ INVALID_CHAR, // [189]
+ INVALID_CHAR, // [190]
+ INVALID_CHAR, // [191]
+ INVALID_CHAR, // [192]
+ INVALID_CHAR, // [193]
+ INVALID_CHAR, // [194]
+ INVALID_CHAR, // [195]
+ INVALID_CHAR, // [196]
+ INVALID_CHAR, // [197]
+ INVALID_CHAR, // [198]
+ INVALID_CHAR, // [199]
+ INVALID_CHAR, // [200]
+ INVALID_CHAR, // [201]
+ INVALID_CHAR, // [202]
+ INVALID_CHAR, // [203]
+ INVALID_CHAR, // [204]
+ INVALID_CHAR, // [205]
+ INVALID_CHAR, // [206]
+ INVALID_CHAR, // [207]
+ INVALID_CHAR, // [208]
+ INVALID_CHAR, // [209]
+ INVALID_CHAR, // [210]
+ INVALID_CHAR, // [211]
+ INVALID_CHAR, // [212]
+ INVALID_CHAR, // [213]
+ INVALID_CHAR, // [214]
+ INVALID_CHAR, // [215]
+ INVALID_CHAR, // [216]
+ INVALID_CHAR, // [217]
+ INVALID_CHAR, // [218]
+ INVALID_CHAR, // [219]
+ INVALID_CHAR, // [220]
+ INVALID_CHAR, // [221]
+ INVALID_CHAR, // [222]
+ INVALID_CHAR, // [223]
+ INVALID_CHAR, // [224]
+ INVALID_CHAR, // [225]
+ INVALID_CHAR, // [226]
+ INVALID_CHAR, // [227]
+ INVALID_CHAR, // [228]
+ INVALID_CHAR, // [229]
+ INVALID_CHAR, // [230]
+ INVALID_CHAR, // [231]
+ INVALID_CHAR, // [232]
+ INVALID_CHAR, // [233]
+ INVALID_CHAR, // [234]
+ INVALID_CHAR, // [235]
+ INVALID_CHAR, // [236]
+ INVALID_CHAR, // [237]
+ INVALID_CHAR, // [238]
+ INVALID_CHAR, // [239]
+ INVALID_CHAR, // [240]
+ INVALID_CHAR, // [241]
+ INVALID_CHAR, // [242]
+ INVALID_CHAR, // [243]
+ INVALID_CHAR, // [244]
+ INVALID_CHAR, // [245]
+ INVALID_CHAR, // [246]
+ INVALID_CHAR, // [247]
+ INVALID_CHAR, // [248]
+ INVALID_CHAR, // [249]
+ INVALID_CHAR, // [250]
+ INVALID_CHAR, // [251]
+ INVALID_CHAR, // [252]
+ INVALID_CHAR, // [253]
+ INVALID_CHAR, // [254]
+ INVALID_CHAR, // [255]
+};
+
+unsigned g_CharToCompLetter[256] = // Character value (the array index) -> letter code of the complementary nucleotide.
+ { // Codes: A=0, C=1, G=2, T=3, so A->3, C->2, G->1, T/U->0 in either case; all other indices hold INVALID_LETTER.
+ INVALID_LETTER, // [ 0]
+ INVALID_LETTER, // [ 1]
+ INVALID_LETTER, // [ 2]
+ INVALID_LETTER, // [ 3]
+ INVALID_LETTER, // [ 4]
+ INVALID_LETTER, // [ 5]
+ INVALID_LETTER, // [ 6]
+ INVALID_LETTER, // [ 7]
+ INVALID_LETTER, // [ 8]
+ INVALID_LETTER, // [ 9]
+ INVALID_LETTER, // [ 10]
+ INVALID_LETTER, // [ 11]
+ INVALID_LETTER, // [ 12]
+ INVALID_LETTER, // [ 13]
+ INVALID_LETTER, // [ 14]
+ INVALID_LETTER, // [ 15]
+ INVALID_LETTER, // [ 16]
+ INVALID_LETTER, // [ 17]
+ INVALID_LETTER, // [ 18]
+ INVALID_LETTER, // [ 19]
+ INVALID_LETTER, // [ 20]
+ INVALID_LETTER, // [ 21]
+ INVALID_LETTER, // [ 22]
+ INVALID_LETTER, // [ 23]
+ INVALID_LETTER, // [ 24]
+ INVALID_LETTER, // [ 25]
+ INVALID_LETTER, // [ 26]
+ INVALID_LETTER, // [ 27]
+ INVALID_LETTER, // [ 28]
+ INVALID_LETTER, // [ 29]
+ INVALID_LETTER, // [ 30]
+ INVALID_LETTER, // [ 31]
+ INVALID_LETTER, // [ 32]
+ INVALID_LETTER, // [ 33]
+ INVALID_LETTER, // [ 34]
+ INVALID_LETTER, // [ 35]
+ INVALID_LETTER, // [ 36]
+ INVALID_LETTER, // [ 37]
+ INVALID_LETTER, // [ 38]
+ INVALID_LETTER, // [ 39]
+ INVALID_LETTER, // [ 40]
+ INVALID_LETTER, // [ 41]
+ INVALID_LETTER, // [ 42]
+ INVALID_LETTER, // [ 43]
+ INVALID_LETTER, // [ 44]
+ INVALID_LETTER, // [ 45]
+ INVALID_LETTER, // [ 46]
+ INVALID_LETTER, // [ 47]
+ INVALID_LETTER, // [ 48]
+ INVALID_LETTER, // [ 49]
+ INVALID_LETTER, // [ 50]
+ INVALID_LETTER, // [ 51]
+ INVALID_LETTER, // [ 52]
+ INVALID_LETTER, // [ 53]
+ INVALID_LETTER, // [ 54]
+ INVALID_LETTER, // [ 55]
+ INVALID_LETTER, // [ 56]
+ INVALID_LETTER, // [ 57]
+ INVALID_LETTER, // [ 58]
+ INVALID_LETTER, // [ 59]
+ INVALID_LETTER, // [ 60]
+ INVALID_LETTER, // [ 61]
+ INVALID_LETTER, // [ 62]
+ INVALID_LETTER, // [ 63]
+ INVALID_LETTER, // [ 64]
+ 3, // [ 65] A -> T
+ INVALID_LETTER, // [ 66]
+ 2, // [ 67] C -> G
+ INVALID_LETTER, // [ 68]
+ INVALID_LETTER, // [ 69]
+ INVALID_LETTER, // [ 70]
+ 1, // [ 71] G -> C
+ INVALID_LETTER, // [ 72]
+ INVALID_LETTER, // [ 73]
+ INVALID_LETTER, // [ 74]
+ INVALID_LETTER, // [ 75]
+ INVALID_LETTER, // [ 76]
+ INVALID_LETTER, // [ 77]
+ INVALID_LETTER, // [ 78]
+ INVALID_LETTER, // [ 79]
+ INVALID_LETTER, // [ 80]
+ INVALID_LETTER, // [ 81]
+ INVALID_LETTER, // [ 82]
+ INVALID_LETTER, // [ 83]
+ 0, // [ 84] T -> A
+ 0, // [ 85] U -> A
+ INVALID_LETTER, // [ 86]
+ INVALID_LETTER, // [ 87]
+ INVALID_LETTER, // [ 88]
+ INVALID_LETTER, // [ 89]
+ INVALID_LETTER, // [ 90]
+ INVALID_LETTER, // [ 91]
+ INVALID_LETTER, // [ 92]
+ INVALID_LETTER, // [ 93]
+ INVALID_LETTER, // [ 94]
+ INVALID_LETTER, // [ 95]
+ INVALID_LETTER, // [ 96]
+ 3, // [ 97] a -> T
+ INVALID_LETTER, // [ 98]
+ 2, // [ 99] c -> G
+ INVALID_LETTER, // [100]
+ INVALID_LETTER, // [101]
+ INVALID_LETTER, // [102]
+ 1, // [103] g -> C
+ INVALID_LETTER, // [104]
+ INVALID_LETTER, // [105]
+ INVALID_LETTER, // [106]
+ INVALID_LETTER, // [107]
+ INVALID_LETTER, // [108]
+ INVALID_LETTER, // [109]
+ INVALID_LETTER, // [110]
+ INVALID_LETTER, // [111]
+ INVALID_LETTER, // [112]
+ INVALID_LETTER, // [113]
+ INVALID_LETTER, // [114]
+ INVALID_LETTER, // [115]
+ 0, // [116] t -> A
+ 0, // [117] u -> A
+ INVALID_LETTER, // [118]
+ INVALID_LETTER, // [119]
+ INVALID_LETTER, // [120]
+ INVALID_LETTER, // [121]
+ INVALID_LETTER, // [122]
+ INVALID_LETTER, // [123]
+ INVALID_LETTER, // [124]
+ INVALID_LETTER, // [125]
+ INVALID_LETTER, // [126]
+ INVALID_LETTER, // [127]
+ INVALID_LETTER, // [128]
+ INVALID_LETTER, // [129]
+ INVALID_LETTER, // [130]
+ INVALID_LETTER, // [131]
+ INVALID_LETTER, // [132]
+ INVALID_LETTER, // [133]
+ INVALID_LETTER, // [134]
+ INVALID_LETTER, // [135]
+ INVALID_LETTER, // [136]
+ INVALID_LETTER, // [137]
+ INVALID_LETTER, // [138]
+ INVALID_LETTER, // [139]
+ INVALID_LETTER, // [140]
+ INVALID_LETTER, // [141]
+ INVALID_LETTER, // [142]
+ INVALID_LETTER, // [143]
+ INVALID_LETTER, // [144]
+ INVALID_LETTER, // [145]
+ INVALID_LETTER, // [146]
+ INVALID_LETTER, // [147]
+ INVALID_LETTER, // [148]
+ INVALID_LETTER, // [149]
+ INVALID_LETTER, // [150]
+ INVALID_LETTER, // [151]
+ INVALID_LETTER, // [152]
+ INVALID_LETTER, // [153]
+ INVALID_LETTER, // [154]
+ INVALID_LETTER, // [155]
+ INVALID_LETTER, // [156]
+ INVALID_LETTER, // [157]
+ INVALID_LETTER, // [158]
+ INVALID_LETTER, // [159]
+ INVALID_LETTER, // [160]
+ INVALID_LETTER, // [161]
+ INVALID_LETTER, // [162]
+ INVALID_LETTER, // [163]
+ INVALID_LETTER, // [164]
+ INVALID_LETTER, // [165]
+ INVALID_LETTER, // [166]
+ INVALID_LETTER, // [167]
+ INVALID_LETTER, // [168]
+ INVALID_LETTER, // [169]
+ INVALID_LETTER, // [170]
+ INVALID_LETTER, // [171]
+ INVALID_LETTER, // [172]
+ INVALID_LETTER, // [173]
+ INVALID_LETTER, // [174]
+ INVALID_LETTER, // [175]
+ INVALID_LETTER, // [176]
+ INVALID_LETTER, // [177]
+ INVALID_LETTER, // [178]
+ INVALID_LETTER, // [179]
+ INVALID_LETTER, // [180]
+ INVALID_LETTER, // [181]
+ INVALID_LETTER, // [182]
+ INVALID_LETTER, // [183]
+ INVALID_LETTER, // [184]
+ INVALID_LETTER, // [185]
+ INVALID_LETTER, // [186]
+ INVALID_LETTER, // [187]
+ INVALID_LETTER, // [188]
+ INVALID_LETTER, // [189]
+ INVALID_LETTER, // [190]
+ INVALID_LETTER, // [191]
+ INVALID_LETTER, // [192]
+ INVALID_LETTER, // [193]
+ INVALID_LETTER, // [194]
+ INVALID_LETTER, // [195]
+ INVALID_LETTER, // [196]
+ INVALID_LETTER, // [197]
+ INVALID_LETTER, // [198]
+ INVALID_LETTER, // [199]
+ INVALID_LETTER, // [200]
+ INVALID_LETTER, // [201]
+ INVALID_LETTER, // [202]
+ INVALID_LETTER, // [203]
+ INVALID_LETTER, // [204]
+ INVALID_LETTER, // [205]
+ INVALID_LETTER, // [206]
+ INVALID_LETTER, // [207]
+ INVALID_LETTER, // [208]
+ INVALID_LETTER, // [209]
+ INVALID_LETTER, // [210]
+ INVALID_LETTER, // [211]
+ INVALID_LETTER, // [212]
+ INVALID_LETTER, // [213]
+ INVALID_LETTER, // [214]
+ INVALID_LETTER, // [215]
+ INVALID_LETTER, // [216]
+ INVALID_LETTER, // [217]
+ INVALID_LETTER, // [218]
+ INVALID_LETTER, // [219]
+ INVALID_LETTER, // [220]
+ INVALID_LETTER, // [221]
+ INVALID_LETTER, // [222]
+ INVALID_LETTER, // [223]
+ INVALID_LETTER, // [224]
+ INVALID_LETTER, // [225]
+ INVALID_LETTER, // [226]
+ INVALID_LETTER, // [227]
+ INVALID_LETTER, // [228]
+ INVALID_LETTER, // [229]
+ INVALID_LETTER, // [230]
+ INVALID_LETTER, // [231]
+ INVALID_LETTER, // [232]
+ INVALID_LETTER, // [233]
+ INVALID_LETTER, // [234]
+ INVALID_LETTER, // [235]
+ INVALID_LETTER, // [236]
+ INVALID_LETTER, // [237]
+ INVALID_LETTER, // [238]
+ INVALID_LETTER, // [239]
+ INVALID_LETTER, // [240]
+ INVALID_LETTER, // [241]
+ INVALID_LETTER, // [242]
+ INVALID_LETTER, // [243]
+ INVALID_LETTER, // [244]
+ INVALID_LETTER, // [245]
+ INVALID_LETTER, // [246]
+ INVALID_LETTER, // [247]
+ INVALID_LETTER, // [248]
+ INVALID_LETTER, // [249]
+ INVALID_LETTER, // [250]
+ INVALID_LETTER, // [251]
+ INVALID_LETTER, // [252]
+ INVALID_LETTER, // [253]
+ INVALID_LETTER, // [254]
+ INVALID_LETTER, // [255]
+};
+
+bool g_IsAminoChar[256] = // Membership table indexed by character value: true when the character is an amino-acid symbol.
+ { // True for the 20 one-letter codes in upper or lower case and for '*' (stop); B, J, O, U, X, Z and everything else are false.
+ false, // [ 0] 0x00
+ false, // [ 1] 0x01
+ false, // [ 2] 0x02
+ false, // [ 3] 0x03
+ false, // [ 4] 0x04
+ false, // [ 5] 0x05
+ false, // [ 6] 0x06
+ false, // [ 7] 0x07
+ false, // [ 8] 0x08
+ false, // [ 9] 0x09
+ false, // [ 10] 0x0a
+ false, // [ 11] 0x0b
+ false, // [ 12] 0x0c
+ false, // [ 13] 0x0d
+ false, // [ 14] 0x0e
+ false, // [ 15] 0x0f
+ false, // [ 16] 0x10
+ false, // [ 17] 0x11
+ false, // [ 18] 0x12
+ false, // [ 19] 0x13
+ false, // [ 20] 0x14
+ false, // [ 21] 0x15
+ false, // [ 22] 0x16
+ false, // [ 23] 0x17
+ false, // [ 24] 0x18
+ false, // [ 25] 0x19
+ false, // [ 26] 0x1a
+ false, // [ 27] 0x1b
+ false, // [ 28] 0x1c
+ false, // [ 29] 0x1d
+ false, // [ 30] 0x1e
+ false, // [ 31] 0x1f
+ false, // [ 32] ' '
+ false, // [ 33] '!'
+ false, // [ 34] '"'
+ false, // [ 35] '#'
+ false, // [ 36] '$'
+ false, // [ 37] '%'
+ false, // [ 38] '&'
+ false, // [ 39] '''
+ false, // [ 40] '('
+ false, // [ 41] ')'
+ true, // [ 42] '*' = STP
+ false, // [ 43] '+'
+ false, // [ 44] ','
+ false, // [ 45] '-'
+ false, // [ 46] '.'
+ false, // [ 47] '/'
+ false, // [ 48] '0'
+ false, // [ 49] '1'
+ false, // [ 50] '2'
+ false, // [ 51] '3'
+ false, // [ 52] '4'
+ false, // [ 53] '5'
+ false, // [ 54] '6'
+ false, // [ 55] '7'
+ false, // [ 56] '8'
+ false, // [ 57] '9'
+ false, // [ 58] ':'
+ false, // [ 59] ';'
+ false, // [ 60] '<'
+ false, // [ 61] '='
+ false, // [ 62] '>'
+ false, // [ 63] '?'
+ false, // [ 64] '@'
+ true, // [ 65] 'A' = Ala
+ false, // [ 66] 'B'
+ true, // [ 67] 'C' = Cys
+ true, // [ 68] 'D' = Asp
+ true, // [ 69] 'E' = Glu
+ true, // [ 70] 'F' = Phe
+ true, // [ 71] 'G' = Gly
+ true, // [ 72] 'H' = His
+ true, // [ 73] 'I' = Ile
+ false, // [ 74] 'J'
+ true, // [ 75] 'K' = Lys
+ true, // [ 76] 'L' = Leu
+ true, // [ 77] 'M' = Met
+ true, // [ 78] 'N' = Asn
+ false, // [ 79] 'O'
+ true, // [ 80] 'P' = Pro
+ true, // [ 81] 'Q' = Gln
+ true, // [ 82] 'R' = Arg
+ true, // [ 83] 'S' = Ser
+ true, // [ 84] 'T' = Thr
+ false, // [ 85] 'U'
+ true, // [ 86] 'V' = Val
+ true, // [ 87] 'W' = Trp
+ false, // [ 88] 'X'
+ true, // [ 89] 'Y' = Tyr
+ false, // [ 90] 'Z'
+ false, // [ 91] '['
+ false, // [ 92] '\'
+ false, // [ 93] ']'
+ false, // [ 94] '^'
+ false, // [ 95] '_'
+ false, // [ 96] '`'
+ true, // [ 97] 'a' = Ala
+ false, // [ 98] 'b'
+ true, // [ 99] 'c' = Cys
+ true, // [100] 'd' = Asp
+ true, // [101] 'e' = Glu
+ true, // [102] 'f' = Phe
+ true, // [103] 'g' = Gly
+ true, // [104] 'h' = His
+ true, // [105] 'i' = Ile
+ false, // [106] 'j'
+ true, // [107] 'k' = Lys
+ true, // [108] 'l' = Leu
+ true, // [109] 'm' = Met
+ true, // [110] 'n' = Asn
+ false, // [111] 'o'
+ true, // [112] 'p' = Pro
+ true, // [113] 'q' = Gln
+ true, // [114] 'r' = Arg
+ true, // [115] 's' = Ser
+ true, // [116] 't' = Thr
+ false, // [117] 'u'
+ true, // [118] 'v' = Val
+ true, // [119] 'w' = Trp
+ false, // [120] 'x'
+ true, // [121] 'y' = Tyr
+ false, // [122] 'z'
+ false, // [123] '{'
+ false, // [124] '|'
+ false, // [125] '}'
+ false, // [126] '~'
+ false, // [127] 0x7f
+ false, // [128] 0x80
+ false, // [129] 0x81
+ false, // [130] 0x82
+ false, // [131] 0x83
+ false, // [132] 0x84
+ false, // [133] 0x85
+ false, // [134] 0x86
+ false, // [135] 0x87
+ false, // [136] 0x88
+ false, // [137] 0x89
+ false, // [138] 0x8a
+ false, // [139] 0x8b
+ false, // [140] 0x8c
+ false, // [141] 0x8d
+ false, // [142] 0x8e
+ false, // [143] 0x8f
+ false, // [144] 0x90
+ false, // [145] 0x91
+ false, // [146] 0x92
+ false, // [147] 0x93
+ false, // [148] 0x94
+ false, // [149] 0x95
+ false, // [150] 0x96
+ false, // [151] 0x97
+ false, // [152] 0x98
+ false, // [153] 0x99
+ false, // [154] 0x9a
+ false, // [155] 0x9b
+ false, // [156] 0x9c
+ false, // [157] 0x9d
+ false, // [158] 0x9e
+ false, // [159] 0x9f
+ false, // [160] 0xa0
+ false, // [161] 0xa1
+ false, // [162] 0xa2
+ false, // [163] 0xa3
+ false, // [164] 0xa4
+ false, // [165] 0xa5
+ false, // [166] 0xa6
+ false, // [167] 0xa7
+ false, // [168] 0xa8
+ false, // [169] 0xa9
+ false, // [170] 0xaa
+ false, // [171] 0xab
+ false, // [172] 0xac
+ false, // [173] 0xad
+ false, // [174] 0xae
+ false, // [175] 0xaf
+ false, // [176] 0xb0
+ false, // [177] 0xb1
+ false, // [178] 0xb2
+ false, // [179] 0xb3
+ false, // [180] 0xb4
+ false, // [181] 0xb5
+ false, // [182] 0xb6
+ false, // [183] 0xb7
+ false, // [184] 0xb8
+ false, // [185] 0xb9
+ false, // [186] 0xba
+ false, // [187] 0xbb
+ false, // [188] 0xbc
+ false, // [189] 0xbd
+ false, // [190] 0xbe
+ false, // [191] 0xbf
+ false, // [192] 0xc0
+ false, // [193] 0xc1
+ false, // [194] 0xc2
+ false, // [195] 0xc3
+ false, // [196] 0xc4
+ false, // [197] 0xc5
+ false, // [198] 0xc6
+ false, // [199] 0xc7
+ false, // [200] 0xc8
+ false, // [201] 0xc9
+ false, // [202] 0xca
+ false, // [203] 0xcb
+ false, // [204] 0xcc
+ false, // [205] 0xcd
+ false, // [206] 0xce
+ false, // [207] 0xcf
+ false, // [208] 0xd0
+ false, // [209] 0xd1
+ false, // [210] 0xd2
+ false, // [211] 0xd3
+ false, // [212] 0xd4
+ false, // [213] 0xd5
+ false, // [214] 0xd6
+ false, // [215] 0xd7
+ false, // [216] 0xd8
+ false, // [217] 0xd9
+ false, // [218] 0xda
+ false, // [219] 0xdb
+ false, // [220] 0xdc
+ false, // [221] 0xdd
+ false, // [222] 0xde
+ false, // [223] 0xdf
+ false, // [224] 0xe0
+ false, // [225] 0xe1
+ false, // [226] 0xe2
+ false, // [227] 0xe3
+ false, // [228] 0xe4
+ false, // [229] 0xe5
+ false, // [230] 0xe6
+ false, // [231] 0xe7
+ false, // [232] 0xe8
+ false, // [233] 0xe9
+ false, // [234] 0xea
+ false, // [235] 0xeb
+ false, // [236] 0xec
+ false, // [237] 0xed
+ false, // [238] 0xee
+ false, // [239] 0xef
+ false, // [240] 0xf0
+ false, // [241] 0xf1
+ false, // [242] 0xf2
+ false, // [243] 0xf3
+ false, // [244] 0xf4
+ false, // [245] 0xf5
+ false, // [246] 0xf6
+ false, // [247] 0xf7
+ false, // [248] 0xf8
+ false, // [249] 0xf9
+ false, // [250] 0xfa
+ false, // [251] 0xfb
+ false, // [252] 0xfc
+ false, // [253] 0xfd
+ false, // [254] 0xfe
+ false, // [255] 0xff
+ };
+
+bool g_IsNucleoChar[256] = // Indexed by byte value: true for A, C, G, N, T, U in upper or lower case, false for every other byte.
+ {
+ false, // [ 0] 0x00
+ false, // [ 1] 0x01
+ false, // [ 2] 0x02
+ false, // [ 3] 0x03
+ false, // [ 4] 0x04
+ false, // [ 5] 0x05
+ false, // [ 6] 0x06
+ false, // [ 7] 0x07
+ false, // [ 8] 0x08
+ false, // [ 9] 0x09
+ false, // [ 10] 0x0a
+ false, // [ 11] 0x0b
+ false, // [ 12] 0x0c
+ false, // [ 13] 0x0d
+ false, // [ 14] 0x0e
+ false, // [ 15] 0x0f
+ false, // [ 16] 0x10
+ false, // [ 17] 0x11
+ false, // [ 18] 0x12
+ false, // [ 19] 0x13
+ false, // [ 20] 0x14
+ false, // [ 21] 0x15
+ false, // [ 22] 0x16
+ false, // [ 23] 0x17
+ false, // [ 24] 0x18
+ false, // [ 25] 0x19
+ false, // [ 26] 0x1a
+ false, // [ 27] 0x1b
+ false, // [ 28] 0x1c
+ false, // [ 29] 0x1d
+ false, // [ 30] 0x1e
+ false, // [ 31] 0x1f
+ false, // [ 32] ' '
+ false, // [ 33] '!'
+ false, // [ 34] '"'
+ false, // [ 35] '#'
+ false, // [ 36] '$'
+ false, // [ 37] '%'
+ false, // [ 38] '&'
+ false, // [ 39] '''
+ false, // [ 40] '('
+ false, // [ 41] ')'
+ false, // [ 42] '*'
+ false, // [ 43] '+'
+ false, // [ 44] ','
+ false, // [ 45] '-'
+ false, // [ 46] '.'
+ false, // [ 47] '/'
+ false, // [ 48] '0'
+ false, // [ 49] '1'
+ false, // [ 50] '2'
+ false, // [ 51] '3'
+ false, // [ 52] '4'
+ false, // [ 53] '5'
+ false, // [ 54] '6'
+ false, // [ 55] '7'
+ false, // [ 56] '8'
+ false, // [ 57] '9'
+ false, // [ 58] ':'
+ false, // [ 59] ';'
+ false, // [ 60] '<'
+ false, // [ 61] '='
+ false, // [ 62] '>'
+ false, // [ 63] '?'
+ false, // [ 64] '@'
+ true, // [ 65] 'A' (Nucleotide)
+ false, // [ 66] 'B'
+ true, // [ 67] 'C' (Nucleotide)
+ false, // [ 68] 'D'
+ false, // [ 69] 'E'
+ false, // [ 70] 'F'
+ true, // [ 71] 'G' (Nucleotide)
+ false, // [ 72] 'H'
+ false, // [ 73] 'I'
+ false, // [ 74] 'J'
+ false, // [ 75] 'K'
+ false, // [ 76] 'L'
+ false, // [ 77] 'M'
+ true, // [ 78] 'N' (Nucleotide)
+ false, // [ 79] 'O'
+ false, // [ 80] 'P'
+ false, // [ 81] 'Q'
+ false, // [ 82] 'R'
+ false, // [ 83] 'S'
+ true, // [ 84] 'T' (Nucleotide)
+ true, // [ 85] 'U' (Nucleotide)
+ false, // [ 86] 'V'
+ false, // [ 87] 'W'
+ false, // [ 88] 'X'
+ false, // [ 89] 'Y'
+ false, // [ 90] 'Z'
+ false, // [ 91] '['
+ false, // [ 92] '\'
+ false, // [ 93] ']'
+ false, // [ 94] '^'
+ false, // [ 95] '_'
+ false, // [ 96] '`'
+ true, // [ 97] 'a' (Nucleotide)
+ false, // [ 98] 'b'
+ true, // [ 99] 'c' (Nucleotide)
+ false, // [100] 'd'
+ false, // [101] 'e'
+ false, // [102] 'f'
+ true, // [103] 'g' (Nucleotide)
+ false, // [104] 'h'
+ false, // [105] 'i'
+ false, // [106] 'j'
+ false, // [107] 'k'
+ false, // [108] 'l'
+ false, // [109] 'm'
+ true, // [110] 'n' (Nucleotide)
+ false, // [111] 'o'
+ false, // [112] 'p'
+ false, // [113] 'q'
+ false, // [114] 'r'
+ false, // [115] 's'
+ true, // [116] 't' (Nucleotide)
+ true, // [117] 'u' (Nucleotide)
+ false, // [118] 'v'
+ false, // [119] 'w'
+ false, // [120] 'x'
+ false, // [121] 'y'
+ false, // [122] 'z'
+ false, // [123] '{'
+ false, // [124] '|'
+ false, // [125] '}'
+ false, // [126] '~'
+ false, // [127] 0x7f
+ false, // [128] 0x80
+ false, // [129] 0x81
+ false, // [130] 0x82
+ false, // [131] 0x83
+ false, // [132] 0x84
+ false, // [133] 0x85
+ false, // [134] 0x86
+ false, // [135] 0x87
+ false, // [136] 0x88
+ false, // [137] 0x89
+ false, // [138] 0x8a
+ false, // [139] 0x8b
+ false, // [140] 0x8c
+ false, // [141] 0x8d
+ false, // [142] 0x8e
+ false, // [143] 0x8f
+ false, // [144] 0x90
+ false, // [145] 0x91
+ false, // [146] 0x92
+ false, // [147] 0x93
+ false, // [148] 0x94
+ false, // [149] 0x95
+ false, // [150] 0x96
+ false, // [151] 0x97
+ false, // [152] 0x98
+ false, // [153] 0x99
+ false, // [154] 0x9a
+ false, // [155] 0x9b
+ false, // [156] 0x9c
+ false, // [157] 0x9d
+ false, // [158] 0x9e
+ false, // [159] 0x9f
+ false, // [160] 0xa0
+ false, // [161] 0xa1
+ false, // [162] 0xa2
+ false, // [163] 0xa3
+ false, // [164] 0xa4
+ false, // [165] 0xa5
+ false, // [166] 0xa6
+ false, // [167] 0xa7
+ false, // [168] 0xa8
+ false, // [169] 0xa9
+ false, // [170] 0xaa
+ false, // [171] 0xab
+ false, // [172] 0xac
+ false, // [173] 0xad
+ false, // [174] 0xae
+ false, // [175] 0xaf
+ false, // [176] 0xb0
+ false, // [177] 0xb1
+ false, // [178] 0xb2
+ false, // [179] 0xb3
+ false, // [180] 0xb4
+ false, // [181] 0xb5
+ false, // [182] 0xb6
+ false, // [183] 0xb7
+ false, // [184] 0xb8
+ false, // [185] 0xb9
+ false, // [186] 0xba
+ false, // [187] 0xbb
+ false, // [188] 0xbc
+ false, // [189] 0xbd
+ false, // [190] 0xbe
+ false, // [191] 0xbf
+ false, // [192] 0xc0
+ false, // [193] 0xc1
+ false, // [194] 0xc2
+ false, // [195] 0xc3
+ false, // [196] 0xc4
+ false, // [197] 0xc5
+ false, // [198] 0xc6
+ false, // [199] 0xc7
+ false, // [200] 0xc8
+ false, // [201] 0xc9
+ false, // [202] 0xca
+ false, // [203] 0xcb
+ false, // [204] 0xcc
+ false, // [205] 0xcd
+ false, // [206] 0xce
+ false, // [207] 0xcf
+ false, // [208] 0xd0
+ false, // [209] 0xd1
+ false, // [210] 0xd2
+ false, // [211] 0xd3
+ false, // [212] 0xd4
+ false, // [213] 0xd5
+ false, // [214] 0xd6
+ false, // [215] 0xd7
+ false, // [216] 0xd8
+ false, // [217] 0xd9
+ false, // [218] 0xda
+ false, // [219] 0xdb
+ false, // [220] 0xdc
+ false, // [221] 0xdd
+ false, // [222] 0xde
+ false, // [223] 0xdf
+ false, // [224] 0xe0
+ false, // [225] 0xe1
+ false, // [226] 0xe2
+ false, // [227] 0xe3
+ false, // [228] 0xe4
+ false, // [229] 0xe5
+ false, // [230] 0xe6
+ false, // [231] 0xe7
+ false, // [232] 0xe8
+ false, // [233] 0xe9
+ false, // [234] 0xea
+ false, // [235] 0xeb
+ false, // [236] 0xec
+ false, // [237] 0xed
+ false, // [238] 0xee
+ false, // [239] 0xef
+ false, // [240] 0xf0
+ false, // [241] 0xf1
+ false, // [242] 0xf2
+ false, // [243] 0xf3
+ false, // [244] 0xf4
+ false, // [245] 0xf5
+ false, // [246] 0xf6
+ false, // [247] 0xf7
+ false, // [248] 0xf8
+ false, // [249] 0xf9
+ false, // [250] 0xfa
+ false, // [251] 0xfb
+ false, // [252] 0xfc
+ false, // [253] 0xfd
+ false, // [254] 0xfe
+ false, // [255] 0xff
+ };
+
+bool g_IsACGTU[256] = // Indexed by byte value: true for A, C, G, T, U in upper or lower case; unlike g_IsNucleoChar, 'N'/'n' is false here.
+ {
+ false, // [ 0] 0x00
+ false, // [ 1] 0x01
+ false, // [ 2] 0x02
+ false, // [ 3] 0x03
+ false, // [ 4] 0x04
+ false, // [ 5] 0x05
+ false, // [ 6] 0x06
+ false, // [ 7] 0x07
+ false, // [ 8] 0x08
+ false, // [ 9] 0x09
+ false, // [ 10] 0x0a
+ false, // [ 11] 0x0b
+ false, // [ 12] 0x0c
+ false, // [ 13] 0x0d
+ false, // [ 14] 0x0e
+ false, // [ 15] 0x0f
+ false, // [ 16] 0x10
+ false, // [ 17] 0x11
+ false, // [ 18] 0x12
+ false, // [ 19] 0x13
+ false, // [ 20] 0x14
+ false, // [ 21] 0x15
+ false, // [ 22] 0x16
+ false, // [ 23] 0x17
+ false, // [ 24] 0x18
+ false, // [ 25] 0x19
+ false, // [ 26] 0x1a
+ false, // [ 27] 0x1b
+ false, // [ 28] 0x1c
+ false, // [ 29] 0x1d
+ false, // [ 30] 0x1e
+ false, // [ 31] 0x1f
+ false, // [ 32] ' '
+ false, // [ 33] '!'
+ false, // [ 34] '"'
+ false, // [ 35] '#'
+ false, // [ 36] '$'
+ false, // [ 37] '%'
+ false, // [ 38] '&'
+ false, // [ 39] '''
+ false, // [ 40] '('
+ false, // [ 41] ')'
+ false, // [ 42] '*'
+ false, // [ 43] '+'
+ false, // [ 44] ','
+ false, // [ 45] '-'
+ false, // [ 46] '.'
+ false, // [ 47] '/'
+ false, // [ 48] '0'
+ false, // [ 49] '1'
+ false, // [ 50] '2'
+ false, // [ 51] '3'
+ false, // [ 52] '4'
+ false, // [ 53] '5'
+ false, // [ 54] '6'
+ false, // [ 55] '7'
+ false, // [ 56] '8'
+ false, // [ 57] '9'
+ false, // [ 58] ':'
+ false, // [ 59] ';'
+ false, // [ 60] '<'
+ false, // [ 61] '='
+ false, // [ 62] '>'
+ false, // [ 63] '?'
+ false, // [ 64] '@'
+ true, // [ 65] 'A' (ACGTU)
+ false, // [ 66] 'B'
+ true, // [ 67] 'C' (ACGTU)
+ false, // [ 68] 'D'
+ false, // [ 69] 'E'
+ false, // [ 70] 'F'
+ true, // [ 71] 'G' (ACGTU)
+ false, // [ 72] 'H'
+ false, // [ 73] 'I'
+ false, // [ 74] 'J'
+ false, // [ 75] 'K'
+ false, // [ 76] 'L'
+ false, // [ 77] 'M'
+ false, // [ 78] 'N'
+ false, // [ 79] 'O'
+ false, // [ 80] 'P'
+ false, // [ 81] 'Q'
+ false, // [ 82] 'R'
+ false, // [ 83] 'S'
+ true, // [ 84] 'T' (ACGTU)
+ true, // [ 85] 'U' (ACGTU)
+ false, // [ 86] 'V'
+ false, // [ 87] 'W'
+ false, // [ 88] 'X'
+ false, // [ 89] 'Y'
+ false, // [ 90] 'Z'
+ false, // [ 91] '['
+ false, // [ 92] '\'
+ false, // [ 93] ']'
+ false, // [ 94] '^'
+ false, // [ 95] '_'
+ false, // [ 96] '`'
+ true, // [ 97] 'a' (ACGTU)
+ false, // [ 98] 'b'
+ true, // [ 99] 'c' (ACGTU)
+ false, // [100] 'd'
+ false, // [101] 'e'
+ false, // [102] 'f'
+ true, // [103] 'g' (ACGTU)
+ false, // [104] 'h'
+ false, // [105] 'i'
+ false, // [106] 'j'
+ false, // [107] 'k'
+ false, // [108] 'l'
+ false, // [109] 'm'
+ false, // [110] 'n'
+ false, // [111] 'o'
+ false, // [112] 'p'
+ false, // [113] 'q'
+ false, // [114] 'r'
+ false, // [115] 's'
+ true, // [116] 't' (ACGTU)
+ true, // [117] 'u' (ACGTU)
+ false, // [118] 'v'
+ false, // [119] 'w'
+ false, // [120] 'x'
+ false, // [121] 'y'
+ false, // [122] 'z'
+ false, // [123] '{'
+ false, // [124] '|'
+ false, // [125] '}'
+ false, // [126] '~'
+ false, // [127] 0x7f
+ false, // [128] 0x80
+ false, // [129] 0x81
+ false, // [130] 0x82
+ false, // [131] 0x83
+ false, // [132] 0x84
+ false, // [133] 0x85
+ false, // [134] 0x86
+ false, // [135] 0x87
+ false, // [136] 0x88
+ false, // [137] 0x89
+ false, // [138] 0x8a
+ false, // [139] 0x8b
+ false, // [140] 0x8c
+ false, // [141] 0x8d
+ false, // [142] 0x8e
+ false, // [143] 0x8f
+ false, // [144] 0x90
+ false, // [145] 0x91
+ false, // [146] 0x92
+ false, // [147] 0x93
+ false, // [148] 0x94
+ false, // [149] 0x95
+ false, // [150] 0x96
+ false, // [151] 0x97
+ false, // [152] 0x98
+ false, // [153] 0x99
+ false, // [154] 0x9a
+ false, // [155] 0x9b
+ false, // [156] 0x9c
+ false, // [157] 0x9d
+ false, // [158] 0x9e
+ false, // [159] 0x9f
+ false, // [160] 0xa0
+ false, // [161] 0xa1
+ false, // [162] 0xa2
+ false, // [163] 0xa3
+ false, // [164] 0xa4
+ false, // [165] 0xa5
+ false, // [166] 0xa6
+ false, // [167] 0xa7
+ false, // [168] 0xa8
+ false, // [169] 0xa9
+ false, // [170] 0xaa
+ false, // [171] 0xab
+ false, // [172] 0xac
+ false, // [173] 0xad
+ false, // [174] 0xae
+ false, // [175] 0xaf
+ false, // [176] 0xb0
+ false, // [177] 0xb1
+ false, // [178] 0xb2
+ false, // [179] 0xb3
+ false, // [180] 0xb4
+ false, // [181] 0xb5
+ false, // [182] 0xb6
+ false, // [183] 0xb7
+ false, // [184] 0xb8
+ false, // [185] 0xb9
+ false, // [186] 0xba
+ false, // [187] 0xbb
+ false, // [188] 0xbc
+ false, // [189] 0xbd
+ false, // [190] 0xbe
+ false, // [191] 0xbf
+ false, // [192] 0xc0
+ false, // [193] 0xc1
+ false, // [194] 0xc2
+ false, // [195] 0xc3
+ false, // [196] 0xc4
+ false, // [197] 0xc5
+ false, // [198] 0xc6
+ false, // [199] 0xc7
+ false, // [200] 0xc8
+ false, // [201] 0xc9
+ false, // [202] 0xca
+ false, // [203] 0xcb
+ false, // [204] 0xcc
+ false, // [205] 0xcd
+ false, // [206] 0xce
+ false, // [207] 0xcf
+ false, // [208] 0xd0
+ false, // [209] 0xd1
+ false, // [210] 0xd2
+ false, // [211] 0xd3
+ false, // [212] 0xd4
+ false, // [213] 0xd5
+ false, // [214] 0xd6
+ false, // [215] 0xd7
+ false, // [216] 0xd8
+ false, // [217] 0xd9
+ false, // [218] 0xda
+ false, // [219] 0xdb
+ false, // [220] 0xdc
+ false, // [221] 0xdd
+ false, // [222] 0xde
+ false, // [223] 0xdf
+ false, // [224] 0xe0
+ false, // [225] 0xe1
+ false, // [226] 0xe2
+ false, // [227] 0xe3
+ false, // [228] 0xe4
+ false, // [229] 0xe5
+ false, // [230] 0xe6
+ false, // [231] 0xe7
+ false, // [232] 0xe8
+ false, // [233] 0xe9
+ false, // [234] 0xea
+ false, // [235] 0xeb
+ false, // [236] 0xec
+ false, // [237] 0xed
+ false, // [238] 0xee
+ false, // [239] 0xef
+ false, // [240] 0xf0
+ false, // [241] 0xf1
+ false, // [242] 0xf2
+ false, // [243] 0xf3
+ false, // [244] 0xf4
+ false, // [245] 0xf5
+ false, // [246] 0xf6
+ false, // [247] 0xf7
+ false, // [248] 0xf8
+ false, // [249] 0xf9
+ false, // [250] 0xfa
+ false, // [251] 0xfb
+ false, // [252] 0xfc
+ false, // [253] 0xfd
+ false, // [254] 0xfe
+ false, // [255] 0xff
+ };
+
+float g_AminoFreqs[20] = // One value per amino acid, in the alphabetical one-letter order labelled below. NOTE(review): presumably background frequencies indexed by amino letter; confirm the data source.
+ {
+ 0.0777f, // 'A' = Ala
+ 0.0161f, // 'C' = Cys
+ 0.0527f, // 'D' = Asp
+ 0.0631f, // 'E' = Glu
+ 0.0417f, // 'F' = Phe
+ 0.0718f, // 'G' = Gly
+ 0.0238f, // 'H' = His
+ 0.0606f, // 'I' = Ile
+ 0.0601f, // 'K' = Lys
+ 0.0906f, // 'L' = Leu
+ 0.0233f, // 'M' = Met
+ 0.0439f, // 'N' = Asn
+ 0.0456f, // 'P' = Pro
+ 0.0368f, // 'Q' = Gln
+ 0.0526f, // 'R' = Arg
+ 0.0639f, // 'S' = Ser
+ 0.0570f, // 'T' = Thr
+ 0.0712f, // 'V' = Val
+ 0.0134f, // 'W' = Trp
+ 0.0339f, // 'Y' = Tyr
+ };
--- /dev/null
+#ifndef alpha_h\r
+#define alpha_h\r
+\r
+#include <limits.h>\r
+#include <string>\r
+\r
+using namespace std;\r
+\r
+const unsigned INVALID_LETTER = 0;\r
+const unsigned char INVALID_CHAR = '?';\r
+\r
+extern unsigned g_CharToLetterAmino[];\r
+extern unsigned g_CharToLetterAminoStop[];\r
+extern unsigned char g_LetterToCharAmino[];\r
+extern unsigned g_CharToLetterNucleo[];\r
+extern unsigned char g_LetterToCharNucleo[];\r
+extern unsigned g_CodonWordToAminoLetter[];\r
+extern char g_CodonWordToAminoChar[];\r
+extern unsigned char g_CharToCompChar[];\r
+extern unsigned g_CharToCompLetter[];\r
+extern bool g_IsAminoChar[];\r
+extern bool g_IsNucleoChar[];\r
+extern bool g_IsACGTU[];\r
+extern float g_AminoFreqs[];\r
+\r
+extern unsigned g_CharToLetterRed[];\r
+extern unsigned char g_LetterToCharRed[];\r
+extern unsigned g_RedAlphaSize;\r
+\r
+void LogRedAlphaRed();\r
+void ReadRedAlphaFromFile(const string &FileName);\r
+unsigned char GetAminoCharFrom3NucChars(unsigned char c1, unsigned char c2,\r
+ unsigned char c3);\r
+\r
+static inline bool AminoLetterIsStartCodon(unsigned char Letter) /* True exactly when Letter is 10. */\r
+ {\r
+ return Letter == 10; /* NOTE(review): 10 is presumably the letter index of 'M' (Met) in the amino alphabet; confirm against g_LetterToCharAmino. */\r
+ }\r
+\r
+static inline bool AminoLetterIsStopCodon(unsigned char Letter) /* True exactly when Letter is 20. */\r
+ {\r
+ return Letter == 20; /* NOTE(review): 20 is presumably the stop symbol, one past the 20 amino letters; confirm against g_CharToLetterAminoStop. */\r
+ }\r
+\r
+const char *WordToStr(unsigned Word, unsigned WordLength, bool Nucleo);\r
+const char *WordToStrNucleo(unsigned Word, unsigned WordLength);\r
+const char *WordToStrAmino(unsigned Word, unsigned WordLength);\r
+const char *WordToStrAmino2(unsigned Word, unsigned WordLength, char *Str);\r
+\r
+#endif // alpha_h\r
--- /dev/null
+#include "myutils.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+\r
+bool isgap(byte c) /* True when c is one of the two gap symbols '-' or '.'. */\r
+ {\r
+ return c == '-' || c == '.';\r
+ }\r
+\r
+// Decodes Word as WordLength base-20 digits into amino characters via\r
+// g_LetterToCharAmino; the least-significant digit becomes the last\r
+// character. Returns a pointer to a function-static buffer, so the result\r
+// is overwritten by the next call and must not be freed by the caller.\r
+const char *WordToStrAmino(unsigned Word, unsigned WordLength)\r
+ {\r
+ static char Str[32];\r
+ // The buffer is fixed-size: WordLength characters plus the terminating\r
+ // NUL must fit, otherwise the writes below would run past the array.\r
+ asserta(WordLength < sizeof(Str));\r
+ for (unsigned i = 0; i < WordLength; ++i)\r
+ {\r
+ unsigned Letter = Word%20;\r
+ Str[WordLength-i-1] = g_LetterToCharAmino[Letter];\r
+ Word /= 20;\r
+ }\r
+ Str[WordLength] = 0;\r
+ return Str;\r
+ }\r
+\r
+const char *WordToStrAmino2(unsigned Word, unsigned WordLength, char *Str) /* Same decoding as WordToStrAmino but into the caller's buffer; Str needs room for WordLength+1 chars. Returns Str. */\r
+ {\r
+ for (unsigned i = 0; i < WordLength; ++i)\r
+ {\r
+ unsigned Letter = Word%20; /* Lowest base-20 digit of what is left of Word. */\r
+ Str[WordLength-i-1] = g_LetterToCharAmino[Letter]; /* Filled back to front, so the lowest digit ends up last. */\r
+ Word /= 20;\r
+ }\r
+ Str[WordLength] = 0; /* NUL terminator. */\r
+ return Str;\r
+ }\r
+\r
+// Decodes Word as WordLength base-4 digits into nucleotide characters via\r
+// g_LetterToCharNucleo; the least-significant digit becomes the last\r
+// character. Returns a pointer to a function-static buffer, so the result\r
+// is overwritten by the next call and must not be freed by the caller.\r
+const char *WordToStrNucleo(unsigned Word, unsigned WordLength)\r
+ {\r
+ static char Str[32];\r
+ // The buffer is fixed-size: WordLength characters plus the terminating\r
+ // NUL must fit, otherwise the writes below would run past the array.\r
+ asserta(WordLength < sizeof(Str));\r
+ for (unsigned i = 0; i < WordLength; ++i)\r
+ {\r
+ unsigned Letter = Word%4;\r
+ Str[WordLength-i-1] = g_LetterToCharNucleo[Letter];\r
+ Word /= 4;\r
+ }\r
+ Str[WordLength] = 0;\r
+ return Str;\r
+ }\r
+\r
+const char *WordToStr(unsigned Word, unsigned WordLength, bool Nucleo) /* Dispatches to WordToStrNucleo when Nucleo is true, otherwise to WordToStrAmino; the result points at that function's static buffer. */\r
+ {\r
+ return (Nucleo ? WordToStrNucleo : WordToStrAmino)(Word, WordLength);\r
+ }\r
+\r
+byte *RevCompAlloc(const byte *Seq, unsigned L) /* Returns a new L-byte buffer holding the reverse complement of Seq[0..L-1]; no terminator is appended. */\r
+ {\r
+ byte *RCSeq = MYALLOC(byte, L, Alpha); /* NOTE(review): the caller presumably releases this with the matching project free macro; confirm. */\r
+\r
+ for (unsigned i = 0; i < L; ++i)\r
+ RCSeq[L-i-1] = g_CharToCompChar[Seq[i]]; /* Complement each byte and store it mirrored. */\r
+\r
+ return RCSeq;\r
+ }\r
+\r
+void RevCompInPlace(byte *Seq, unsigned L) /* Replaces Seq[0..L-1] with its reverse complement without a second buffer. */\r
+ {\r
+ unsigned L1 = L - 1; /* Wraps when L is 0, but is then never used: the loop runs zero times. */\r
+ unsigned L2 = L/2;\r
+ for (unsigned i = 0; i < L2; ++i)\r
+ {\r
+ unsigned j = L1 - i; /* Mirror position of i. */\r
+ unsigned ci = Seq[i];\r
+ unsigned cj = Seq[j];\r
+\r
+ unsigned ri = g_CharToCompChar[ci];\r
+ unsigned rj = g_CharToCompChar[cj];\r
+\r
+ Seq[i] = rj; /* Swap the two ends, complementing both. */\r
+ Seq[j] = ri;\r
+ }\r
+\r
+ if (L%2 == 1) /* Odd length: the middle byte has no partner and is complemented in place. */\r
+ Seq[L2] = g_CharToCompChar[Seq[L2]];\r
+ }\r
+\r
+void RevComp(const byte *Seq, unsigned L, byte *RCSeq) /* Writes the reverse complement of Seq[0..L-1] into RCSeq[0..L-1]; RCSeq must hold L bytes. */\r
+ {\r
+ for (unsigned i = 0; i < L; ++i)\r
+ RCSeq[L-i-1] = g_CharToCompChar[Seq[i]]; /* Output is written while input is still being read, so the two buffers should not alias. */\r
+ }\r
+\r
+unsigned char GetAminoCharFrom3NucChars(unsigned char c1, unsigned char c2,\r
+ unsigned char c3) /* Translates the codon c1 c2 c3 into an amino character using the lookup tables. */\r
+ {\r
+ unsigned Letter1 = g_CharToLetterNucleo[c1];\r
+ unsigned Letter2 = g_CharToLetterNucleo[c2];\r
+ unsigned Letter3 = g_CharToLetterNucleo[c3];\r
+ unsigned Word = Letter1*(4*4) + Letter2*4 + Letter3; /* Base-4 word with c1 as the most significant digit. NOTE(review): assumes all three letters are 0..3; what the table returns for non-nucleotide input is not visible here. */\r
+\r
+ unsigned Letter = g_CodonWordToAminoLetter[Word];\r
+ return g_LetterToCharAmino[Letter];\r
+ }\r
--- /dev/null
+#ifndef chainer_h\r
+#define chainer_h\r
+\r
+#include "hsp.h"\r
+#include "seq.h"\r
+#include <list>\r
+\r
+const float BAD_SCORE = -9e9f;\r
+\r
+struct TargetHit /* Plain record for one hit: a target index, a lo..hi range, the query frame and a score. */\r
+ {\r
+ unsigned TargetIndex;\r
+ unsigned TargetLo;\r
+ unsigned TargetHi;\r
+ int QueryFrame;\r
+ float RawScore; // SOMETIMES USED FOR BIT SCORE!!!\r
+// unsigned TargetLength;\r
+\r
+ void LogMe() const /* Logs lo, hi, frame and score; TargetIndex is not printed. */\r
+ {\r
+ Log("lo %u, hi %u, frame %d, score %.1f\n",\r
+ TargetLo, TargetHi, QueryFrame, RawScore);\r
+ }\r
+ };\r
+\r
+struct ChainData /* Plain record with an HSP index, two upper coordinates (Ahi, Bhi) and a score. NOTE(review): not referenced by any declaration visible in this header; confirm where it is used. */\r
+ {\r
+ unsigned LastHSPIndex;\r
+ unsigned Ahi;\r
+ unsigned Bhi;\r
+ float Score;\r
+ };\r
+\r
+class Chainer /* Declares HSP chaining (Chain, ChainBrute) and overlap resolution (ResolveOverlaps, ResolveOverlap); the implementations are not in this header. */\r
+ {\r
+public:\r
+ HSPData **m_HSPs; // memory owned elsewhere\r
+ unsigned m_HSPCount;\r
+ unsigned m_MaxHSPCount;\r
+\r
+ BPData *m_BPs;\r
+\r
+ unsigned *m_PrevHSPIndexes; // Predecessor in chain\r
+ float *m_HSPIndexToChainScore;\r
+\r
+ list<unsigned> m_Chains; // Live HSP indexes\r
+\r
+public:\r
+ Chainer();\r
+ ~Chainer();\r
+ void Reset();\r
+ void Clear(bool ctor = false);\r
+ float Chain(HSPData **HSPs, unsigned HSPCount, HSPData **OptChain,\r
+ unsigned &OptChainLength); /* NOTE(review): presumably returns the best chain's score and writes the chain to OptChain / OptChainLength; confirm in the implementation. */\r
+ bool ResolveOverlaps(const SeqData &SA, const SeqData &SB, double MinScore,\r
+ const float * const *SubstMx, HSPData **InHSPs, unsigned InHSPCount,\r
+ HSPData **OutHSPs, unsigned &OutHSPCount);\r
+ void ResolveOverlap(HSPData &HSP1, HSPData &HSP2);\r
+\r
+ float ChainBrute(HSPData **HSPs, unsigned HSPCount, HSPData **OptChain,\r
+ unsigned &OptChainLength); /* Same signature as Chain. */\r
+ void LogMe() const;\r
+ void LogHSPs(HSPData **HSPs, unsigned HSPCount) const;\r
+ void LogBPs() const;\r
+\r
+ static bool IsValidChain(HSPData **HSPs, unsigned HSPCount); /* Static helpers below take a chain as an array of HSP pointers and use no instance state. */\r
+ static void AssertValidChain(HSPData **HSPs, unsigned HSPCount);\r
+ static void LogChain(HSPData **HSPs, unsigned HSPCount);\r
+ static void LogChain2(HSPData **HSPs, unsigned HSPCount);\r
+ static float GetChainScore(HSPData **HSPs, unsigned HSPCount);\r
+\r
+private:\r
+ void AllocHSPCount(unsigned MaxHSPCount);\r
+ void SetBPs();\r
+ void SortBPs();\r
+ unsigned FindBestChainLT(unsigned Ahi, unsigned Bhi);\r
+ };\r
+\r
+#endif // chainer_h\r
--- /dev/null
+#ifndef chime_h\r
+#define chime_h\r
+\r
+#include "seq.h"\r
+\r
+struct ChimeHit2 /* Result record for one chimera test. NOTE(review): Q, A and B presumably denote the query and two candidate parents; confirm against the code that fills this in. */\r
+ {\r
+ string QLabel;\r
+ string ALabel;\r
+ string BLabel;\r
+ string Q3;\r
+ string A3;\r
+ string B3;\r
+\r
+ //unsigned LY, LN, LA, LD;\r
+ //unsigned RY, RN, RA, RD;\r
+ double PctIdQT, PctIdQA, PctIdQB, PctIdQM, PctIdAB; /* Logged with a '%' suffix in LogMe, so these are percentages; -1.0 means unset. */\r
+\r
+ unsigned ColLo;\r
+ unsigned ColXLo;\r
+ unsigned ColXHi;\r
+ unsigned ColHi;\r
+ unsigned QXLo;\r
+ unsigned QXHi;\r
+\r
+ double Div;\r
+ double Score;\r
+ double H;\r
+\r
+ unsigned CS_LY, CS_LN, CS_LA, CS_RY, CS_RN, CS_RA; /* Counts logged as LY/LN/LA and RY/RN/RA; UINT_MAX means unset. */\r
+\r
+ float AbQ;\r
+ float AbA;\r
+ float AbB;\r
+\r
+ ChimeHit2() /* Starts every field in the unset state established by Clear(). */\r
+ {\r
+ Clear();\r
+ }\r
+\r
+ void Clear() /* Empties all strings and sets unsigned fields to UINT_MAX and floating-point fields to -1. */\r
+ {\r
+ Q3.clear();\r
+ A3.clear();\r
+ B3.clear();\r
+ QLabel.clear();\r
+ ALabel.clear();\r
+ BLabel.clear();\r
+\r
+ //LY = LN = LA = LD = UINT_MAX;\r
+ //RY = RN = RA = RD = UINT_MAX;\r
+ ColLo = ColHi = QXLo = QXHi = ColXLo = ColXHi = UINT_MAX;\r
+ CS_LY = CS_LN = CS_LA = CS_RY = CS_RN = CS_RA = UINT_MAX;\r
+ PctIdQT = PctIdQA = PctIdQB = PctIdQM = PctIdAB = -1.0;\r
+ Div = -1.0;\r
+ H = -1.0;\r
+ Score = -1.0;\r
+ AbQ = AbA = AbB = -1.0f;\r
+ };\r
+\r
+ bool Accept() const /* True when Score >= opt_minh, Div >= opt_mindiv, and both CS_LY and CS_RY >= opt_mindiffs. */\r
+ {\r
+ return Score >= opt_minh && Div >= opt_mindiv && CS_LY >= opt_mindiffs && CS_RY >= opt_mindiffs;\r
+ }\r
+\r
+ void LogMe() const /* Writes a one-line summary to the log. */\r
+ {\r
+ Log("@L %c ", yon(Score >= 1.0 && Div >= 1.0)); /* NOTE(review): this flag uses fixed 1.0 thresholds, not the opt_ thresholds that Accept() uses. */\r
+ Log(" %.4f", Score);\r
+ Log(" LY %u LN %u LA %u", CS_LY, CS_LN, CS_LA);\r
+ Log(" RY %u RN %u RA %u", CS_RY, CS_RN, CS_RA);\r
+ Log(" Div %.1f%%", Div);\r
+ Log(" Q=%s", QLabel.c_str());\r
+ Log(" A=%s", ALabel.c_str());\r
+ Log(" B=%s", BLabel.c_str());\r
+ Log(" QA %.1f%% QB=%.1f%% AB=%.1f%% QM=%.1f%%", PctIdQA, PctIdQB, PctIdAB, PctIdQM);\r
+ Log("\n");\r
+ }\r
+\r
+ bool operator<(const ChimeHit2 &rhs) const /* Sort order: higher Score first; equal Scores are ordered by higher Div first. */\r
+ {\r
+ if (Score == rhs.Score)\r
+ return Div > rhs.Div;\r
+ return Score > rhs.Score;\r
+ }\r
+ };\r
+\r
+static inline bool isacgt(char c) /* True only for upper-case 'A', 'C', 'G' or 'T'; lower-case letters and 'U' return false. */\r
+ {\r
+ return c == 'A' || c == 'C' || c == 'G' || c == 'T';\r
+ }\r
+\r
+static bool inline isgap(char c) /* True when c is one of the two gap symbols '-' or '.'. */\r
+ {\r
+ return c == '-' || c == '.';\r
+ }\r
+\r
+void GetChunkInfo(unsigned L, unsigned &Length, vector<unsigned> &Los);\r
+float GetAbFromLabel(const string &Label);\r
+void WriteChimeHitCS(FILE *f, const ChimeHit2 &Hit);\r
+void WriteChimeHit(FILE *f, const ChimeHit2 &Hit);\r
+void WriteChimeFileHdr(FILE *f);\r
+\r
+#endif // chime_h\r
--- /dev/null
+C(Search) /* Each line invokes the macro C with one counter name. NOTE(review): the including file presumably defines C before including this list; confirm the expansion at the include site. */\r
+C(SearchBlast)\r
+C(HotHits)\r
+C(HotHits2)\r
+C(WindexAccepts)\r
+C(WindexRejects)\r
+C(AlnAccepts)\r
+C(AlnRejects)\r
+C(Seqs)\r
+C(FilterAccepts)\r
+C(FilterRejects)\r
+C(DiagRejects)\r
+C(DPTooBig)\r
+C(HotHitCut)\r
+C(FastRejects)\r
+C(FastRejects2)\r
+C(Step)\r
+C(HSPConflict)\r
+C(DPArea)\r
+C(DPArea2)\r
+C(DPArea3)\r
+C(DPArea4)\r
+C(DPArea5)\r
+C(HSPIdRejects)\r
+C(NoHSPRejects)\r
+C(NoHSPAccepts)\r
+C(BandRejects)\r
+C(FractIdBestSeg)\r
+C(FractIdHSPs)\r
+C(Excludes)\r
+C(NonExcludes)\r
+C(AlignQueryToSeed)\r
+C(PWA_Align)\r
+C(HitExtends)\r
+C(FailedExtends)\r
+C(HitExtendLetters)\r
+C(FailedExtendLetters)\r
+C(AddWords)\r
+C(AddWordGrows)\r
--- /dev/null
+#ifndef diagbox_h\r
+#define diagbox_h\r
+\r
+struct DiagBox;\r
+\r
+void GetDiagBox(unsigned LA, unsigned LB, unsigned DiagLo, unsigned DiagHi, DiagBox &Box);\r
+void GetDiagRange(unsigned LA, unsigned LB, unsigned d,\r
+ unsigned &mini, unsigned &minj, unsigned &maxi, unsigned &maxj);\r
+void GetDiagLoHi(unsigned LA, unsigned LB, const char *Path,\r
+ unsigned &dlo, unsigned &dhi);\r
+\r
+// A band of diagonals dlo..dhi in the LA x LB alignment matrix, together\r
+// with the first and last cell of the lowest and the highest diagonal.\r
+// Diagonals are numbered d = LA - i + j (see GetDiag).\r
+struct DiagBox\r
+ {\r
+ // Leaves every field uninitialized; call Init() before using the box.\r
+ DiagBox()\r
+ {\r
+ }\r
+\r
+ DiagBox(unsigned LA_, unsigned LB_, unsigned DiagLo, unsigned DiagHi)\r
+ {\r
+ //GetDiagBox(LA, LB, DiagLo, DiagHi, *this);\r
+ //Validate();\r
+ Init(LA_, LB_, DiagLo, DiagHi);\r
+ }\r
+\r
+ // Fills this box through GetDiagBox and then checks it with Validate().\r
+ void Init(unsigned LA_, unsigned LB_, unsigned DiagLo, unsigned DiagHi)\r
+ {\r
+ GetDiagBox(LA_, LB_, DiagLo, DiagHi, *this);\r
+ Validate();\r
+ }\r
+\r
+ unsigned LA;\r
+ unsigned LB;\r
+\r
+ unsigned dlo;\r
+ unsigned dhi;\r
+\r
+ unsigned dlo_mini;\r
+ unsigned dlo_minj;\r
+\r
+ unsigned dlo_maxi;\r
+ unsigned dlo_maxj;\r
+\r
+ unsigned dhi_mini;\r
+ unsigned dhi_minj;\r
+\r
+ unsigned dhi_maxi;\r
+ unsigned dhi_maxj;\r
+\r
+ // Diagonal number of cell (i, j).\r
+ unsigned GetDiag(unsigned i, unsigned j) const\r
+ {\r
+ return LA - i + j;\r
+ }\r
+\r
+// i, j are positions 0..LA-1, 0..LB-1.\r
+ bool InBox(unsigned i, unsigned j) const\r
+ {\r
+ unsigned d = GetDiag(i, j);\r
+ return d >= dlo && d <= dhi;\r
+ }\r
+\r
+/***\r
+i, j are 0-based prefix lengths 0..LA, 0..LB.\r
+\r
+A full path is in the box iff all match pairs are in the box.\r
+\r
+A partial path (that aligns a prefix of A to a prefix of B as\r
+in D.P.) is in the box iff it is the prefix of at least\r
+one full path that is in the box.\r
+\r
+A D.P. matrix entry X[i][j] is in the box iff there is at\r
+least one full path aligning the first i letters of A and\r
+the first j letters of B ending in a column of type X, i.e.\r
+if there exists a partial path in the box that ends in X.\r
+\r
+Assume terminals appear in all paths, and DI/ID forbidden.\r
+\r
+Intuitively it seems that by these definitions D is in box iff\r
+DM or MD is in box, I is in box iff IM or MI is in box.\r
+Don't have a proof.\r
+***/\r
+ bool InBoxDPM(unsigned i, unsigned j) const\r
+ {\r
+ // Special case for M[0][0]\r
+ if (i == 0 && j == 0)\r
+ return true;\r
+ if (i == 0 || j == 0)\r
+ return false;\r
+ unsigned d = GetDiag(i-1, j-1);\r
+ return d >= dlo && d <= dhi;\r
+ }\r
+\r
+ // D state: in the box when the M cell above (MD) or the next M cell on\r
+ // the diagonal (DM) is in the box; edge cells have no such neighbour.\r
+ bool InBoxDPD(unsigned i, unsigned j) const\r
+ {\r
+ bool MD = i == 0 ? false : InBoxDPM(i-1, j);\r
+ bool DM = (i == LA || j == LB) ? false : InBoxDPM(i+1, j+1);\r
+ return MD || DM;\r
+ }\r
+\r
+ // I state: mirror image of InBoxDPD, using the M cell to the left (MI).\r
+ bool InBoxDPI(unsigned i, unsigned j) const\r
+ {\r
+ bool MI = j == 0 ? false : InBoxDPM(i, j-1);\r
+ bool IM = (i == LA || j == LB) ? false : InBoxDPM(i+1, j+1);\r
+ return MI || IM;\r
+ }\r
+\r
+ // d = LA - i + j = 1 .. LA+LB-1\r
+ // Asserts that dlo..dhi is a valid range and that the stored end-point\r
+ // cells lie on their diagonals and are ordered consistently.\r
+ void Validate() const\r
+ {\r
+ asserta(dlo <= dhi);\r
+ asserta(dlo >= GetDiag(LA-1, 0));\r
+ asserta(dhi <= GetDiag(0, LB-1));\r
+\r
+ asserta(GetDiag(dlo_mini, dlo_minj) == dlo);\r
+ asserta(GetDiag(dlo_maxi, dlo_maxj) == dlo);\r
+ asserta(GetDiag(dhi_mini, dhi_minj) == dhi);\r
+ asserta(GetDiag(dhi_maxi, dhi_maxj) == dhi);\r
+\r
+ asserta(dlo_mini >= dhi_mini);\r
+ asserta(dlo_minj <= dhi_minj);\r
+ asserta(dlo_maxi >= dhi_maxi);\r
+ asserta(dlo_maxj <= dhi_maxj);\r
+ }\r
+\r
+ // Bounding rows and columns of the band, taken from the end-point cells.\r
+ unsigned GetMini() const\r
+ {\r
+ return dhi_mini;\r
+ }\r
+\r
+ unsigned GetMaxi() const\r
+ {\r
+ return dlo_maxi;\r
+ }\r
+\r
+ unsigned GetMinj() const\r
+ {\r
+ return dlo_minj;\r
+ }\r
+\r
+ unsigned GetMaxj() const\r
+ {\r
+ return dhi_maxj;\r
+ }\r
+/***\r
+ i = 0..LA-1\r
+ j = 0..LB-1\r
+ d = LA - i + j = 1 .. LA+LB-1\r
+ j = d - LA + i\r
+ i = LA - d + j\r
+***/\r
+ // For row i, sets Startj to the first column inside the band and Endj to\r
+ // one past the last, both clamped to the matrix.\r
+ void GetRange_j(unsigned i, unsigned &Startj, unsigned &Endj) const\r
+ {\r
+ // j = d - LA + i\r
+ if (dlo + i >= LA)\r
+ Startj = dlo + i - LA;\r
+ else\r
+ Startj = 0;\r
+\r
+ if (Startj >= LB)\r
+ Startj = LB - 1;\r
+\r
+ if (dhi + i + 1 >= LA)\r
+ Endj = dhi + i + 1 - LA;\r
+ else\r
+ Endj = 0;\r
+\r
+ if (Endj > LB)\r
+ Endj = LB;\r
+\r
+ asserta(Endj >= Startj);\r
+ }\r
+\r
+ // Logs every field plus the derived i and j bounds on one line.\r
+ void LogMe() const\r
+ {\r
+ // LB is unsigned, so it is printed with %u like every other field.\r
+ Log("LA=%u LB=%u dlo(%u): (%u,%u)-(%u,%u) dhi(%u): (%u,%u)-(%u,%u) i=[%u-%u] j=[%u-%u]\n",\r
+ LA, LB,\r
+ dlo,\r
+ dlo_mini, dlo_minj,\r
+ dlo_maxi, dlo_maxj,\r
+ dhi,\r
+ dhi_mini, dhi_minj,\r
+ dhi_maxi, dhi_maxj,\r
+ GetMini(), GetMaxi(),\r
+ GetMinj(), GetMaxj());\r
+ }\r
+ };\r
+\r
+typedef const char *(*NWDIAG)(const byte *A, unsigned LA, const byte *B, unsigned LB,
+ unsigned DiagLo, unsigned DiagHi, bool LeftTerm, bool RightTerm);
+
+const char *NWBandWrap(NWDIAG NW, const byte *A, unsigned LA, const byte *B, unsigned LB,
+ unsigned DiagLo, unsigned DiagHi, bool LeftTerm, bool RightTerm);
+\r
+#endif // diagbox_h\r
--- /dev/null
+#ifndef dp_h\r
+#define dp_h\r
+\r
+#define SAVE_FAST 0\r
+\r
+#include "myutils.h"\r
+#include "mx.h"\r
+#include "seqdb.h"\r
+#include "diagbox.h"\r
+#include "path.h"\r
+#include "alnparams.h"\r
+#include "alnheuristics.h"\r
+#include "hspfinder.h"\r
+\r
+typedef void (*OnPathFn)(const string &Path, bool Full);\r
+\r
+enum XType\r
+ {\r
+ XType_Full=1,\r
+ XType_Fwd=2,\r
+ XType_Bwd=3,\r
+ };\r
+\r
+// public\r
+float ViterbiBrute(const byte *A, unsigned LA, const byte *B, unsigned LB, \r
+ unsigned DiagLo, unsigned DiagHi, const AlnParams &AP, PathData &PD);\r
+\r
+float ViterbiSimple(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, PathData &PD);\r
+\r
+float ViterbiSimpleBand(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, unsigned DiagLo, unsigned DiagHi, PathData &PD);\r
+\r
+float ViterbiFast(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, PathData &PD);\r
+\r
+float ViterbiFastBand(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ unsigned DiagLo, unsigned DiagHi, const AlnParams &AP, PathData &PD);\r
+\r
+float ViterbiFastMainDiag(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ unsigned BandRadius, const AlnParams &AP, PathData &PD);\r
+\r
+float XDropFwdSimple(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+\r
+float XDropBwdSimple(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+\r
+float XDropFwdFast(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+\r
+float XDropBwdFast(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+\r
+void XDropAlign(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ unsigned AncLoi, unsigned AncLoj, unsigned AncLen, const AlnParams &AP,\r
+ float XDrop, HSPData &HSP, PathData &PD);\r
+\r
+float SWSimple(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, unsigned &Loi, unsigned &Leni, unsigned &Lenj,\r
+ unsigned &Hij, PathData &PD);\r
+\r
+float SWFast(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, unsigned &Loi, unsigned &Leni, unsigned &Lenj,\r
+ unsigned &Hij, PathData &PD);\r
+\r
+void SWFast2(const SeqData &SA, const SeqData &SB, const AlnParams &AP,\r
+ HSPData &HSP, PathData &PD);\r
+\r
+void SWSimple2(const SeqData &SA, const SeqData &SB, const AlnParams &AP,\r
+ HSPData &HSP, PathData &PD);\r
+\r
+float SWUngapped(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const float * const *SubstMx, unsigned &LoA, unsigned &LoB, unsigned &Len);\r
+\r
+void SWUngapped2(const SeqData &SA, const SeqData &SB, const AlnParams &AP,\r
+ HSPData &HSP);\r
+\r
+float SWFastNTB(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP);\r
+\r
+void GlobalAlignBand(const byte *A, unsigned LA, const byte *B, unsigned LB,\r
+ const AlnParams &AP, unsigned BandRadius, PathData &PD);\r
+\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, const AlnParams &AP,\r
+ const AlnHeuristics &AH, HSPFinder &HF, float MinFractId, float &HSPFractId,\r
+ PathData &PD);\r
+\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path);\r
+\r
+void GetBruteMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+void GetSimpleDPMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+void GetSimpleBandMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+void GetXDropFwdSimpleDPMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+#if SAVE_FAST\r
+void GetFastMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+void GetFastBandMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I);\r
+#endif\r
+\r
+// private\r
+void TraceBackBit(unsigned LA, unsigned LB, char State, PathData &PD);\r
+void TraceBackBitSW(unsigned LA, unsigned LB, unsigned Besti, unsigned Bestj,\r
+ unsigned &Leni, unsigned &Lenj, PathData &PD);\r
+void EnumPaths(unsigned L1, unsigned L2, bool SubPaths, OnPathFn OnPath);\r
+void AllocBit(unsigned LA, unsigned LB);\r
+\r
+const byte TRACEBITS_DM = 0x01;\r
+const byte TRACEBITS_IM = 0x02;\r
+const byte TRACEBITS_MD = 0x04;\r
+const byte TRACEBITS_MI = 0x08;\r
+const byte TRACEBITS_SM = 0x10;\r
+const byte TRACEBITS_UNINIT = ~0x1f;\r
+\r
+extern Mx<byte> g_Mx_TBBit;\r
+extern float *g_DPRow1;\r
+extern float *g_DPRow2;\r
+extern byte **g_TBBit;\r
+\r
+static inline void Max_xM(float &Score, float MM, float DM, float IM, byte &State) /* Sets Score to the largest of MM, DM, IM and State to 'M', 'D' or 'I' to match. */\r
+ {\r
+ Score = MM;\r
+ State = 'M';\r
+\r
+ if (DM > Score) /* Strict comparison: on a tie the earlier candidate wins (M over D, D over I). */\r
+ {\r
+ Score = DM;\r
+ State = 'D';\r
+ }\r
+ if (IM > Score)\r
+ {\r
+ Score = IM;\r
+ State = 'I';\r
+ }\r
+ }\r
+\r
+// Two-way maximum of MD and DD.\r
+// Score receives the larger value and State the matching letter: 'M' when\r
+// MD >= DD (ties included), otherwise 'D'.\r
+static inline void Max_xD(float &Score, float MD, float DD, byte &State)\r
+	{\r
+	const bool FromM = (MD >= DD);\r
+	Score = FromM ? MD : DD;\r
+	State = FromM ? 'M' : 'D';\r
+	}\r
+\r
+// Two-way maximum of MI and II.\r
+// Score receives the larger value and State the matching letter: 'M' when\r
+// MI >= II (ties included), otherwise 'I'.\r
+static inline void Max_xI(float &Score, float MI, float II, byte &State)\r
+	{\r
+	const bool FromM = (MI >= II);\r
+	Score = FromM ? MI : II;\r
+	State = FromM ? 'M' : 'I';\r
+	}\r
+\r
+#endif // dp_h\r
--- /dev/null
+#ifndef evalue_h\r
+#define evalue_h\r
+\r
+#include <float.h>\r
+\r
+// Five-argument SetKarlin overload: takes the gapped and ungapped lambda and\r
+// K values plus the database length. (A stray backslash that followed the\r
+// closing semicolon, and acted as a line splice, has been removed.)\r
+void SetKarlin(double GappedLambda, double UngappedLambda,\r
+	double GappedK, double UngappedK, double DBLength);\r
+\r
+double GetKarlinDBLength();\r
+void SetKarlinDBLength(double DBLength);\r
+void LogKarlin();\r
+void SetKarlinAmino(double DBLength);\r
+void SetKarlinNucleo(double DBLength);\r
+void SetKarlin(double DBLength, bool Nucleo);\r
+double ComputeBitScoreGapped(double Score);\r
+double ComputeBitScoreUngapped(double Score);\r
+double ComputeEvalueGapped(double Score, unsigned QueryLength);\r
+double ComputeEvalueUngapped(double Score, unsigned QueryLength);\r
+double ComputeMinScoreGivenEvalueAGapped(double Evalue, unsigned Area);\r
+double ComputeMinScoreGivenEvalueAUngapped(double Evalue, unsigned Area);\r
+double ComputeMinScoreGivenEvalueQGapped(double Evalue, unsigned QueryLength);\r
+double ComputeMinScoreGivenEvalueQUngapped(double Evalue, unsigned QueryLength);\r
+double ComputeEvalueGappedFromBitScore(double BitScore, unsigned QueryLength);\r
+\r
+#endif // evalue_h\r
--- /dev/null
+#include "myutils.h"\r
+#include "alpha.h"\r
+\r
+//unsigned g_MaxL = 0;\r
+\r
+static bool *g_IsChar = g_IsAminoChar;\r
+\r
+// Term gaps allowed in query (A) only\r
+// Fractional identity selected by IdDef 5 in GetFractIdGivenPath.\r
+//   A, B     Sequences indexed by the path: 'M' consumes a letter of both,\r
+//            'D' a letter of A only, 'I' a letter of B only.\r
+//   Path     Null-terminated string of 'M'/'D'/'I' columns.\r
+//   ptrDesc  Optional; when non-null a short description is written to it\r
+//            with sprintf, so the caller must supply a large enough buffer.\r
+// Returns 0 at once if the path starts with 'D'; otherwise\r
+// 1 - Diffs/Cols over the scored columns (0 when no column is scored).\r
+static double GetFractIdGivenPathDerep(const byte *A, const byte *B, const char *Path,\r
+ char *ptrDesc)\r
+ {\r
+ if (*Path == 'D')\r
+ {\r
+ if (ptrDesc != 0)\r
+ sprintf(ptrDesc, "(term gap in Query)");\r
+ return 0;\r
+ }\r
+\r
+ // Remember the position of the last 'M' column (stays 0 if there is none).\r
+ const char *LastM = 0;\r
+ for (const char *p = Path; *p; ++p)\r
+ if (*p == 'M')\r
+ LastM = p;\r
+\r
+ unsigned PosA = 0;\r
+ unsigned PosB = 0;\r
+ unsigned Ids = 0;\r
+ unsigned Diffs = 0;\r
+ unsigned Cols = 0;\r
+ // The scan stops when it reaches LastM, so that column and everything\r
+ // after it is not scored.\r
+ // NOTE(review): leaving out the final 'M' column itself looks like a\r
+ // possible off-by-one -- confirm whether only the trailing gap columns\r
+ // were meant to be skipped.\r
+ for (const char *p = Path; *p && p != LastM; ++p)\r
+ {\r
+ ++Cols;\r
+ char c = *p;\r
+ if (c == 'M')\r
+ {\r
+ byte a = toupper(A[PosA]);\r
+ byte b = toupper(B[PosB]);\r
+ if (g_IsChar[a] && g_IsChar[b])\r
+ {\r
+ if (a == b)\r
+ ++Ids;\r
+ else\r
+ ++Diffs;\r
+ }\r
+ else\r
+ // A letter rejected by g_IsChar: undo the ++Cols above so the\r
+ // column is ignored.\r
+ --Cols;\r
+ }\r
+ // Every gap column counts as one difference.\r
+ if (c == 'D' || c == 'I')\r
+ ++Diffs;\r
+ if (c == 'M' || c == 'D')\r
+ ++PosA;\r
+ if (c == 'M' || c == 'I')\r
+ ++PosB;\r
+ }\r
+\r
+ double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols));\r
+ if (ptrDesc != 0)\r
+ sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
+ return FractId;\r
+ }\r
+\r
+// Fractional identity selected by IdDef 1 in GetFractIdGivenPath.\r
+// Every column of Path is scored: a gap column ('D' or 'I') is one\r
+// difference, an 'M' column is an identity or a difference depending on the\r
+// upper-cased letters, and an 'M' column whose letters are rejected by\r
+// g_IsChar is left out of the column count altogether.\r
+// Returns 1 - Diffs/Cols (0 when no column was counted). When ptrDesc is\r
+// non-null a short description is written to it with sprintf.\r
+static double GetFractIdGivenPathAllDiffs(const byte *A, const byte *B, const char *Path,\r
+	char *ptrDesc)\r
+	{\r
+	unsigned IxA = 0;\r
+	unsigned IxB = 0;\r
+	unsigned Ids = 0;\r
+	unsigned Diffs = 0;\r
+	unsigned Cols = 0;\r
+\r
+	for (const char *Col = Path; *Col != 0; ++Col)\r
+		{\r
+		switch (*Col)\r
+			{\r
+		case 'M':\r
+			{\r
+			const byte a = toupper(A[IxA++]);\r
+			const byte b = toupper(B[IxB++]);\r
+			if (!g_IsChar[a] || !g_IsChar[b])\r
+				break;		// letter outside the alphabet: column not counted\r
+			++Cols;\r
+			if (a == b)\r
+				++Ids;\r
+			else\r
+				++Diffs;\r
+			break;\r
+			}\r
+\r
+		case 'D':\r
+			++Cols;\r
+			++Diffs;\r
+			++IxA;\r
+			break;\r
+\r
+		case 'I':\r
+			++Cols;\r
+			++Diffs;\r
+			++IxB;\r
+			break;\r
+\r
+		default:\r
+			// Any other character still counts as a column.\r
+			++Cols;\r
+			break;\r
+			}\r
+		}\r
+\r
+	double FractId = 0.0;\r
+	if (Cols != 0)\r
+		FractId = 1.0 - double(Diffs)/double(Cols);\r
+	if (ptrDesc != 0)\r
+		sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
+	return FractId;\r
+	}\r
+\r
+// Fractional identity selected by IdDef 2 in GetFractIdGivenPath.\r
+// Only the columns from the first 'M' through the last 'M' (inclusive) are\r
+// scored, so gap columns before the first or after the last match are\r
+// ignored. Inside that span a gap column is one difference and an 'M' column\r
+// whose letters are rejected by g_IsChar is left out of the column count.\r
+// Returns 1 - Diffs/Cols, or 0 when the path has no 'M' at all. When ptrDesc\r
+// is non-null a short description is written to it.\r
+static double GetFractIdGivenPathInternalDiffs(const byte *A, const byte *B,\r
+	const char *Path, char *ptrDesc)\r
+	{\r
+	const char *FirstM = strchr(Path, 'M');\r
+	if (FirstM == 0)\r
+		{\r
+		if (ptrDesc != 0)\r
+			strcpy(ptrDesc, "(no matches)");\r
+		return 0.0;\r
+		}\r
+	const char *LastM = strrchr(Path, 'M');\r
+\r
+	unsigned IxA = 0;\r
+	unsigned IxB = 0;\r
+\r
+	// Step over the leading columns; none of them is an 'M'.\r
+	for (const char *Col = Path; Col != FirstM; ++Col)\r
+		{\r
+		if (*Col == 'D')\r
+			++IxA;\r
+		else if (*Col == 'I')\r
+			++IxB;\r
+		}\r
+\r
+	unsigned Ids = 0;\r
+	unsigned Diffs = 0;\r
+	unsigned Cols = 0;\r
+	for (const char *Col = FirstM; Col <= LastM; ++Col)\r
+		{\r
+		switch (*Col)\r
+			{\r
+		case 'M':\r
+			{\r
+			const byte a = toupper(A[IxA++]);\r
+			const byte b = toupper(B[IxB++]);\r
+			if (!g_IsChar[a] || !g_IsChar[b])\r
+				break;		// letter outside the alphabet: column not counted\r
+			++Cols;\r
+			if (a == b)\r
+				++Ids;\r
+			else\r
+				++Diffs;\r
+			break;\r
+			}\r
+\r
+		case 'D':\r
+			++Cols;\r
+			++Diffs;\r
+			++IxA;\r
+			break;\r
+\r
+		case 'I':\r
+			++Cols;\r
+			++Diffs;\r
+			++IxB;\r
+			break;\r
+\r
+		default:\r
+			++Cols;\r
+			break;\r
+			}\r
+		}\r
+\r
+	double FractId = 0.0;\r
+	if (Cols != 0)\r
+		FractId = 1.0 - double(Diffs)/double(Cols);\r
+	if (ptrDesc != 0)\r
+		sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
+	return FractId;\r
+	}\r
+\r
+// Fractional identity selected by IdDef 3 in GetFractIdGivenPath.\r
+// Differences are mismatching 'M' columns plus gap opens, divided by the\r
+// number of B letters consumed by the path (reported as target_length).\r
+//   A, B     Sequences indexed by the path ('M' both, 'D' A only, 'I' B only).\r
+//   Path     Null-terminated string of 'M'/'D'/'I' columns.\r
+//   ptrDesc  Optional; when non-null receives a description via sprintf, so\r
+//            the caller must supply a large enough buffer.\r
+// Returns 1 - Diffs/PosB, clamped below at 0 (1 when PosB is 0).\r
+static double GetFractIdGivenPathMBL(const byte *A, const byte *B, const char *Path,\r
+	char *ptrDesc)\r
+	{\r
+	unsigned PosA = 0;\r
+	unsigned PosB = 0;\r
+	unsigned Mismatches = 0;\r
+	unsigned Gaps = 0;\r
+	for (const char *p = Path; *p; ++p)\r
+		{\r
+		const char c = *p;\r
+		if (c == 'M' && toupper(A[PosA]) != toupper(B[PosB]))\r
+			++Mismatches;\r
+\r
+		// A gap open is a gap column at the start of the path or directly\r
+		// after a match column. The parentheses matter: without them && binds\r
+		// tighter than ||, so every 'D' column was counted while 'I' columns\r
+		// were counted per gap open only.\r
+		const bool IsGap = (c == 'D' || c == 'I');\r
+		if (IsGap && (p == Path || p[-1] == 'M'))\r
+			++Gaps;\r
+\r
+		if (c == 'M' || c == 'D')\r
+			++PosA;\r
+		if (c == 'M' || c == 'I')\r
+			++PosB;\r
+		}\r
+	const unsigned Diffs = Gaps + Mismatches;\r
+	const double FractDiffs = (PosB == 0 ? 0.0 : double(Diffs)/double(PosB));\r
+	if (ptrDesc != 0)\r
+		sprintf(ptrDesc, "Gap opens %u, Id=1 - [(diffs=%u)/(target_length=%u)]",\r
+		  Gaps, Diffs, PosB);\r
+	const double FractId = 1.0 - FractDiffs;\r
+	if (FractId < 0.0)\r
+		return 0.0;\r
+	return FractId;\r
+	}\r
+\r
+// Fractional identity selected by IdDef 4 in GetFractIdGivenPath.\r
+// Identities divided by the number of path columns, where 'M' columns whose\r
+// letters are rejected by g_IsChar ("wilds") are removed from the column\r
+// count. Gap columns stay in the denominator.\r
+//   A, B     Sequences indexed by the path ('M' both, 'D' A only, 'I' B only).\r
+//   Path     Null-terminated string of 'M'/'D'/'I' columns.\r
+//   ptrDesc  Optional; when non-null receives a description via sprintf.\r
+// Returns Ids/Cols, or 0 when no column remains.\r
+static double GetFractIdGivenPathBLAST(const byte *A, const byte *B, const char *Path,\r
+	char *ptrDesc)\r
+	{\r
+	unsigned PosA = 0;\r
+	unsigned PosB = 0;\r
+	unsigned Ids = 0;\r
+	unsigned Wilds = 0;\r
+	unsigned Cols = 0;\r
+	for (const char *p = Path; *p; ++p)\r
+		{\r
+		++Cols;\r
+		const char c = *p;\r
+		if (c == 'M')\r
+			{\r
+			const byte a = toupper(A[PosA]);\r
+			const byte b = toupper(B[PosB]);\r
+			if (g_IsChar[a] && g_IsChar[b])\r
+				{\r
+				if (a == b)\r
+					++Ids;\r
+				}\r
+			else\r
+				++Wilds;\r
+			}\r
+		if (c == 'M' || c == 'D')\r
+			++PosA;\r
+		if (c == 'M' || c == 'I')\r
+			++PosB;\r
+		}\r
+	asserta(Cols >= Wilds);\r
+	Cols -= Wilds;\r
+	// Divide in double like the sibling identity definitions. The ratio used\r
+	// to be computed in float and then widened, which can land just below an\r
+	// exactly representable-in-double threshold such as 0.9.\r
+	const double FractId = (Cols == 0 ? 0.0 : double(Ids)/double(Cols));\r
+	if (ptrDesc != 0)\r
+		sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
+	return FractId;\r
+	}\r
+\r
+// Fractional identity selected by IdDef 0 in GetFractIdGivenPath.\r
+// Identities divided by the shorter of the two consumed lengths, after\r
+// subtracting the 'M' columns whose letters are rejected by g_IsChar.\r
+// Returns 0 when that denominator is 0. When ptrDesc is non-null a short\r
+// description is written to it with sprintf.\r
+static double GetFractIdGivenPathDefault(const byte *A, const byte *B, const char *Path,\r
+	char *ptrDesc)\r
+	{\r
+	unsigned IxA = 0;\r
+	unsigned IxB = 0;\r
+	unsigned Ids = 0;\r
+	unsigned Wilds = 0;\r
+\r
+	for (const char *Col = Path; *Col != 0; ++Col)\r
+		{\r
+		switch (*Col)\r
+			{\r
+		case 'M':\r
+			{\r
+			const byte a = toupper(A[IxA++]);\r
+			const byte b = toupper(B[IxB++]);\r
+			if (!g_IsChar[a] || !g_IsChar[b])\r
+				++Wilds;\r
+			else if (a == b)\r
+				++Ids;\r
+			break;\r
+			}\r
+\r
+		case 'D':\r
+			++IxA;\r
+			break;\r
+\r
+		case 'I':\r
+			++IxB;\r
+			break;\r
+\r
+		default:\r
+			break;\r
+			}\r
+		}\r
+\r
+	const unsigned Shorter = min(IxA, IxB);\r
+	const unsigned MinLen = Shorter - Wilds;\r
+	double FractId = 0.0;\r
+	if (MinLen != 0)\r
+		FractId = double(Ids)/double(MinLen);\r
+	if (ptrDesc != 0)\r
+		sprintf(ptrDesc, "(ids=%u/shorter_length=%u)", Ids, MinLen);\r
+	return FractId;\r
+	}\r
+\r
+// Scans Path up to (not including) its last 'M' and compares the length of\r
+// each gap run that is closed by an 'M' with opt_maxtgap (run ended by 'D')\r
+// or opt_maxqgap (run ended by 'I'). Returns true when a limit is exceeded,\r
+// after writing "(maxtgap)" or "(maxqgap)" to ptrDesc if it is non-null.\r
+// Any path character other than 'M', 'D' or 'I' trips asserta.\r
+// NOTE(review): because the scan stops before the last 'M', the gap run that\r
+// immediately precedes it is never checked -- confirm that this is intended.\r
+static bool GapLimitExceeded(const char *Path, char *ptrDesc)\r
+	{\r
+	const char *LastM = 0;\r
+	for (const char *p = Path; *p; ++p)\r
+		if (*p == 'M')\r
+			LastM = p;\r
+\r
+	unsigned L = 0;		// length of the gap run currently open\r
+	for (const char *p = Path; *p && p != LastM; ++p)\r
+		{\r
+		switch (*p)\r
+			{\r
+		case 'M':\r
+			if (L > 0)\r
+				{\r
+				if (p[-1] == 'D')\r
+					{\r
+					if (L > opt_maxtgap)\r
+						{\r
+						if (ptrDesc != 0)\r
+							sprintf(ptrDesc, "(maxtgap)");\r
+						return true;\r
+						}\r
+					}\r
+				else if (p[-1] == 'I')\r
+					{\r
+					if (L > opt_maxqgap)\r
+						{\r
+						if (ptrDesc != 0)\r
+							sprintf(ptrDesc, "(maxqgap)");\r
+						return true;\r
+						}\r
+					}\r
+				else\r
+					asserta(false);\r
+				}\r
+			L = 0;\r
+			break;\r
+\r
+		case 'D':\r
+		case 'I':\r
+			++L;\r
+			break;\r
+\r
+		default:\r
+			asserta(false);\r
+			}\r
+		}\r
+	return false;\r
+	}\r
+\r
+// Computes the fractional identity of the alignment of A and B described by\r
+// Path, using the identity definition numbered IdDef (0..5).\r
+//   A, B     Sequences indexed by the path ('M' both, 'D' A only, 'I' B only).\r
+//   Path     Null-terminated 'M'/'D'/'I' string; may be null.\r
+//   Nucleo   Selects the alphabet table: g_IsACGTU when true, otherwise\r
+//            g_IsAminoChar. The choice is stored in the file-level g_IsChar,\r
+//            so this function is not reentrant.\r
+//   ptrDesc  Optional; when non-null receives a short description (written\r
+//            with strcpy/sprintf, so the buffer must be large enough).\r
+//   IdDef    Identity definition; any value outside 0..5 calls Die.\r
+// Returns 0 for a null or empty path, for a path rejected by the\r
+// opt_leftjust / opt_rightjust / opt_maxqgap / opt_maxtgap filters, and\r
+// otherwise whatever the selected definition computes.\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path,\r
+	bool Nucleo, char *ptrDesc, unsigned IdDef)\r
+	{\r
+	if (Nucleo)\r
+		g_IsChar = g_IsACGTU;\r
+	else\r
+		g_IsChar = g_IsAminoChar;\r
+\r
+	if (Path == 0)\r
+		{\r
+		if (ptrDesc != 0)\r
+			strcpy(ptrDesc, "(NULL path)");\r
+		return 0.0;\r
+		}\r
+\r
+	unsigned ColCount = (unsigned) strlen(Path);\r
+	if (ColCount == 0)\r
+		{\r
+		// Every other early return fills in the description; do the same\r
+		// here so callers never print an uninitialized buffer.\r
+		if (ptrDesc != 0)\r
+			strcpy(ptrDesc, "(empty path)");\r
+		return 0.0;\r
+		}\r
+\r
+	if (opt_leftjust)\r
+		{\r
+		if (Path[0] != 'M' || Path[ColCount-1] == 'D')\r
+			{\r
+			if (ptrDesc != 0)\r
+				strcpy(ptrDesc, "(leftjust)");\r
+			return 0.0;\r
+			}\r
+		}\r
+\r
+	if (opt_rightjust)\r
+		{\r
+		if (Path[0] == 'D' || Path[ColCount-1] != 'M')\r
+			{\r
+			if (ptrDesc != 0)\r
+				strcpy(ptrDesc, "(rightjust)");\r
+			return 0.0;\r
+			}\r
+		}\r
+\r
+	if (opt_maxqgap > 0 || opt_maxtgap > 0)\r
+		{\r
+		if (GapLimitExceeded(Path, ptrDesc))\r
+			return 0.0;\r
+		}\r
+\r
+	double FractId = 0.0;\r
+	switch (IdDef)\r
+		{\r
+	case 0:\r
+		FractId = GetFractIdGivenPathDefault(A, B, Path, ptrDesc);\r
+		break;\r
+\r
+	case 1:\r
+		FractId = GetFractIdGivenPathAllDiffs(A, B, Path, ptrDesc);\r
+		break;\r
+\r
+	case 2:\r
+		FractId = GetFractIdGivenPathInternalDiffs(A, B, Path, ptrDesc);\r
+		break;\r
+\r
+	case 3:\r
+		FractId = GetFractIdGivenPathMBL(A, B, Path, ptrDesc);\r
+		break;\r
+\r
+	case 4:\r
+		FractId = GetFractIdGivenPathBLAST(A, B, Path, ptrDesc);\r
+		break;\r
+\r
+	case 5:\r
+		FractId = GetFractIdGivenPathDerep(A, B, Path, ptrDesc);\r
+		break;\r
+\r
+	default:\r
+		// Report the value that was actually rejected; the caller may have\r
+		// passed an IdDef that differs from the command-line opt_iddef.\r
+		Die("--iddef %u invalid", IdDef);\r
+		}\r
+\r
+	return FractId;\r
+	}\r
+\r
+// Convenience overload: uses the identity definition in opt_iddef.\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path,\r
+ bool Nucleo, char *ptrDesc)\r
+ {\r
+ return GetFractIdGivenPath(A, B, Path, Nucleo, ptrDesc, opt_iddef);\r
+ }\r
+\r
+// Convenience overload: as above, with no description buffer.\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo)\r
+ {\r
+ return GetFractIdGivenPath(A, B, Path, Nucleo, (char *) 0);\r
+ }\r
+\r
+// Convenience overload: path given as a string; passes Nucleo = true.\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const string &Path)\r
+ {\r
+ return GetFractIdGivenPath(A, B, Path.c_str(), true);\r
+ }\r
+\r
+// Convenience overload: path given as a C string; passes Nucleo = true.\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path)\r
+ {\r
+ return GetFractIdGivenPath(A, B, Path, true);\r
+ }\r
--- /dev/null
+#include "myutils.h"\r
+#include "chime.h"\r
+#include "ultra.h"\r
+#include <set>\r
+\r
+void AddTargets(Ultra &U, const SeqData &Query, set<unsigned> &TargetIndexes);\r
+\r
+// Splits a sequence of length L into equal-length chunks.\r
+//   L       Sequence length.\r
+//   Length  Out: length shared by every chunk.\r
+//   Los     Out: start offset of each chunk (cleared first).\r
+// A sequence no longer than opt_minchunk is returned as one chunk covering\r
+// it entirely. Otherwise the chunk length is ceil(L/opt_chunks), raised to\r
+// opt_minchunk if smaller; chunks are laid end to end from offset 0 and the\r
+// final chunk is shifted left so that it ends exactly at L, which means it\r
+// may overlap the chunk before it.\r
+void GetChunkInfo(unsigned L, unsigned &Length, vector<unsigned> &Los)\r
+	{\r
+	Los.clear();\r
+\r
+	if (L <= opt_minchunk)\r
+		{\r
+		Length = L;\r
+		Los.push_back(0);\r
+		return;\r
+		}\r
+\r
+	// Guard the division below.\r
+	asserta(opt_chunks > 0);\r
+\r
+	Length = (L - 1)/opt_chunks + 1;\r
+	if (Length < opt_minchunk)\r
+		Length = opt_minchunk;\r
+\r
+	// Here opt_minchunk < L and ceil(L/opt_chunks) <= L, so Length <= L and\r
+	// L - Length cannot wrap.\r
+	for (unsigned Lo = 0; ; Lo += Length)\r
+		{\r
+		if (Lo + Length >= L)\r
+			{\r
+			// Last chunk: end it exactly at L. The previous L - Length - 1\r
+			// left the final base uncovered and wrapped to UINT_MAX whenever\r
+			// Length == L (for example with a single chunk).\r
+			Los.push_back(L - Length);\r
+			return;\r
+			}\r
+		Los.push_back(Lo);\r
+		}\r
+	}\r
+\r
+// Collects candidate parent sequences for a query.\r
+//   U        Search object handed to AddTargets and asked for seed labels.\r
+//   QSD      Query sequence.\r
+//   AbQ      Abundance of the query; the abundance filter below is applied\r
+//            only when this is > 0.\r
+//   Parents  Out: accepted target indexes in ascending order (cleared first).\r
+// The query is cut into chunks by GetChunkInfo, each chunk is passed to\r
+// AddTargets, and the union of the reported target indexes is then filtered:\r
+// a target whose label abundance AbT satisfies 0 < AbT < opt_abskew*AbQ is\r
+// rejected.\r
+void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
+	vector<unsigned> &Parents)\r
+	{\r
+	Parents.clear();\r
+\r
+	set<unsigned> TargetIndexes;\r
+\r
+	const unsigned QL = QSD.L;\r
+\r
+	// Working copy of the query; only Seq and L are changed per chunk. The\r
+	// label is deliberately left alone: prefixing it with the chunk offset\r
+	// was tried and, per the original author's note, breaks --self.\r
+	SeqData QuerySD = QSD;\r
+\r
+	unsigned ChunkLength;\r
+	vector<unsigned> ChunkLos;\r
+	GetChunkInfo(QL, ChunkLength, ChunkLos);\r
+	asserta(ChunkLength <= QL);\r
+\r
+	const unsigned ChunkCount = SIZE(ChunkLos);\r
+	for (unsigned ChunkIndex = 0; ChunkIndex < ChunkCount; ++ChunkIndex)\r
+		{\r
+		const unsigned Lo = ChunkLos[ChunkIndex];\r
+		// Written as a subtraction so that a wrapped (huge) Lo is caught;\r
+		// Lo + ChunkLength <= QL can pass after unsigned overflow.\r
+		asserta(Lo <= QL - ChunkLength);\r
+\r
+		QuerySD.Seq = QSD.Seq + Lo;\r
+		QuerySD.L = ChunkLength;\r
+\r
+		AddTargets(U, QuerySD, TargetIndexes);\r
+		}\r
+\r
+	for (set<unsigned>::const_iterator p = TargetIndexes.begin();\r
+	  p != TargetIndexes.end(); ++p)\r
+		{\r
+		const unsigned TargetIndex = *p;\r
+		bool Accept = true;\r
+		if (AbQ > 0.0f)\r
+			{\r
+			const char *TargetLabel = U.GetSeedLabel(TargetIndex);\r
+			const float AbT = GetAbFromLabel(string(TargetLabel));\r
+			if (AbT > 0.0f && AbT < opt_abskew*AbQ)\r
+				Accept = false;\r
+			}\r
+\r
+		if (Accept)\r
+			Parents.push_back(TargetIndex);\r
+		}\r
+	}\r
--- /dev/null
+#if UCHIMES
+
+#include "dp.h"
+#include "seq.h"
+
+static AlnParams g_AP;
+static bool g_APInitDone = false;
+
+// Aligns Query to Target with ViterbiFast and stores the path in PD.\r
+// The file-level alignment parameters g_AP are initialized from the command\r
+// line on the first call only (guarded by g_APInitDone; the guard is a plain\r
+// bool with no locking). Always returns true.\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, PathData &PD)\r
+ {\r
+ if (!g_APInitDone)\r
+ {\r
+ g_AP.InitFromCmdLine(true);\r
+ g_APInitDone = true;\r
+ }\r
+\r
+ ViterbiFast(Query.Seq, Query.L, Target.Seq, Target.L, g_AP, PD);\r
+ return true;\r
+ }\r
+\r
+// Aligns Query to Target and returns the alignment path as a string.\r
+// Delegates to the PathData overload and copies the null-terminated path\r
+// that starts at PD.Start into Path. Always returns true.\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path)\r
+	{\r
+	PathData Scratch;\r
+	GlobalAlign(Query, Target, Scratch);\r
+	Path.assign(Scratch.Start);\r
+	return true;\r
+	}\r
+\r
+// Overload kept for callers that pass alignment parameters and heuristics.\r
+// Every argument other than Query, Target and PD is ignored: the alignment\r
+// comes from the string overload and is copied, null-terminated, into a\r
+// freshly allocated PD buffer. Returns false only if that overload does.\r
+bool GlobalAlign(const SeqData &Query, const SeqData &Target, const AlnParams &/*AP*/,\r
+	const AlnHeuristics &AH, HSPFinder &/*HF*/, float /*MinFractId*/, float &/*HSPId*/, PathData &PD)\r
+	{\r
+	PD.Clear();\r
+\r
+	string PathStr;\r
+	if (!GlobalAlign(Query, Target, PathStr))\r
+		return false;\r
+\r
+	const unsigned ColCount = SIZE(PathStr);\r
+	PD.Alloc(ColCount+1);\r
+	PD.Start = PD.Front;\r
+	memcpy(PD.Start, PathStr.c_str(), ColCount);\r
+	PD.Start[ColCount] = 0;\r
+	return true;\r
+	}\r
+\r
+#endif // UCHIMES\r
--- /dev/null
+"\n"
+"Usage\n"
+"-----\n"
+"\n"
+"uchime --input query.fasta [--db db.fasta] [--uchimeout results.uchime]\n"
+" [--uchimealns results.alns]\n"
+"\n"
+"Options\n"
+"-------\n"
+"\n"
+"--input filename\n"
+" Query sequences in FASTA format.\n"
+" If the --db option is not specified, uchime uses de novo\n"
+" detection. In de novo mode, relative abundance must be given\n"
+" by a string /ab=xxx/ somewhere in the label, where xxx is a\n"
+" floating-point number, e.g. >F00QGH67HG/ab=1.2/.\n"
+"\n"
+"--db filename\n"
+" Reference database in FASTA format.\n"
+" Optional, if not specified uchime uses de novo mode.\n"
+"\n"
+" ***WARNING*** The database is searched ONLY on the plus strand.\n"
+" You MUST include reverse-complemented sequences in the database\n"
+" if you want both strands to be searched.\n"
+"\n"
+"--abskew x\n"
+" Minimum abundance skew. Default 1.9. De novo mode only.\n"
+" Abundance skew is:\n"
+" min [ abund(parent1), abund(parent2) ] / abund(query).\n"
+"\n"
+"--uchimeout filename\n"
+" Output in tabbed format with one record per query sequence.\n"
+" First field is score (h), second field is query label.\n"
+" For details, see manual.\n"
+"\n"
+"--uchimealns filename\n"
+" Multiple alignments of query sequences to parents in human-\n"
+" readable format. Alignments show columns with differences\n"
+" that support or contradict a chimeric model.\n"
+"\n"
+"--minh h\n"
+" Minimum score to report chimera. Default 0.3. Values from 0.1\n"
+" to 5 might be reasonable. Lower values increase sensitivity\n"
+" but may report more false positives. If you decrease --xn,\n"
+" you may need to increase --minh, and vice versa.\n"
+"\n"
+"--mindiv div\n"
+" Minimum divergence ratio, default 0.5. Div ratio is 100%% - \n"
+" %%identity between query sequence and the closest candidate for\n"
+" being a parent. If you don't care about very close chimeras,\n"
+" then you could increase --mindiv to, say, 1.0 or 2.0, and\n"
+" also decrease --minh, say to 0.1, to increase sensitivity.\n"
+" How well this works will depend on your data. Best is to\n"
+" tune parameters on a good benchmark.\n"
+"\n"
+"--xn beta\n"
+" Weight of a no vote, also called the beta parameter. Default 8.0.\n"
+" Decreasing this weight to around 3 or 4 may give better\n"
+" performance on denoised data.\n"
+"\n"
+"--dn n\n"
+" Pseudo-count prior on number of no votes. Default 1.4. Probably\n"
+" no good reason to change this unless you can retune to a good\n"
+" benchmark for your data. Reasonable values are probably in the\n"
+" range from 0.2 to 2.\n"
+"\n"
+"--xa w\n"
+" Weight of an abstain vote. Default 1. So far, results do not\n"
+" seem to be very sensitive to this parameter, but if you have\n"
+" a good training set might be worth trying. Reasonable values\n"
+" might range from 0.1 to 2.\n"
+"\n"
+"--chunks n\n"
+" Number of chunks to extract from the query sequence when searching\n"
+" for parents. Default 4.\n"
+"\n"
+"--[no]ovchunks\n"
+" [Do not] use overlapping chunks. Default do not.\n"
+"\n"
+"--minchunk n\n"
+" Minimum length of a chunk. Default 64.\n"
+"\n"
+"--idsmoothwindow w\n"
+" Length of id smoothing window. Default 32.\n"
+"\n"
+"--minsmoothid f\n"
+" Minimum fractional identity over smoothed window of candidate parent.\n"
+" Default 0.95.\n"
+"\n"
+"--maxp n\n"
+" Maximum number of candidate parents to consider. Default 2. In tests so\n"
+" far, increasing --maxp gives only a very small improvement in sensitivity\n"
+" but tends to increase the error rate quite a bit.\n"
+"\n"
+"--[no]skipgaps\n"
+"--[no]skipgaps2\n"
+" These options control how gapped columns affect counting of diffs.\n"
+" If --skipgaps is specified, columns containing gaps do not count as diffs.\n"
+" If --skipgaps2 is specified, if column is immediately adjacent to\n"
+" a column containing a gap, it is not counted as a diff.\n"
+" Default is --skipgaps --skipgaps2.\n"
+"\n"
+"--minlen L\n"
+"--maxlen L\n"
+" Minimum and maximum sequence length. Defaults 10, 10000.\n"
+" Applies to both query and reference sequences.\n"
+"\n"
+"--ucl\n"
+" Use local-X alignments. Default is global-X. On tests so far, global-X\n"
+" is always better; this option is retained because it just might work\n"
+" well on some future type of data.\n"
+"\n"
+"--queryfract f\n"
+" Minimum fraction of the query sequence that must be covered by a local-X\n"
+" alignment. Default 0.5. Applies only when --ucl is specified.\n"
+"\n"
+"--quiet\n"
+" Do not display progress messages on stderr.\n"
+"\n"
+"--log filename\n"
+" Write miscellaneous information to the log file. Mostly of interest\n"
+" to me (the algorithm developer). Use --verbose to get more info.\n"
+"\n"
+"--self\n"
+" In reference database mode, exclude a reference sequence if it has\n"
+" the same label as the query. This is useful for benchmarking by using\n"
+" the ref db as a query to test for false positives.\n"
--- /dev/null
+#ifndef hsp_h\r
+#define hsp_h 1\r
+\r
+struct HSPData\r
+ {\r
+ unsigned Loi;\r
+ unsigned Loj;\r
+ unsigned Leni;\r
+ unsigned Lenj;\r
+ float Score;\r
+ unsigned User;\r
+\r
+ unsigned GetLength() const\r
+ {\r
+ if (Leni != Lenj)\r
+ Die("HSP::GetLength(): Leni %u, Lenj %u, Loi %u, Loj %u, Score %.1f",\r
+ Leni, Lenj, Loi, Loj, Score);\r
+\r
+ return Leni;\r
+ }\r
+\r
+ unsigned GetHii() const\r
+ {\r
+ assert(Leni > 0);\r
+ return Loi + Leni - 1;\r
+ }\r
+\r
+ unsigned GetHij() const\r
+ {\r
+ assert(Lenj > 0);\r
+ return Loj + Lenj - 1;\r
+ }\r
+\r
+ bool LeftA() const\r
+ {\r
+ return Loi == 0;\r
+ }\r
+\r
+ bool LeftB() const\r
+ {\r
+ return Loj == 0;\r
+ }\r
+\r
+ bool RightA(unsigned LA) const\r
+ {\r
+ return Loi + Leni == LA;\r
+ }\r
+\r
+ bool RightB(unsigned LB) const\r
+ {\r
+ return Loj + Lenj == LB;\r
+ }\r
+\r
+ unsigned GetIdCount(const byte *A, const byte *B) const\r
+ {\r
+ unsigned Count = 0;\r
+ unsigned K = GetLength();\r
+ for (unsigned k = 0; k < K; ++k)\r
+ {\r
+ byte a = A[Loi+k];\r
+ byte b = B[Loj+k];\r
+ if (toupper(a) == toupper(b))\r
+ Count++;\r
+ }\r
+ return Count;\r
+ }\r
+\r
+ double OverlapFract(const HSPData &HSP) const\r
+ {\r
+ if (Leni == 0 || Lenj == 0)\r
+ return 0.0;\r
+\r
+ unsigned MaxLoi = max(Loi, HSP.Loi);\r
+ unsigned MaxLoj = max(Loj, HSP.Loj);\r
+ unsigned MinHii = min(GetHii(), HSP.GetHii());\r
+ unsigned MinHij = min(GetHij(), HSP.GetHij());\r
+\r
+ unsigned Ovi = (MinHii < MaxLoi) ? 0 : MinHii - MaxLoi;\r
+ unsigned Ovj = (MinHij < MaxLoj) ? 0 : MinHij - MaxLoj;\r
+\r
+ asserta(Ovi <= Leni && Ovj <= Lenj);\r
+ return double(Ovi*Ovj)/double(Leni*Lenj);\r
+ }\r
+\r
+ bool operator<(const HSPData &rhs) const\r
+ {\r
+ return Loi < rhs.Loi;\r
+ }\r
+\r
+ void LogMe() const\r
+ {\r
+ Log("Loi=%u Loj=%u Li=%u Lj=%u Score=%.1f\n", Loi, Loj, Leni, Lenj, Score);\r
+ }\r
+\r
+ void LogMe2() const\r
+ {\r
+ Log("(%u-%u,%u-%u/%.1f)", Loi, GetHii(), Loj, GetHij(), Score);\r
+ }\r
+ };\r
+\r
+// Bendpoint\r
+// A position record with a lo/hi flag and an index; LogMe prints all three.\r
+struct BPData\r
+	{\r
+	unsigned Pos;\r
+	bool IsLo;\r
+	unsigned Index;\r
+\r
+	// Logs the record as "BPlo"/"BPhi" followed by Pos and Index (no newline).\r
+	void LogMe() const\r
+		{\r
+		const char *Side = "hi";\r
+		if (IsLo)\r
+			Side = "lo";\r
+		Log("BP%s Pos %u Ix %u", Side, Pos, Index);\r
+		}\r
+	};\r
+\r
+#endif // hsp_h\r
--- /dev/null
+#ifndef hspfinder_h
+#define hspfinder_h
+
+#include "seq.h"
+
+// Placeholder HSPFinder: both setters accept a sequence and do nothing, and
+// the class holds no state.
+class HSPFinder
+ {
+public:
+ // No-op; the argument is ignored.
+ void SetA(const SeqData &/*SD*/) {}
+ // No-op; the argument is ignored.
+ void SetB(const SeqData &/*SD*/) {}
+ };
+
+#endif // hspfinder_h
--- /dev/null
+#include "myutils.h"\r
+#include "sfasta.h"\r
+#include "path.h"\r
+#include "dp.h"\r
+\r
+#if DEBUG\r
+// Debug-only check that Path consumes exactly LQ query letters ('M' or 'D')\r
+// and LT target letters ('M' or 'I').\r
+static void AssertPathLengths(const string &Path, unsigned LQ, unsigned LT)\r
+	{\r
+	unsigned QLen = 0;\r
+	unsigned TLen = 0;\r
+	for (unsigned i = 0; i < SIZE(Path); ++i)\r
+		{\r
+		char c = Path[i];\r
+		if (c == 'M' || c == 'D')\r
+			++QLen;\r
+		if (c == 'M' || c == 'I')\r
+			++TLen;\r
+		}\r
+	asserta(QLen == LQ);\r
+	asserta(TLen == LT);\r
+	}\r
+#endif\r
+\r
+// Counts the 'I' columns of Path that fall in front of each query position.\r
+// Counts gets LQ+1 entries; entry LQ holds the inserts after the last query\r
+// letter. A column that is not 'M', 'D' or 'I' trips asserta.\r
+static void CountInserts(const string &Path, unsigned LQ, vector<unsigned> &Counts)\r
+	{\r
+	Counts.clear();\r
+	Counts.resize(LQ+1, 0);\r
+	unsigned QPos = 0;\r
+	for (unsigned i = 0; i < SIZE(Path); ++i)\r
+		{\r
+		char c = Path[i];\r
+		if (c == 'M' || c == 'D')\r
+			++QPos;\r
+		else\r
+			{\r
+			asserta(c == 'I');\r
+			asserta(QPos <= LQ);\r
+			++(Counts[QPos]);\r
+			}\r
+		}\r
+	}\r
+\r
+// Appends to Row the target sequence Seq (length L), laid out in the column\r
+// system defined by InsertCounts. In front of each query position the row\r
+// holds the target's own inserted letters followed by '-' padding up to\r
+// InsertCounts for that position; a 'D' column becomes '-'. Letters are\r
+// upper-cased. Asserts that the path consumes exactly LQ query letters and\r
+// L target letters.\r
+static void AppendAlignedRow(const byte *Seq, unsigned L, const string &Path,\r
+	const vector<unsigned> &InsertCounts, unsigned LQ, string &Row)\r
+	{\r
+	unsigned QPos = 0;\r
+	unsigned Pos = 0;\r
+	unsigned is = 0;	// inserted letters emitted since the last query column\r
+	for (unsigned i = 0; i < SIZE(Path); ++i)\r
+		{\r
+		char c = Path[i];\r
+		if (c == 'M' || c == 'D')\r
+			{\r
+			// Each 'M'/'D' consumes a query letter, so one must remain.\r
+			asserta(QPos < LQ);\r
+			unsigned isq = InsertCounts[QPos];\r
+			asserta(is <= isq);\r
+			for (unsigned k = is; k < isq; ++k)\r
+				Row.push_back('-');\r
+			is = 0;\r
+			++QPos;\r
+			}\r
+		if (c == 'M')\r
+			{\r
+			asserta(Pos < L);\r
+			Row.push_back(toupper(Seq[Pos++]));\r
+			}\r
+		else if (c == 'D')\r
+			Row.push_back('-');\r
+		else if (c == 'I')\r
+			{\r
+			++is;\r
+			asserta(Pos < L);\r
+			Row.push_back(toupper(Seq[Pos++]));\r
+			}\r
+		}\r
+	asserta(is <= InsertCounts[LQ]);\r
+	for (unsigned k = is; k < InsertCounts[LQ]; ++k)\r
+		Row.push_back('-');\r
+	asserta(QPos == LQ);\r
+	asserta(Pos == L);\r
+	}\r
+\r
+// Merges two pairwise alignments that share the query (Q vs A given by\r
+// PathQA, Q vs B given by PathQB) into three gapped rows of equal length.\r
+//   QSD, ASD, BSD   The query and the two other sequences.\r
+//   PathQA, PathQB  Paths of 'M'/'D'/'I' columns; 'M' and 'D' consume a query\r
+//                   letter, 'M' and 'I' a letter of the other sequence.\r
+//   Q3, A3, B3      Out: the three rows (cleared first), upper-cased, with\r
+//                   '-' for gaps.\r
+// In front of each query position the layout reserves as many columns as the\r
+// larger of the two paths' insert counts there.\r
+void Make3Way(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
+	const string &PathQA, const string &PathQB,\r
+	string &Q3, string &A3, string &B3)\r
+	{\r
+	Q3.clear();\r
+	A3.clear();\r
+	B3.clear();\r
+\r
+#if DEBUG\r
+	AssertPathLengths(PathQA, QSD.L, ASD.L);\r
+	AssertPathLengths(PathQB, QSD.L, BSD.L);\r
+#endif\r
+\r
+	const byte *Q = QSD.Seq;\r
+	const unsigned LQ = QSD.L;\r
+\r
+	vector<unsigned> InsertCountsA;\r
+	vector<unsigned> InsertCountsB;\r
+	CountInserts(PathQA, LQ, InsertCountsA);\r
+	CountInserts(PathQB, LQ, InsertCountsB);\r
+\r
+	vector<unsigned> InsertCounts(LQ+1, 0);\r
+	for (unsigned i = 0; i <= LQ; ++i)\r
+		InsertCounts[i] = max(InsertCountsA[i], InsertCountsB[i]);\r
+\r
+	// Query row: gaps for every reserved insert column, then the letter.\r
+	for (unsigned i = 0; i < LQ; ++i)\r
+		{\r
+		for (unsigned k = 0; k < InsertCounts[i]; ++k)\r
+			Q3.push_back('-');\r
+		Q3.push_back(toupper(Q[i]));\r
+		}\r
+	for (unsigned k = 0; k < InsertCounts[LQ]; ++k)\r
+		Q3.push_back('-');\r
+\r
+	// The B row used to re-assert APos == LA where QPos == LQ was meant;\r
+	// sharing one routine applies the same checks to both rows.\r
+	AppendAlignedRow(ASD.Seq, ASD.L, PathQA, InsertCounts, LQ, A3);\r
+	AppendAlignedRow(BSD.Seq, BSD.L, PathQB, InsertCounts, LQ, B3);\r
+\r
+	asserta(SIZE(Q3) == SIZE(A3));\r
+	asserta(SIZE(Q3) == SIZE(B3));\r
+	}\r
--- /dev/null
+#!/bin/bash
+# Builds the uchime executable: compiles every source named in CPPNames to an
+# object file, links the objects named in ObjNames, then strips the binary.
+# Compiler output is collected in mk.stdout and mk.stderr.
+# ENV_GCC_OPTS and ENV_LINK_OPTS are read from the environment and passed
+# through to the compile and link commands.
+
+# Source base names (without .cpp) and the matching object files. The two
+# lists are maintained by hand and must be kept in step.
+CPPNames='addtargets2 alignchime alignchimel alnparams alpha alpha2 fractid getparents globalalign2 make3way mx myutils path searchchime seqdb setnucmx sfasta tracebackbit uchime_main usort viterbifast writechhit'
+ObjNames='addtargets2.o alignchime.o alignchimel.o alnparams.o alpha.o alpha2.o fractid.o getparents.o globalalign2.o make3way.o mx.o myutils.o path.o searchchime.o seqdb.o setnucmx.o sfasta.o tracebackbit.o uchime_main.o usort.o viterbifast.o writechhit.o'
+
+# Start from a clean state.
+rm -f *.o mk.stdout mk.stderr tmp.stderr
+
+for CPPName in $CPPNames
+do
+ # Progress goes to the terminal device rather than stdout.
+ echo $CPPName >> /dev/tty
+ g++ $ENV_GCC_OPTS -c -O3 -msse2 -mfpmath=sse -D_FILE_OFFSET_BITS=64 -DNDEBUG=1 -DUCHIMES=1 $CPPName.cpp -o $CPPName.o >> mk.stdout 2>> tmp.stderr
+ # Show this file's diagnostics right away and also append them to the log.
+ cat tmp.stderr
+ cat tmp.stderr >> mk.stderr
+ rm -f tmp.stderr
+done
+
+# Link statically on Linux only.
+LINK_OPTS=
+if [ `uname -s` == Linux ] ; then
+ LINK_OPTS=-static
+fi
+g++ $LINK_OPTS $ENV_LINK_OPTS -g -o uchime $ObjNames >> mk.stdout 2>> tmp.stderr
+cat tmp.stderr
+cat tmp.stderr >> mk.stderr
+rm -f tmp.stderr
+
+# Strip symbols, then print the size and checksum of the result.
+strip uchime
+ls -lh uchime
+sum uchime
--- /dev/null
+#include "myutils.h"\r
+#include "mx.h"\r
+#include "seqdb.h"\r
+#include "seq.h"\r
+\r
+char ProbToChar(float p);\r
+\r
+list<MxBase *> *MxBase::m_Matrices = 0;\r
+unsigned MxBase::m_AllocCount;\r
+unsigned MxBase::m_ZeroAllocCount;\r
+unsigned MxBase::m_GrowAllocCount;\r
+double MxBase::m_TotalBytes;\r
+double MxBase::m_MaxBytes;\r
+\r
+// Parses s with atof, takes the natural logarithm and returns the result\r
+// formatted by TypeToStr<float>.\r
+static const char *LogizeStr(const char *s)\r
+	{\r
+	const double Value = atof(s);\r
+	const double LogValue = log(Value);\r
+	return TypeToStr<float>(float(LogValue));\r
+	}\r
+\r
+// Parses s with atof, applies exp() and returns the result formatted by\r
+// TypeToStr<float>.\r
+static const char *ExpizeStr(const char *s)\r
+	{\r
+	const double Value = atof(s);\r
+	const double ExpValue = exp(Value);\r
+	return TypeToStr<float>(float(ExpValue));\r
+	}\r
+\r
+// Adds Mx to the static list of matrices, creating the list on first use\r
+// (m_Matrices == 0 means "no list").\r
+void MxBase::OnCtor(MxBase *Mx)\r
+	{\r
+	if (m_Matrices == 0)\r
+		m_Matrices = new list<MxBase *>;\r
+	asserta(m_Matrices != 0);\r
+	m_Matrices->push_front(Mx);\r
+	}\r
+\r
+// Removes Mx from the static list of matrices and frees the list once it is\r
+// empty. Issues a warning, and does nothing else, if there is no list or Mx\r
+// is not in it.\r
+void MxBase::OnDtor(MxBase *Mx)\r
+	{\r
+	if (m_Matrices == 0)\r
+		{\r
+		Warning("MxBase::OnDtor, m_Matrices = 0");\r
+		return;\r
+		}\r
+	for (list<MxBase*>::iterator p = m_Matrices->begin();\r
+	  p != m_Matrices->end(); ++p)\r
+		{\r
+		if (*p == Mx)\r
+			{\r
+			m_Matrices->erase(p);\r
+			if (m_Matrices->empty())\r
+				{\r
+				delete m_Matrices;\r
+				// Reset the pointer: OnCtor treats non-null as "list\r
+				// exists" and would otherwise push onto freed memory.\r
+				m_Matrices = 0;\r
+				}\r
+			return;\r
+			}\r
+		}\r
+	Warning("MxBase::OnDtor, not found");\r
+	}\r
+\r
+//float **MxBase::Getf(const string &Name)\r
+// {\r
+// Mx<float> *m = (Mx<float> *) Get(Name);\r
+// asserta(m->GetTypeSize() == sizeof(float));\r
+// return m->GetData();\r
+// }\r
+//\r
+//double **MxBase::Getd(const string &Name)\r
+// {\r
+// Mx<double> *m = (Mx<double> *) Get(Name);\r
+// asserta(m->GetTypeSize() == sizeof(double));\r
+// return m->GetData();\r
+// }\r
+//\r
+//char **MxBase::Getc(const string &Name)\r
+// {\r
+// Mx<char> *m = (Mx<char> *) Get(Name);\r
+// asserta(m->GetTypeSize() == sizeof(char));\r
+// return m->GetData();\r
+// }\r
+\r
+// Alloc overload for a matrix tied to two sequences of a SeqDB, identified\r
+// by IdA and IdB; passes null SeqData pointers to the full overload.\r
+void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+ const SeqDB *DB, unsigned IdA, unsigned IdB)\r
+ {\r
+ Alloc(Name, RowCount, ColCount, DB, IdA, IdB, 0, 0);\r
+ }\r
+\r
+// Alloc overload for a matrix tied to two SeqData objects; passes a null\r
+// SeqDB and UINT_MAX for both ids to the full overload.\r
+void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+ const SeqData *SA, const SeqData *SB)\r
+ {\r
+ Alloc(Name, RowCount, ColCount, 0, UINT_MAX, UINT_MAX, SA, SB);\r
+ }\r
+\r
+// Makes the matrix at least RowCount x ColCount and records its name and the\r
+// sequences it belongs to.\r
+//   Name      Copied into m_Name, truncated to fit.\r
+//   DB        Optional; when non-null, IdA and IdB must not be UINT_MAX and\r
+//             the requested size must be at least (sequence length + 1) in\r
+//             each dimension.\r
+//   SA, SB    Optional sequence pointers, stored as given.\r
+// Storage is reallocated only when the request exceeds the allocated size;\r
+// the old storage is freed first, so existing contents are not kept.\r
+void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+ const SeqDB *DB, unsigned IdA, unsigned IdB, const SeqData *SA, const SeqData *SB)\r
+ {\r
+ StartTimer(MxBase_Alloc);\r
+\r
+ // Static counters reported by LogCounts.\r
+ ++m_AllocCount;\r
+ if (m_AllocatedRowCount == 0)\r
+ ++m_ZeroAllocCount;\r
+\r
+ if (DB != 0)\r
+ {\r
+ asserta(IdA != UINT_MAX);\r
+ asserta(IdB != UINT_MAX);\r
+ asserta(RowCount >= DB->GetSeqLength(IdA) + 1);\r
+ asserta(ColCount >= DB->GetSeqLength(IdB) + 1);\r
+ }\r
+ if (RowCount > m_AllocatedRowCount || ColCount > m_AllocatedColCount)\r
+ {\r
+ // A grow is a reallocation of a matrix that already had storage.\r
+ if (m_AllocatedRowCount > 0)\r
+ {\r
+ if (opt_logmemgrows)\r
+ Log("MxBase::Alloc grow %s %u x %u -> %u x %u, %s bytes\n",\r
+ Name, m_AllocatedRowCount, m_AllocatedColCount,\r
+ RowCount, ColCount,\r
+ IntToStr(GetBytes()));\r
+ ++m_GrowAllocCount;\r
+ }\r
+\r
+ // Take the old size out of the running total before freeing.\r
+ m_TotalBytes -= GetBytes();\r
+\r
+ // The Alloc timer is paused around FreeData and AllocData, each of which\r
+ // has a timer of its own.\r
+ PauseTimer(MxBase_Alloc);\r
+ StartTimer(MxBase_FreeData);\r
+ FreeData();\r
+ EndTimer(MxBase_FreeData);\r
+ StartTimer(MxBase_Alloc);\r
+\r
+ // New size: 16 more than requested in each dimension (never less than\r
+ // the previous allocation), then squared up to the larger of the two.\r
+ unsigned N = max(RowCount + 16, m_AllocatedRowCount);\r
+ unsigned M = max(ColCount + 16, m_AllocatedColCount);\r
+ N = max(N, M);\r
+\r
+ PauseTimer(MxBase_Alloc);\r
+ StartTimer(MxBase_AllocData);\r
+ AllocData(N, N);\r
+ EndTimer(MxBase_AllocData);\r
+ StartTimer(MxBase_Alloc);\r
+\r
+ m_TotalBytes += GetBytes();\r
+ if (m_TotalBytes > m_MaxBytes)\r
+ m_MaxBytes = m_TotalBytes;\r
+ }\r
+ \r
+ // Bounded copy of the name, always null-terminated.\r
+ unsigned n = sizeof(m_Name)-1;\r
+ strncpy(m_Name, Name, n);\r
+ m_Name[n] = 0;\r
+ m_RowCount = RowCount;\r
+ m_ColCount = ColCount;\r
+ m_SeqDB = DB;\r
+ m_IdA = IdA;\r
+ m_IdB = IdB;\r
+ m_SA = SA;\r
+ m_SB = SB;\r
+\r
+ EndTimer(MxBase_Alloc);\r
+ }\r
+\r
+// Writes a description of the matrix to the log.\r
+//   WithData  When false only the header line is written.\r
+//   Opts      Bit flags: OPT_EXP / OPT_LOG pass every printed cell through\r
+//             ExpizeStr / LogizeStr (OPT_EXP is tested first for the header,\r
+//             OPT_LOG first for the cells); OPT_ZERO_BASED labels row i and\r
+//             column j with sequence letter i / j instead of i-1 / j-1.\r
+// If m_Alpha or m_Alpha2 is set, the matrix is printed as an alphabet-indexed\r
+// table in C initializer form and the function returns; otherwise the\r
+// m_RowCount x m_ColCount grid is printed with optional sequence letters.\r
+void MxBase::LogMe(bool WithData, int Opts) const\r
+ {\r
+ Log("\n");\r
+ if (Opts & OPT_EXP)\r
+ Log("Exp ");\r
+ else if (Opts & OPT_LOG)\r
+ Log("Log ");\r
+ bool ZeroBased = ((Opts & OPT_ZERO_BASED) != 0);\r
+ Log("%s(%p) Rows %u/%u, Cols %u/%u",\r
+ m_Name, this,\r
+ m_RowCount, m_AllocatedRowCount,\r
+ m_ColCount, m_AllocatedColCount);\r
+ // Labels come from the SeqDB when one is set, else from the SeqData.\r
+ if (m_SeqDB != 0 && m_IdA != UINT_MAX)\r
+ Log(", A=%s", m_SeqDB->GetLabel(m_IdA));\r
+ else if (m_SA != 0)\r
+ Log(", A=%s", m_SA->Label);\r
+ if (m_SeqDB != 0 && m_IdB != UINT_MAX)\r
+ Log(", B=%s", m_SeqDB->GetLabel(m_IdB));\r
+ else if (m_SB != 0)\r
+ Log(", B=%s", m_SB->Label);\r
+ Log("\n");\r
+ if (!WithData || m_RowCount == 0 || m_ColCount == 0)\r
+ return;\r
+\r
+ // Column width is taken from the printed form of cell (0,0); Mod is\r
+ // 10^Width so that column numbers fit in that width.\r
+ const char *z = GetAsStr(0, 0);\r
+ unsigned Width = strlen(z);\r
+ unsigned Mod = 1;\r
+ for (unsigned i = 0; i < Width; ++i)\r
+ Mod *= 10;\r
+\r
+ if (m_Alpha[0] != 0)\r
+ {\r
+ // Alphabet table; cells are looked up by the letters' character codes.\r
+ Log("// Alphabet=%s\n", m_Alpha);\r
+ Log("// ");\r
+ unsigned n = strlen(m_Alpha);\r
+ for (unsigned j = 0; j < n; ++j)\r
+ Log(" %*c", Width, m_Alpha[j]);\r
+ Log("\n");\r
+ for (unsigned i = 0; i < n; ++i)\r
+ {\r
+ Log("/* %c */ {", m_Alpha[i]);\r
+ unsigned ci = m_Alpha[i];\r
+ for (unsigned j = 0; j < n; ++j)\r
+ {\r
+ unsigned cj = m_Alpha[j];\r
+ Log("%s,", GetAsStr(ci, cj));\r
+ }\r
+ Log("}, // %c\n", m_Alpha[i]);\r
+ }\r
+ return;\r
+ }\r
+ else if (m_Alpha2[0] != 0)\r
+ {\r
+ // Second alphabet form; here cells are looked up by position (i, j) in\r
+ // the alphabet string, and ci is assigned but not used.\r
+ unsigned n = strlen(m_Alpha2);\r
+ Log("// Alphabet=%s\n", m_Alpha2);\r
+ Log("// ");\r
+ for (unsigned j = 0; j < n; ++j)\r
+ Log(" %*c", Width, m_Alpha2[j]);\r
+ Log("\n");\r
+ for (unsigned i = 0; i < n; ++i)\r
+ {\r
+ Log("/* %c */ {", m_Alpha2[i]);\r
+ unsigned ci = m_Alpha2[i];\r
+ for (unsigned j = 0; j < n; ++j)\r
+ Log("%s,", GetAsStr(i, j));\r
+ Log("}, // %c\n", m_Alpha2[i]);\r
+ }\r
+ return;\r
+ }\r
+\r
+ // Sequence letters used to label rows (A) and columns (B), if available.\r
+ const byte *A = 0;\r
+ const byte *B = 0;\r
+ if (m_SeqDB != 0 && m_IdA != UINT_MAX)\r
+ A = m_SeqDB->GetSeq(m_IdA);\r
+ else if (m_SA != 0)\r
+ A = m_SA->Seq;\r
+ if (m_SeqDB != 0 && m_IdB != UINT_MAX)\r
+ B = m_SeqDB->GetSeq(m_IdB);\r
+ else if (m_SB != 0)\r
+ B = m_SB->Seq;\r
+\r
+ // Header line with the letters of B.\r
+ if (B != 0)\r
+ {\r
+ if (A != 0)\r
+ Log(" ");\r
+ Log("%5.5s", "");\r
+ if (ZeroBased)\r
+ for (unsigned j = 0; j < m_ColCount; ++j)\r
+ Log("%*c", Width, B[j]);\r
+ else\r
+ for (unsigned j = 0; j < m_ColCount; ++j)\r
+ Log("%*c", Width, j == 0 ? ' ' : B[j-1]);\r
+ Log("\n");\r
+ }\r
+\r
+ // Header line with the column numbers.\r
+ if (A != 0)\r
+ Log(" ");\r
+ Log("%5.5s", "");\r
+ for (unsigned j = 0; j < m_ColCount; ++j)\r
+ Log("%*u", Width, j%Mod);\r
+ Log("\n");\r
+\r
+ // One line per row: optional letter of A, the row number, then the cells.\r
+ for (unsigned i = 0; i < m_RowCount; ++i)\r
+ {\r
+ if (A != 0)\r
+ {\r
+ if (ZeroBased)\r
+ Log("%c ", A[i]);\r
+ else\r
+ Log("%c ", i == 0 ? ' ' : A[i-1]);\r
+ }\r
+ Log("%4u ", i);\r
+ \r
+ for (unsigned j = 0; j < m_ColCount; ++j)\r
+ {\r
+ const char *s = GetAsStr(i, j);\r
+ if (Opts & OPT_LOG)\r
+ s = LogizeStr(s);\r
+ else if (Opts & OPT_EXP)\r
+ s = ExpizeStr(s);\r
+ Log("%s", s);\r
+ }\r
+ Log("\n");\r
+ }\r
+ }\r
+static unsigned g_MatrixFileCount;\r
+\r
+// Log the class-wide allocation counters and byte totals as a small table.\r
+void MxBase::LogCounts()\r
+    {\r
+    const unsigned Allocs = m_AllocCount;\r
+    const unsigned ZeroAllocs = m_ZeroAllocCount;\r
+    const unsigned Grows = m_GrowAllocCount;\r
+\r
+    Log("\n");\r
+    Log("MxBase::LogCounts()\n");\r
+    Log(" What N\n");\r
+    Log("---------- ----------\n");\r
+    Log(" Allocs %10u\n", Allocs);\r
+    Log("ZeroAllocs %10u\n", ZeroAllocs);\r
+    Log(" Grows %10u\n", Grows);\r
+    // MemBytesToStr is called once per Log call, as in the table rows above.\r
+    Log(" Bytes %10.10s\n", MemBytesToStr(m_TotalBytes));\r
+    Log(" Max bytes %10.10s\n", MemBytesToStr(m_MaxBytes));\r
+    }\r
--- /dev/null
+#ifndef mx_h\r
+#define mx_h\r
+\r
+#include <list>\r
+#include <limits.h>\r
+#include <math.h>\r
+#include "timing.h"\r
+#include "myutils.h"\r
+\r
+const int OPT_LOG = 0x01;\r
+const int OPT_EXP = 0x02;\r
+const int OPT_ZERO_BASED = 0x04;\r
+const float MINUS_INFINITY = -9e9f;\r
+const float UNINIT = -8e8f;\r
+\r
+struct SeqData;\r
+\r
+// Generic fallback: converting a value to text is only supported through\r
+// the explicit specialisations that follow; reaching this version calls Die().\r
+template<class T> const char *TypeToStr(T t)\r
+ {\r
+ Die("Unspecialised TypeToStr() called");\r
+ ureturn(0);\r
+ }\r
+\r
+// Integer specialisations. Each formats into its own function-local static\r
+// buffer and returns a pointer to it, so the text is overwritten by the next\r
+// call for the same type.\r
+\r
+// unsigned short: right-justified in 12 columns.\r
+template<> inline const char *TypeToStr<unsigned short>(unsigned short f)\r
+    {\r
+    static char Text[16];\r
+    sprintf(Text, "%12u", f);\r
+    return Text;\r
+    }\r
+\r
+// short: right-justified in 12 columns.\r
+template<> inline const char *TypeToStr<short>(short f)\r
+    {\r
+    static char Text[16];\r
+    sprintf(Text, "%12d", f);\r
+    return Text;\r
+    }\r
+\r
+// int: right-justified in 5 columns.\r
+template<> inline const char *TypeToStr<int>(int f)\r
+    {\r
+    static char Text[16];\r
+    sprintf(Text, "%5d", f);\r
+    return Text;\r
+    }\r
+\r
+// float: 12-column text. UNINIT prints as "?", values below MINUS_INFINITY/2\r
+// as "*", exact zero as ".", values in [-1e5, 1e5] in fixed notation and\r
+// everything else in %g notation. Returns a function-local static buffer.\r
+template<> inline const char *TypeToStr<float>(float f)\r
+    {\r
+    static char Text[16];\r
+\r
+    // Pick a placeholder symbol for the special values, if any.\r
+    const char *Symbol = 0;\r
+    if (f == UNINIT)\r
+        Symbol = "?";\r
+    else if (f < MINUS_INFINITY/2)\r
+        Symbol = "*";\r
+    else if (f == 0.0f)\r
+        Symbol = ".";\r
+\r
+    if (Symbol != 0)\r
+        sprintf(Text, "%12.12s", Symbol);\r
+    else if (f >= -1e5 && f <= 1e5)\r
+        sprintf(Text, "%12.5f", f);\r
+    else\r
+        sprintf(Text, "%12.4g", f);\r
+    return Text;\r
+    }\r
+\r
+// double: 12-column text. Values below -1e9 print as "*", exact zero as ".",\r
+// values in [-1e5, 1e5] in fixed notation and everything else in %g notation.\r
+// Returns a function-local static buffer.\r
+template<> inline const char *TypeToStr<double>(double f)\r
+    {\r
+    static char s[16];\r
+\r
+    if (f < -1e9)\r
+        sprintf(s, "%12.12s", "*");\r
+    else if (f == 0.0f)\r
+        sprintf(s, "%12.12s", ".");\r
+    // Lower bound was -1e-5, which sent ordinary negative values down the\r
+    // %g path; use the same symmetric range as the float specialisation.\r
+    else if (f >= -1e5 && f <= 1e5)\r
+        sprintf(s, "%12.5f", f);\r
+    else\r
+        sprintf(s, "%12.4g", f);\r
+    return s;\r
+    }\r
+\r
+// Format f with TypeToStr<float> and copy the text into the caller's string,\r
+// so the result does not depend on TypeToStr's shared static buffer.\r
+// The returned pointer is s.c_str().\r
+static inline const char *FloatToStr(float f, string &s)\r
+    {\r
+    const char *Formatted = TypeToStr<float>(f);\r
+    s = Formatted;\r
+    return s.c_str();\r
+    }\r
+\r
+// Single-character specialisations. Each uses a two-byte static buffer whose\r
+// second byte is never written, so it keeps its zero initial value and acts\r
+// as the terminator.\r
+\r
+// char: the character itself.\r
+template<> inline const char *TypeToStr<char>(char c)\r
+    {\r
+    static char Text[2];\r
+    Text[0] = c;\r
+    return Text;\r
+    }\r
+\r
+// byte: the byte value as a character.\r
+template<> inline const char *TypeToStr<byte>(byte c)\r
+    {\r
+    static char Text[2];\r
+    Text[0] = c;\r
+    return Text;\r
+    }\r
+\r
+// bool: 'T' or 'F'.\r
+template<> inline const char *TypeToStr<bool>(bool tof)\r
+    {\r
+    static char Text[2];\r
+    if (tof)\r
+        Text[0] = 'T';\r
+    else\r
+        Text[0] = 'F';\r
+    return Text;\r
+    }\r
+\r
+struct SeqDB;\r
+\r
+// Type-independent part of a 2-D matrix: name, dimensions, optional links to\r
+// the sequences labelling rows (A) and columns (B), and class-wide allocation\r
+// statistics. Element storage is supplied by the derived class through\r
+// AllocData / FreeData / GetAsStr.\r
+struct MxBase\r
+    {\r
+private:\r
+    // Copying is disabled (declared, not defined).\r
+    MxBase(const MxBase &rhs);\r
+    MxBase &operator=(const MxBase &rhs);\r
+\r
+public:\r
+    char m_Name[32];\r
+    char m_Alpha[32];\r
+    char m_Alpha2[32];\r
+    unsigned m_RowCount;\r
+    unsigned m_ColCount;\r
+    unsigned m_AllocatedRowCount;\r
+    unsigned m_AllocatedColCount;\r
+    const SeqDB *m_SeqDB;\r
+    unsigned m_IdA;\r
+    unsigned m_IdB;\r
+    const SeqData *m_SA;\r
+    const SeqData *m_SB;\r
+\r
+    static list<MxBase *> *m_Matrices;\r
+    //static MxBase *Get(const string &Name);\r
+    //static float **Getf(const string &Name);\r
+    //static double **Getd(const string &Name);\r
+    //static char **Getc(const string &Name);\r
+\r
+    static unsigned m_AllocCount;\r
+    static unsigned m_ZeroAllocCount;\r
+    static unsigned m_GrowAllocCount;\r
+    static double m_TotalBytes;\r
+    static double m_MaxBytes;\r
+\r
+    static void OnCtor(MxBase *Mx);\r
+    static void OnDtor(MxBase *Mx);\r
+\r
+    MxBase()\r
+        {\r
+        // Give every member a defined value: LogMe() tests m_Alpha[0],\r
+        // m_Alpha2[0], m_SA and m_SB, and prints m_Name, even when Alloc()\r
+        // or SetAlpha() have never been called.\r
+        m_Name[0] = 0;\r
+        m_Alpha[0] = 0;\r
+        m_Alpha2[0] = 0;\r
+        m_AllocatedRowCount = 0;\r
+        m_AllocatedColCount = 0;\r
+        m_RowCount = 0;\r
+        m_ColCount = 0;\r
+        m_IdA = UINT_MAX;\r
+        m_IdB = UINT_MAX;\r
+        m_SeqDB = 0;\r
+        m_SA = 0;\r
+        m_SB = 0;\r
+        OnCtor(this);\r
+        }\r
+    virtual ~MxBase()\r
+        {\r
+        OnDtor(this);\r
+        }\r
+\r
+    virtual unsigned GetTypeSize() const = 0;\r
+    virtual unsigned GetBytes() const = 0;\r
+\r
+    // Release the element storage and reset dimensions and sequence links.\r
+    void Clear()\r
+        {\r
+        FreeData();\r
+        m_AllocatedRowCount = 0;\r
+        m_AllocatedColCount = 0;\r
+        m_RowCount = 0;\r
+        m_ColCount = 0;\r
+        m_IdA = UINT_MAX;\r
+        m_IdB = UINT_MAX;\r
+        m_SA = 0;\r
+        m_SB = 0;\r
+        }\r
+\r
+    bool Empty() const\r
+        {\r
+        return m_RowCount == 0;\r
+        }\r
+\r
+    virtual void AllocData(unsigned RowCount, unsigned ColCount) = 0;\r
+    virtual void FreeData() = 0;\r
+    virtual const char *GetAsStr(unsigned i, unsigned j) const = 0;\r
+\r
+    // Store the alphabet used by LogMe(); text longer than the buffer is\r
+    // truncated.\r
+    void SetAlpha(const char *Alpha)\r
+        {\r
+        // Reserve the last byte for the terminator. Using sizeof(m_Alpha)\r
+        // as the index wrote one byte past the end of the array.\r
+        unsigned n = sizeof(m_Alpha) - 1;\r
+        strncpy(m_Alpha, Alpha, n);\r
+        m_Alpha[n] = 0;\r
+        }\r
+\r
+    void Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+      const SeqDB *DB, unsigned IdA, unsigned IdB,\r
+      const SeqData *SA, const SeqData *SB);\r
+\r
+    void Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+      const SeqDB *DB = 0, unsigned IdA = UINT_MAX, unsigned IdB = UINT_MAX);\r
+\r
+    void Alloc(const char *Name, unsigned RowCount, unsigned ColCount,\r
+      const SeqData *SA, const SeqData *SB);\r
+\r
+    // Log one line per registered matrix that has allocated rows, followed\r
+    // by the total size in MB (1 MB = 1e6 bytes here).\r
+    static void LogAll()\r
+        {\r
+        Log("\n");\r
+        if (m_Matrices == 0)\r
+            {\r
+            Log("MxBase::m_Matrices=0\n");\r
+            return;\r
+            }\r
+        Log("\n");\r
+        Log("AllRows AllCols Sz MB Name\n");\r
+        Log("------- ------- ---- -------- ----\n");\r
+        double TotalMB = 0;\r
+        for (list<MxBase *>::const_iterator p = m_Matrices->begin();\r
+          p != m_Matrices->end(); ++p)\r
+            {\r
+            const MxBase *Mx = *p;\r
+            if (Mx == 0)\r
+                continue;\r
+            //if (Mx->m_RowCount != 0 || ShowEmpty)\r
+            // Mx->LogMe(WithData);\r
+            unsigned ar = Mx->m_AllocatedRowCount;\r
+            if (ar == 0)\r
+                continue;\r
+            unsigned ac = Mx->m_AllocatedColCount;\r
+            unsigned sz = Mx->GetTypeSize();\r
+            double MB = (double) ar*(double) ac*(double) sz/1e6;\r
+            TotalMB += MB;\r
+            Log("%7u %7u %4u %8.2f %s\n", ar, ac, sz, MB, Mx->m_Name);\r
+            }\r
+        Log(" --------\n");\r
+        Log("%7.7s %7.7s %4.4s %8.2f\n", "", "", "", TotalMB);\r
+        }\r
+\r
+    void LogMe(bool WithData = true, int Opts = 0) const;\r
+    static void LogCounts();\r
+    };\r
+\r
+template<class T> struct Mx : public MxBase\r
+ {\r
+// Disable unimplemented stuff\r
+private:\r
+ Mx(Mx &rhs);\r
+ Mx &operator=(Mx &rhs);\r
+ // const Mx &operator=(const Mx &rhs) const;\r
+\r
+public:\r
+ T **m_Data;\r
+\r
+ Mx()\r
+ {\r
+ m_Data = 0;\r
+ }\r
+ \r
+ ~Mx()\r
+ {\r
+ FreeData();\r
+ }\r
+\r
+ virtual void AllocData(unsigned RowCount, unsigned ColCount)\r
+ {\r
+ if (opt_logmemgrows)\r
+ Log("MxBase::AllocData(%u,%u) %s bytes, Name=%s\n",\r
+ RowCount, ColCount, IntToStr(GetBytes()), m_Name);\r
+ // m_Data = myalloc<T *>(RowCount);\r
+ m_Data = MYALLOC(T *, RowCount, Mx);\r
+ for (unsigned i = 0; i < RowCount; ++i)\r
+ // m_Data[i] = myalloc<T>(ColCount);\r
+ m_Data[i] = MYALLOC(T, ColCount, Mx);\r
+ AddBytes("Mx_AllocData", RowCount*sizeof(T *) + RowCount*ColCount*sizeof(T));\r
+\r
+ m_AllocatedRowCount = RowCount;\r
+ m_AllocatedColCount = ColCount;\r
+ }\r
+\r
+ virtual void FreeData()\r
+ {\r
+ for (unsigned i = 0; i < m_AllocatedRowCount; ++i)\r
+ MYFREE(m_Data[i], m_AllocatedColCount, Mx);\r
+ MYFREE(m_Data, m_AllocatedRowCount, Mx);\r
+ SubBytes("Mx_AllocData",\r
+ m_AllocatedRowCount*sizeof(T *) + m_AllocatedRowCount*m_AllocatedColCount*sizeof(T));\r
+\r
+ m_Data = 0;\r
+ m_RowCount = 0;\r
+ m_ColCount = 0;\r
+ m_AllocatedRowCount = 0;\r
+ m_AllocatedColCount = 0;\r
+ }\r
+\r
+ T **GetData()\r
+ {\r
+ return (T **) m_Data;\r
+ }\r
+\r
+ T Get(unsigned i, unsigned j) const\r
+ {\r
+ assert(i < m_RowCount);\r
+ assert(j < m_ColCount);\r
+ return m_Data[i][j];\r
+ }\r
+\r
+ void Put(unsigned i, unsigned j, T x) const\r
+ {\r
+ assert(i < m_RowCount);\r
+ assert(j < m_ColCount);\r
+ m_Data[i][j] = x;\r
+ }\r
+\r
+ T GetOffDiagAvgs(vector<T> &Avgs) const\r
+ {\r
+ if (m_RowCount != m_ColCount)\r
+ Die("GetOffDiagAvgs, not symmetrical");\r
+ Avgs.clear();\r
+ T Total = T(0);\r
+ for (unsigned i = 0; i < m_RowCount; ++i)\r
+ {\r
+ T Sum = T(0);\r
+ for (unsigned j = 0; j < m_ColCount; ++j)\r
+ {\r
+ if (j == i)\r
+ continue;\r
+ Sum += m_Data[i][j];\r
+ }\r
+ T Avg = Sum/(m_RowCount-1);\r
+ Total += Avg;\r
+ Avgs.push_back(Avg);\r
+ }\r
+ return m_RowCount == 0 ? T(0) : Total/m_RowCount;\r
+ }\r
+\r
+ unsigned GetTypeSize() const\r
+ {\r
+ return sizeof(T);\r
+ }\r
+\r
+ virtual unsigned GetBytes() const\r
+ {\r
+ return m_AllocatedRowCount*m_AllocatedColCount*GetTypeSize() +\r
+ m_AllocatedRowCount*sizeof(T *);\r
+ }\r
+\r
+ const char *GetAsStr(unsigned i, unsigned j) const\r
+ {\r
+ return TypeToStr<T>(Get(i, j));\r
+ }\r
+\r
+ const T *const *const GetData() const\r
+ {\r
+ return (const T *const *) m_Data;\r
+ }\r
+\r
+ void Copy(const Mx<T> &rhs)\r
+ {\r
+ Alloc("Copy", rhs.m_RowCount, rhs.m_ColCount, rhs.m_SeqDB, rhs.m_IdA, rhs.m_IdB);\r
+ const T * const *Data = rhs.GetData();\r
+ for (unsigned i = 0; i < m_RowCount; ++i)\r
+ for (unsigned j = 0; j < m_ColCount; ++j)\r
+ m_Data[i][j] = Data[i][j];\r
+ }\r
+\r
+ void Assign(T v)\r
+ {\r
+ for (unsigned i = 0; i < m_RowCount; ++i)\r
+ for (unsigned j = 0; j < m_ColCount; ++j)\r
+ m_Data[i][j] = v;\r
+ }\r
+\r
+ bool Eq(const Mx &rhs, bool Bwd = false) const\r
+ {\r
+ if (rhs.m_ColCount != m_ColCount)\r
+ return false;\r
+ if (rhs.m_RowCount != m_RowCount)\r
+ return false;\r
+ const T * const*d = rhs.GetData();\r
+ int i1 = Bwd ? m_RowCount : 0;\r
+ int j1 = Bwd ? m_ColCount : 0;\r
+ int i2 = Bwd ? -1 : m_RowCount;\r
+ int j2 = Bwd ? -1 : m_ColCount;\r
+ for (int i = i1; i != i2; Bwd ? --i : ++i)\r
+ for (int j = j1; j != j2; Bwd ? --j : ++j)\r
+ {\r
+ float x = m_Data[i][j];\r
+ float y = d[i][j];\r
+ if (x < -1e10 && y < -1e10)\r
+ continue;\r
+ if (!feq(x, y))\r
+ {\r
+ Warning("%s[%d][%d] = %g, %s = %g",\r
+ m_Name, i, j, x, rhs.m_Name, y);\r
+ return false;\r
+ }\r
+ }\r
+ return true;\r
+ }\r
+\r
+ bool EqMask(const Mx &rhs, const Mx<bool> &Mask) const\r
+ {\r
+ if (rhs.m_ColCount != m_ColCount)\r
+ return false;\r
+ if (rhs.m_RowCount != m_RowCount)\r
+ return false;\r
+\r
+ if (Mask.m_ColCount != m_ColCount)\r
+ return false;\r
+ if (Mask.m_RowCount != m_RowCount)\r
+ return false;\r
+\r
+ const T * const*d = rhs.GetData();\r
+ bool Bwd = false;\r
+ int i1 = Bwd ? m_RowCount : 0;\r
+ int j1 = Bwd ? m_ColCount : 0;\r
+ int i2 = Bwd ? -1 : m_RowCount;\r
+ int j2 = Bwd ? -1 : m_ColCount;\r
+ for (int i = i1; i != i2; Bwd ? --i : ++i)\r
+ for (int j = j1; j != j2; Bwd ? --j : ++j)\r
+ {\r
+ if (!Mask.m_Data[i][j])\r
+ continue;\r
+ float x = m_Data[i][j];\r
+ float y = d[i][j];\r
+ if (x < -1e10 && y < -1e10)\r
+ continue;\r
+ if (!feq(x, y))\r
+ {\r
+ Warning("%s[%d][%d] = %g, %s = %g",\r
+ m_Name, i, j, x, rhs.m_Name, y);\r
+ return false;\r
+ }\r
+ }\r
+ return true;\r
+ }\r
+\r
+ void Init(T v)\r
+ {\r
+ for (unsigned i = 0; i < m_RowCount; ++i)\r
+ for (unsigned j = 0; j < m_ColCount; ++j)\r
+ m_Data[i][j] = v;\r
+ }\r
+ };\r
+\r
+void WriteMx(const string &Name, Mx<float> &Mxf);\r
+\r
+// Allocate Mxf as an N x N matrix named "(Reserved)" unless it already has\r
+// allocated rows. When N is left at UINT_MAX the dimension is\r
+// g_MaxInputSeqLength+1.\r
+template<class T> void ReserveMx(Mx<T> &Mxf, unsigned N = UINT_MAX)\r
+    {\r
+    const bool AlreadyAllocated = (Mxf.m_AllocatedRowCount > 0);\r
+    if (AlreadyAllocated)\r
+        return;\r
+\r
+    extern unsigned g_MaxInputSeqLength;\r
+    const unsigned Dim = (N == UINT_MAX) ? g_MaxInputSeqLength+1 : N;\r
+    Mxf.Alloc("(Reserved)", Dim, Dim);\r
+    }\r
+\r
+#endif // mx_h\r
--- /dev/null
+#ifndef MY_VERSION\r
+#define MY_VERSION "4.2"\r
+#endif\r
+\r
+STR_OPT( input, 0)\r
+STR_OPT( query, 0)\r
+STR_OPT( db, 0)\r
+STR_OPT( sort, 0)\r
+STR_OPT( output, 0)\r
+STR_OPT( uc, 0)\r
+STR_OPT( clstr2uc, 0)\r
+STR_OPT( uc2clstr, 0)\r
+STR_OPT( uc2fasta, 0)\r
+STR_OPT( uc2fastax, 0)\r
+STR_OPT( mergesort, 0)\r
+STR_OPT( tmpdir, ".")\r
+STR_OPT( staralign, 0)\r
+STR_OPT( sortuc, 0)\r
+STR_OPT( blastout, 0)\r
+STR_OPT( blast6out, 0)\r
+STR_OPT( fastapairs, 0)\r
+STR_OPT( idchar, "|")\r
+STR_OPT( diffchar, " ")\r
+STR_OPT( uchime, 0)\r
+STR_OPT( gapopen, 0)\r
+STR_OPT( gapext, 0)\r
+STR_OPT( uhire, 0)\r
+STR_OPT( ids, "99,98,95,90,85,80,70,50,35")\r
+STR_OPT( seeds, 0)\r
+STR_OPT( clump, 0)\r
+STR_OPT( clumpout, 0)\r
+STR_OPT( clump2fasta, 0)\r
+STR_OPT( clumpfasta, 0)\r
+STR_OPT( hireout, 0)\r
+STR_OPT( mergeclumps, 0)\r
+STR_OPT( alpha, 0)\r
+STR_OPT( hspalpha, 0)\r
+STR_OPT( probmx, 0)\r
+STR_OPT( matrix, 0)\r
+STR_OPT( tracestate, 0)\r
+STR_OPT( chainout, 0)\r
+STR_OPT( cluster, 0)\r
+STR_OPT( computekl, 0)\r
+STR_OPT( userout, 0)\r
+STR_OPT( userfields, 0)\r
+STR_OPT( seedsout, 0)\r
+STR_OPT( chainhits, 0)\r
+STR_OPT( findorfs, 0)\r
+STR_OPT( strand, 0)\r
+STR_OPT( getseqs, 0)\r
+STR_OPT( labels, 0)\r
+STR_OPT( doug, 0)\r
+STR_OPT( makeindex, 0)\r
+STR_OPT( indexstats, 0)\r
+STR_OPT( uchimeout, 0)\r
+STR_OPT( uchimealns, 0)\r
+STR_OPT( xframe, 0)\r
+STR_OPT( mkctest, 0)\r
+STR_OPT( allpairs, 0)\r
+STR_OPT( fastq2fasta, 0)\r
+STR_OPT( otusort, 0)\r
+STR_OPT( sparsedist, 0)\r
+STR_OPT( sparsedistparams, 0)\r
+STR_OPT( mcc, 0)\r
+STR_OPT( utax, 0)\r
+STR_OPT( simcl, 0)\r
+STR_OPT( absort, 0)\r
+STR_OPT( cc, 0)\r
+STR_OPT( uslink, 0)\r
+\r
+UNS_OPT( band, 16, 0, UINT_MAX)\r
+UNS_OPT( minlen, 10, 1, UINT_MAX)\r
+UNS_OPT( maxlen, 10000, 1, UINT_MAX)\r
+UNS_OPT( w, 0, 1, UINT_MAX)\r
+UNS_OPT( k, 0, 1, UINT_MAX)\r
+UNS_OPT( stepwords, 8, 0, UINT_MAX)\r
+UNS_OPT( maxaccepts, 1, 0, UINT_MAX)\r
+UNS_OPT( maxrejects, 8, 0, UINT_MAX)\r
+UNS_OPT( maxtargets, 0, 0, UINT_MAX)\r
+UNS_OPT( minhsp, 32, 1, UINT_MAX)\r
+UNS_OPT( bump, 50, 0, 100)\r
+UNS_OPT( rowlen, 64, 8, UINT_MAX)\r
+UNS_OPT( idprefix, 0, 0, UINT_MAX)\r
+UNS_OPT( idsuffix, 0, 0, UINT_MAX)\r
+UNS_OPT( chunks, 4, 2, UINT_MAX)\r
+UNS_OPT( minchunk, 64, 2, UINT_MAX)\r
+UNS_OPT( maxclump, 1000, 1, UINT_MAX)\r
+UNS_OPT( iddef, 0, 0, UINT_MAX)\r
+UNS_OPT( mincodons, 20, 1, UINT_MAX)\r
+UNS_OPT( maxovd, 8, 0, UINT_MAX)\r
+UNS_OPT( max2, 40, 0, UINT_MAX)\r
+UNS_OPT( querylen, 500, 0, UINT_MAX)\r
+UNS_OPT( targetlen, 500, 0, UINT_MAX)\r
+UNS_OPT( orfstyle, (1+2+4), 0, UINT_MAX)\r
+UNS_OPT( dbstep, 1, 1, UINT_MAX)\r
+UNS_OPT( randseed, 1, 0, UINT_MAX)\r
+UNS_OPT( maxp, 2, 2, UINT_MAX)\r
+UNS_OPT( idsmoothwindow, 32, 1, UINT_MAX)\r
+UNS_OPT( mindiffs, 3, 1, UINT_MAX)\r
+UNS_OPT( maxspan1, 24, 1, UINT_MAX)\r
+UNS_OPT( maxspan2, 24, 1, UINT_MAX)\r
+UNS_OPT( minorfcov, 16, 1, UINT_MAX)\r
+UNS_OPT( hashsize, 4195879, 1, UINT_MAX)\r
+UNS_OPT( maxpoly, 0, 0, UINT_MAX)\r
+UNS_OPT( droppct, 50, 0, 100)\r
+UNS_OPT( secs, 10, 0, UINT_MAX)\r
+UNS_OPT( maxqgap, 0, 0, UINT_MAX)\r
+UNS_OPT( maxtgap, 0, 0, UINT_MAX)\r
+\r
+INT_OPT( frame, 0, -3, +3)\r
+\r
+TOG_OPT( trace, false)\r
+TOG_OPT( logmemgrows, false)\r
+TOG_OPT( trunclabels, false)\r
+TOG_OPT( verbose, false)\r
+TOG_OPT( wordcountreject, true)\r
+TOG_OPT( rev, false)\r
+TOG_OPT( output_rejects, false)\r
+TOG_OPT( blast_termgaps, false)\r
+TOG_OPT( fastalign, true)\r
+TOG_OPT( flushuc, false)\r
+TOG_OPT( stable_sort, false)\r
+TOG_OPT( minus_frames, true)\r
+TOG_OPT( usort, true)\r
+TOG_OPT( nb, false)\r
+TOG_OPT( twohit, true)\r
+TOG_OPT( ssort, false)\r
+TOG_OPT( log_query, false)\r
+TOG_OPT( log_hothits, false)\r
+TOG_OPT( logwordstats, false)\r
+TOG_OPT( ucl, false)\r
+TOG_OPT( skipgaps2, true)\r
+TOG_OPT( skipgaps, true)\r
+TOG_OPT( denovo, false)\r
+TOG_OPT( cartoon_orfs, false)\r
+TOG_OPT( label_ab, false)\r
+TOG_OPT( wordweight, false)\r
+TOG_OPT( isort, false)\r
+TOG_OPT( selfid, false)\r
+TOG_OPT( leftjust, false)\r
+TOG_OPT( rightjust, false)\r
+\r
+FLT_OPT( id, 0.0, 0.0, 1.0)\r
+FLT_OPT( weak_id, 0.0, 0.0, 1.0)\r
+FLT_OPT( match, 1.0, 0.0, FLT_MAX)\r
+FLT_OPT( mismatch, -2.0, 0.0, FLT_MAX)\r
+FLT_OPT( split, 1000.0, 1.0, FLT_MAX)\r
+FLT_OPT( evalue, 10.0, 0.0, FLT_MAX)\r
+FLT_OPT( weak_evalue, 10.0, 0.0, FLT_MAX)\r
+FLT_OPT( evalue_g, 10.0, 0.0, FLT_MAX)\r
+FLT_OPT( chain_evalue, 10.0, 0.0, FLT_MAX)\r
+FLT_OPT( xdrop_u, 16.0, 0.0, FLT_MAX)\r
+FLT_OPT( xdrop_g, 32.0, 0.0, FLT_MAX)\r
+FLT_OPT( xdrop_ug, 16.0, 0.0, FLT_MAX)\r
+FLT_OPT( xdrop_nw, 16.0, 0.0, FLT_MAX)\r
+FLT_OPT( ka_gapped_lambda, 0.0, 0.0, FLT_MAX)\r
+FLT_OPT( ka_ungapped_lambda, 0.0, 0.0, FLT_MAX)\r
+FLT_OPT( ka_gapped_k, 0.0, 0.0, FLT_MAX)\r
+FLT_OPT( ka_ungapped_k, 0.0, 0.0, FLT_MAX)\r
+FLT_OPT( ka_dbsize, 0.0, 0.0, FLT_MAX)\r
+FLT_OPT( chain_targetfract, 0.0, 0.0, 1.0)\r
+FLT_OPT( targetfract, 0.0, 0.0, 1.0)\r
+FLT_OPT( queryfract, 0.0, 0.0, 1.0)\r
+FLT_OPT( fspenalty, 16.0, 0.0, FLT_MAX)\r
+FLT_OPT( sspenalty, 20.0, 0.0, FLT_MAX)\r
+FLT_OPT( seedt1, 13.0, 0.0, FLT_MAX)\r
+FLT_OPT( seedt2, 11.0, 0.0, FLT_MAX)\r
+FLT_OPT( lopen, 11.0, 0.0, FLT_MAX)\r
+FLT_OPT( lext, 1.0, 0.0, FLT_MAX)\r
+FLT_OPT( minh, 0.3, 0.0, FLT_MAX)\r
+FLT_OPT( xn, 8.0, 0.0, FLT_MAX)\r
+FLT_OPT( dn, 1.4, 0.0, FLT_MAX)\r
+FLT_OPT( xa, 1.0, 0.0, FLT_MAX)\r
+FLT_OPT( mindiv, 0.5, 0.0, 100.0)\r
+FLT_OPT( abskew, 2, 0.0, 100.0)\r
+FLT_OPT( abx, 8.0, 0.0, 100.0)\r
+FLT_OPT( minspanratio1, 0.7, 0.0, 1.0)\r
+FLT_OPT( minspanratio2, 0.7, 0.0, 1.0)\r
+\r
+FLAG_OPT( usersort)\r
+FLAG_OPT( exact)\r
+FLAG_OPT( optimal)\r
+FLAG_OPT( self)\r
+FLAG_OPT( ungapped)\r
+FLAG_OPT( global)\r
+FLAG_OPT( local)\r
+FLAG_OPT( xlat)\r
+FLAG_OPT( realign)\r
+FLAG_OPT( hash)\r
+FLAG_OPT( derep)\r
--- /dev/null
+#include <time.h>\r
+#include <stdarg.h>\r
+#include <sys/stat.h>\r
+#include <errno.h>\r
+#include <string.h>\r
+#include <ctype.h>\r
+#include <string>\r
+#include <vector>\r
+#include <set>\r
+#include <map>\r
+#include <signal.h>\r
+#include <float.h>\r
+\r
+#ifdef _MSC_VER\r
+#include <crtdbg.h>\r
+#include <process.h>\r
+#include <windows.h>\r
+#include <psapi.h>\r
+#include <io.h>\r
+#else\r
+#include <sys/time.h>\r
+#include <sys/resource.h>\r
+#include <unistd.h>\r
+#include <errno.h>\r
+#include <fcntl.h>\r
+#include <stdlib.h>\r
+#endif\r
+\r
+#include "myutils.h"\r
+\r
+const char *SVN_VERSION =\r
+#include "svnversion.h"\r
+;\r
+\r
+#define TEST_UTILS 0\r
+\r
+using namespace std;\r
+\r
+const unsigned MY_IO_BUFSIZ = 32000;\r
+const unsigned MAX_FORMATTED_STRING_LENGTH = 64000;\r
+\r
+static char *g_IOBuffers[256];\r
+static time_t g_StartTime = time(0);\r
+static vector<string> g_Argv;\r
+static double g_PeakMemUseBytes;\r
+\r
+#if TEST_UTILS\r
+// Manual smoke test for the progress-reporting helpers; only compiled when\r
+// TEST_UTILS is non-zero.\r
+void TestUtils()\r
+ {\r
+ const int C = 100000000;\r
+ for (int i = 0; i < C; ++i)\r
+ ProgressStep(i, C, "something or other");\r
+\r
+ // Messages of varying length ending in \r and \n.\r
+ Progress("\n");\r
+ Progress("Longer message\r");\r
+ Sleep(1000);\r
+ Progress("Short\r");\r
+ Sleep(1000);\r
+ Progress("And longer again\r");\r
+ Sleep(1000);\r
+ Progress("Shrt\n");\r
+ Sleep(1000);\r
+ const unsigned N = 10;\r
+ unsigned M = 10;\r
+ for (unsigned i = 0; i < N; ++i)\r
+ {\r
+ ProgressStep(i, N, "Allocating 1MB blocks");\r
+ for (unsigned j = 0; j < M; ++j)\r
+ {\r
+ ProgressStep(j, M, "Inner loop"); \r
+ // The allocation is never freed, so memory use grows by 100000\r
+ // bytes per inner step.\r
+ malloc(100000);\r
+ Sleep(500);\r
+ }\r
+ }\r
+ }\r
+#endif // TEST_UTILS\r
+\r
+// Give stream f a full-buffering buffer of MY_IO_BUFSIZ bytes, kept in\r
+// g_IOBuffers under the stream's file descriptor. Streams whose descriptor\r
+// is outside 0..255 are left alone.\r
+static void AllocBuffer(FILE *f)\r
+    {\r
+    const int Slot = fileno(f);\r
+    const bool SlotInTable = (Slot >= 0 && Slot < 256);\r
+    if (!SlotInTable)\r
+        return;\r
+\r
+    // Reuse a buffer left in this slot, otherwise allocate one.\r
+    if (g_IOBuffers[Slot] == 0)\r
+        g_IOBuffers[Slot] = myalloc(char, MY_IO_BUFSIZ);\r
+    setvbuf(f, g_IOBuffers[Slot], _IOFBF, MY_IO_BUFSIZ);\r
+    }\r
+\r
+// Release the buffer recorded in g_IOBuffers for f's file descriptor, if any,\r
+// and clear the table slot.\r
+static void FreeBuffer(FILE *f)\r
+    {\r
+    const int Slot = fileno(f);\r
+    const bool SlotInTable = (Slot >= 0 && Slot < 256);\r
+    if (!SlotInTable || g_IOBuffers[Slot] == 0)\r
+        return;\r
+\r
+    myfree(g_IOBuffers[Slot]);\r
+    g_IOBuffers[Slot] = 0;\r
+    }\r
+\r
+// Seconds elapsed since g_StartTime was initialised.\r
+unsigned GetElapsedSecs()\r
+    {\r
+    const time_t Now = time(0);\r
+    return (unsigned) (Now - g_StartTime);\r
+    }\r
+\r
+static unsigned g_NewCalls;\r
+static unsigned g_FreeCalls;\r
+static double g_InitialMemUseBytes;\r
+static double g_TotalAllocBytes;\r
+static double g_TotalFreeBytes;\r
+static double g_NetBytes;\r
+static double g_MaxNetBytes;\r
+\r
+// Log the allocation counters and byte totals kept in this file's globals.\r
+// "Peak total" is the maximum net bytes plus the initial memory use.\r
+void LogAllocStats()\r
+    {\r
+    const double PeakTotalBytes = g_MaxNetBytes + g_InitialMemUseBytes;\r
+\r
+    Log("\n");\r
+    Log(" Allocs %u\n", g_NewCalls);\r
+    Log(" Frees %u\n", g_FreeCalls);\r
+    Log("Initial alloc %s\n", MemBytesToStr(g_InitialMemUseBytes));\r
+    Log(" Total alloc %s\n", MemBytesToStr(g_TotalAllocBytes));\r
+    Log(" Total free %s\n", MemBytesToStr(g_TotalFreeBytes));\r
+    Log(" Net bytes %s\n", MemBytesToStr(g_NetBytes));\r
+    Log("Max net bytes %s\n", MemBytesToStr(g_MaxNetBytes));\r
+    Log(" Peak total %s\n", MemBytesToStr(PeakTotalBytes));\r
+    }\r
+\r
+// True when stat() succeeds on FileName.\r
+bool StdioFileExists(const string &FileName)\r
+    {\r
+    struct stat Info;\r
+    return stat(FileName.c_str(), &Info) == 0;\r
+    }\r
+\r
+// Report a failed assertion through Die().\r
+//   Exp   text of the expression that failed\r
+//   File  source file containing the assertion\r
+//   Line  line number of the assertion\r
+void myassertfail(const char *Exp, const char *File, unsigned Line)\r
+ {\r
+ Die("%s(%u) assert failed: %s", File, Line, Exp);\r
+ }\r
+\r
+// bool wrapper around isatty(): true when isatty(fd) returns non-zero.\r
+bool myisatty(int fd)\r
+    {\r
+    const int Result = isatty(fd);\r
+    return Result != 0;\r
+    }\r
+\r
+#ifdef _MSC_VER\r
+#include <io.h>\r
+// MSVC stand-in for fseeko(), implemented with _fseeki64().\r
+// Returns -1 when _fseeki64() returns -1, otherwise 0.\r
+int fseeko(FILE *stream, off_t offset, int whence)\r
+ {\r
+ // NOTE(review): despite its name, FilePos holds _fseeki64's status code,\r
+ // not a file position.\r
+ off_t FilePos = _fseeki64(stream, offset, whence);\r
+ return (FilePos == -1L) ? -1 : 0;\r
+ }\r
+#define ftello(fm) (off_t) _ftelli64(fm)\r
+#endif\r
+\r
+// Diagnostic helper: log the stream pointer, descriptor, EOF/error flags and\r
+// the position reported by several position APIs. Used before Die() when a\r
+// file operation fails.\r
+void LogStdioFileState(FILE *f)\r
+ {\r
+ unsigned long tellpos = (unsigned long) ftello(f);\r
+ // fseek to the current position; only its return code is logged.\r
+ long fseek_pos = fseek(f, 0, SEEK_CUR);\r
+ int fd = fileno(f);\r
+ Log("FILE * %p\n", f);\r
+ Log("fileno %d\n", fd);\r
+ Log("feof %d\n", feof(f));\r
+ Log("ferror %d\n", ferror(f));\r
+ // NOTE(review): tellpos is unsigned long but is printed with %ld.\r
+ Log("ftell %ld\n", tellpos);\r
+ Log("fseek %ld\n", fseek_pos);\r
+#if !defined(_GNU_SOURCE) && !defined(__APPLE_CC__)\r
+ // This section casts fpos_t to long, which presumably only compiles where\r
+ // fpos_t is an arithmetic type - hence the platform guard.\r
+ fpos_t fpos;\r
+ int fgetpos_retval = fgetpos(f, &fpos);\r
+ Log("fpos %ld (retval %d)\n", (long) fpos, fgetpos_retval);\r
+// Log("eof %d\n", _eof(fd));\r
+#endif\r
+#ifdef _MSC_VER\r
+ __int64 pos64 = _ftelli64(f);\r
+ Log("_ftelli64 %lld\n", pos64);\r
+#endif\r
+ }\r
+\r
+// Open FileName for binary reading and attach an I/O buffer to the stream.\r
+// Calls Die() with a descriptive message when the file cannot be opened;\r
+// EFBIG gets a dedicated message because it points at the width of off_t.\r
+FILE *OpenStdioFile(const string &FileName)\r
+    {\r
+    const char *Mode = "rb";\r
+    FILE *f = fopen(FileName.c_str(), Mode);\r
+    if (f == 0)\r
+        {\r
+        // Capture errno immediately so that later library calls cannot\r
+        // change the value that gets reported.\r
+        const int Err = errno;\r
+        if (Err == EFBIG)\r
+            {\r
+            if (sizeof(off_t) == 4)\r
+                Die("File too big, off_t is 32 bits, recompile needed");\r
+            else\r
+                // %u expects unsigned; sizeof yields size_t, so convert\r
+                // explicitly.\r
+                Die("Cannot open '%s', file too big (off_t=%u bits)",\r
+                  FileName.c_str(), (unsigned) (sizeof(off_t)*8));\r
+            }\r
+        Die("Cannot open %s, errno=%d %s",\r
+          FileName.c_str(), Err, strerror(Err));\r
+        }\r
+    AllocBuffer(f);\r
+    return f;\r
+    }\r
+\r
+// Create (or truncate) FileName for binary read/write and attach an I/O\r
+// buffer. Calls Die() when the file cannot be created.\r
+FILE *CreateStdioFile(const string &FileName)\r
+    {\r
+    const char *Path = FileName.c_str();\r
+    FILE *f = fopen(Path, "wb+");\r
+    if (f == 0)\r
+        Die("Cannot create %s, errno=%d %s",\r
+          Path, errno, strerror(errno));\r
+    AllocBuffer(f);\r
+    return f;\r
+    }\r
+\r
+// Seek f to absolute offset Pos and verify the stream really is there.\r
+// On failure the stream state is logged and Die() is called.\r
+void SetStdioFilePos(FILE *f, off_t Pos)\r
+    {\r
+    if (f == 0)\r
+        Die("SetStdioFilePos failed, f=NULL");\r
+\r
+    const int SeekResult = fseeko(f, Pos, SEEK_SET);\r
+    const off_t Landed = ftello(f);\r
+    const bool Succeeded = (SeekResult == 0 && Landed == Pos);\r
+    if (Succeeded)\r
+        return;\r
+\r
+    LogStdioFileState(f);\r
+    Die("SetStdioFilePos(%d) failed, Ok=%d NewPos=%d",\r
+      (int) Pos, SeekResult, (int) Landed);\r
+    }\r
+\r
+// Read exactly Bytes bytes from f starting at absolute offset Pos.\r
+// A short read logs the stream state and calls Die().\r
+void ReadStdioFile(FILE *f, off_t Pos, void *Buffer, unsigned Bytes)\r
+    {\r
+    if (f == 0)\r
+        Die("ReadStdioFile failed, f=NULL");\r
+    SetStdioFilePos(f, Pos);\r
+\r
+    const unsigned Got = fread(Buffer, 1, Bytes, f);\r
+    if (Got == Bytes)\r
+        return;\r
+\r
+    LogStdioFileState(f);\r
+    Die("ReadStdioFile failed, attempted %d bytes, read %d bytes, errno=%d",\r
+      (int) Bytes, (int) Got, errno);\r
+    }\r
+\r
+// Read exactly Bytes bytes from f at its current position.\r
+// A short read logs the stream state and calls Die().\r
+void ReadStdioFile(FILE *f, void *Buffer, unsigned Bytes)\r
+    {\r
+    if (f == 0)\r
+        Die("ReadStdioFile failed, f=NULL");\r
+\r
+    const unsigned Got = fread(Buffer, 1, Bytes, f);\r
+    if (Got == Bytes)\r
+        return;\r
+\r
+    LogStdioFileState(f);\r
+    Die("ReadStdioFile failed, attempted %d bytes, read %d bytes, errno=%d",\r
+      (int) Bytes, (int) Got, errno);\r
+    }\r
+\r
+// Return values from functions like lseek, ftell, fgetpos are\r
+// "undefined" for files that cannot seek. Attempt to detect\r
+// whether a file can seek by checking for error returns.\r
+// The stream position is restored before returning true.\r
+bool CanSetStdioFilePos(FILE *f)\r
+    {\r
+// Common special cases\r
+    if (f == stdin || f == stdout || f == stderr)\r
+        return false;\r
+\r
+    // fgetpos, fseek and fsetpos all return 0 on success and non-zero on\r
+    // failure. The previous test "!ok3 || !ok4" treated success as failure,\r
+    // so seekable files were reported as non-seekable.\r
+    fpos_t CurrPos;\r
+    if (fgetpos(f, &CurrPos) != 0)\r
+        return false;\r
+    if (fseek(f, 0, SEEK_END) != 0)\r
+        return false;\r
+    fpos_t EndPos;\r
+    const int GetEndResult = fgetpos(f, &EndPos);\r
+    const int RestoreResult = fsetpos(f, &CurrPos);\r
+    return GetEndResult == 0 && RestoreResult == 0;\r
+    }\r
+\r
+// Read the whole of f into a newly allocated buffer and store the number of\r
+// bytes in FileSize. For seekable streams the complete file is read and the\r
+// current position is preserved; otherwise everything from the current\r
+// position to EOF is read. The buffer is always obtained from\r
+// malloc()/realloc() and is never null.\r
+byte *ReadAllStdioFile(FILE *f, unsigned &FileSize)\r
+    {\r
+    const unsigned BUFF_SIZE = 1024*1024;\r
+\r
+    if (CanSetStdioFilePos(f))\r
+        {\r
+        off_t Pos = GetStdioFilePos(f);\r
+        // Separate local: a local named FileSize used to hide the output\r
+        // parameter, so the caller never received the size.\r
+        off_t Size = GetStdioFileSize(f);\r
+        const unsigned uSize = (unsigned) Size;\r
+        if ((off_t) uSize != Size)\r
+            Die("ReadAllStdioFile: file size > UINT_MAX");\r
+        SetStdioFilePos(f, 0);\r
+        // Use malloc, as the non-seekable path does, so callers get the\r
+        // same kind of buffer either way; ask for at least one byte so an\r
+        // empty file still yields a valid pointer.\r
+        byte *Buffer = (byte *) malloc(uSize > 0 ? uSize : 1);\r
+        if (Buffer == 0)\r
+            Die("ReadAllStdioFile, out of memory");\r
+        ReadStdioFile(f, Buffer, uSize);\r
+        SetStdioFilePos(f, Pos);\r
+        FileSize = uSize;\r
+        return Buffer;\r
+        }\r
+\r
+// Can't seek, read one buffer at a time.\r
+    FileSize = 0;\r
+\r
+    byte *Buffer = 0;\r
+    for (;;)\r
+        {\r
+        if (FileSize > UINT_MAX - BUFF_SIZE)\r
+            Die("ReadAllStdioFile: file size > UINT_MAX");\r
+        // realloc(0, n) behaves like malloc(n), so the first pass allocates.\r
+        byte *Bigger = (byte *) realloc(Buffer, FileSize + BUFF_SIZE);\r
+        if (Bigger == 0)\r
+            {\r
+            free(Buffer);\r
+            Die("ReadAllStdioFile, out of memory");\r
+            }\r
+        Buffer = Bigger;\r
+        unsigned BytesRead = fread(Buffer + FileSize, 1, BUFF_SIZE, f);\r
+        FileSize += BytesRead;\r
+        if (BytesRead < BUFF_SIZE)\r
+            {\r
+            // Trim to the final size. Keep at least one byte, and fall back\r
+            // to the untrimmed buffer if realloc declines to shrink.\r
+            byte *Trimmed = (byte *) realloc(Buffer, FileSize > 0 ? FileSize : 1);\r
+            return Trimmed != 0 ? Trimmed : Buffer;\r
+            }\r
+        }\r
+    }\r
+\r
+byte *ReadAllStdioFile(const std::string &FileName, off_t &FileSize)\r
+ {\r
+#if WIN32\r
+ FILE *f = OpenStdioFile(FileName);\r
+ FileSize = GetStdioFileSize(f);\r
+ CloseStdioFile(f);\r
+\r
+ HANDLE h = CreateFile(FileName.c_str(), GENERIC_READ, FILE_SHARE_READ,\r
+ NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);\r
+ if (h == INVALID_HANDLE_VALUE)\r
+ Die("ReadAllStdioFile:Open(%s) failed", FileName.c_str());\r
+\r
+ unsigned uFileSize = (unsigned) FileSize;\r
+ if ((off_t) uFileSize != FileSize)\r
+ Die("File too big (%.1f Gb): %s", double(FileSize)/1e9, FileName.c_str());\r
+\r
+ byte *Buffer = myalloc(byte, uFileSize);\r
+ DWORD BytesRead;\r
+ ReadFile(h, Buffer, uFileSize, &BytesRead, NULL);\r
+ if (FileSize != BytesRead)\r
+ Die("ReadAllStdioFile:Error reading %s, attempted %u got %u",\r
+ FileName.c_str(), FileSize, (unsigned) BytesRead);\r
+\r
+ CloseHandle(h);\r
+ return Buffer;\r
+#else\r
+ int h = open(FileName.c_str(), O_RDONLY);\r
+ if (h < 0)\r
+ Die("ReadAllStdioFile:Cannot open %s", FileName.c_str());\r
+ FileSize = lseek(h, 0, SEEK_END);\r
+ if (FileSize == (off_t) (-1))\r
+ Die("ReadAllStdioFile:Error seeking %s", FileName.c_str());\r
+ // byte *Buffer = myalloc<byte>(FileSize);\r
+ size_t stBytes = (size_t) FileSize;\r
+ if ((off_t) stBytes != FileSize)\r
+ Die("ReadAllStdioFile: off_t overflow");\r
+ byte *Buffer = (byte *) malloc(stBytes);\r
+ if (Buffer == 0)\r
+ Die("ReadAllStdioFile: failed to allocate %s", MemBytesToStr(stBytes));\r
+ lseek(h, 0, SEEK_SET);\r
+ size_t n = read(h, Buffer, stBytes);\r
+ if (n != FileSize)\r
+ Die("ReadAllStdioFile, Error reading %s, attempted %g got %g",\r
+ FileName.c_str(), (double) FileSize, (double) n);\r
+ close(h);\r
+ return Buffer;\r
+#endif\r
+ }\r
+\r
+// Write exactly Bytes bytes from Buffer to f at absolute offset Pos.\r
+// A short write logs the stream state and calls Die().\r
+void WriteStdioFile(FILE *f, off_t Pos, const void *Buffer, unsigned Bytes)\r
+    {\r
+    if (f == 0)\r
+        Die("WriteStdioFile failed, f=NULL");\r
+    SetStdioFilePos(f, Pos);\r
+\r
+    const unsigned Put = fwrite(Buffer, 1, Bytes, f);\r
+    if (Put == Bytes)\r
+        return;\r
+\r
+    LogStdioFileState(f);\r
+    Die("WriteStdioFile failed, attempted %d bytes, wrote %d bytes, errno=%d",\r
+      (int) Bytes, (int) Put, errno);\r
+    }\r
+\r
+// Write exactly Bytes bytes from Buffer to f at its current position.\r
+// A short write logs the stream state and calls Die().\r
+void WriteStdioFile(FILE *f, const void *Buffer, unsigned Bytes)\r
+    {\r
+    if (f == 0)\r
+        Die("WriteStdioFile failed, f=NULL");\r
+\r
+    const unsigned Put = fwrite(Buffer, 1, Bytes, f);\r
+    if (Put == Bytes)\r
+        return;\r
+\r
+    LogStdioFileState(f);\r
+    Die("WriteStdioFile failed, attempted %d bytes, wrote %d bytes, errno=%d",\r
+      (int) Bytes, (int) Put, errno);\r
+    }\r
+\r
+// Read one line of at most Bytes-1 characters into Line and strip the\r
+// trailing end-of-line characters.\r
+// Return false on EOF, true if line successfully read.\r
+// A line that does not end in '\n' (too long for the buffer, or an\r
+// unterminated last line) is reported through Die().\r
+bool ReadLineStdioFile(FILE *f, char *Line, unsigned Bytes)\r
+    {\r
+    if (feof(f))\r
+        return false;\r
+\r
+    const int Capacity = (int) Bytes;\r
+    if (Capacity < 0)\r
+        Die("ReadLineStdioFile: Bytes < 0");\r
+\r
+    char *Got = fgets(Line, Capacity, f);\r
+    if (Got == NULL)\r
+        {\r
+        if (feof(f))\r
+            return false;\r
+        if (ferror(f))\r
+            Die("ReadLineStdioFile: errno=%d", errno);\r
+        Die("ReadLineStdioFile: fgets=0, feof=0, ferror=0");\r
+        }\r
+\r
+    if (Got != Line)\r
+        Die("ReadLineStdioFile: fgets != Buffer");\r
+\r
+    const unsigned Len = strlen(Line);\r
+    const bool EndsWithNewline = (Len >= 1 && Line[Len-1] == '\n');\r
+    if (!EndsWithNewline)\r
+        Die("ReadLineStdioFile: line too long or missing end-of-line");\r
+\r
+    // Blank out a CR or LF in each of the last two positions.\r
+    for (unsigned Back = 1; Back <= 2; ++Back)\r
+        {\r
+        if (Len < Back)\r
+            break;\r
+        char &Tail = Line[Len - Back];\r
+        if (Tail == '\r' || Tail == '\n')\r
+            Tail = 0;\r
+        }\r
+    return true;\r
+    }\r
+\r
+// Read one line of any length into Line, dropping every '\r' and the\r
+// terminating '\n'.\r
+// Return false on EOF, true if line successfully read. A final line without\r
+// a terminating '\n' is returned if it is not empty.\r
+bool ReadLineStdioFile(FILE *f, string &Line)\r
+    {\r
+    Line.clear();\r
+    for (;;)\r
+        {\r
+        const int c = fgetc(f);\r
+        if (c == -1)\r
+            {\r
+            if (feof(f))\r
+                return !Line.empty();\r
+            Die("ReadLineStdioFile, errno=%d", errno);\r
+            }\r
+        switch (c)\r
+            {\r
+        case '\r':\r
+            // Carriage returns are skipped wherever they occur.\r
+            continue;\r
+        case '\n':\r
+            return true;\r
+        default:\r
+            Line.push_back((char) c);\r
+            break;\r
+            }\r
+        }\r
+    }\r
+\r
+// Copies all of fFrom regardless of current\r
+// file position, appends to fTo.\r
+// The data is moved in chunks of up to 1 MB and fFrom's position is\r
+// restored afterwards.\r
+void AppendStdioFileToFile(FILE *fFrom, FILE *fTo)\r
+    {\r
+    off_t SavedFromPos = GetStdioFilePos(fFrom);\r
+    off_t FileSize = GetStdioFileSize(fFrom);\r
+    const off_t BUFF_SIZE = 1024*1024;\r
+    char *Buffer = myalloc(char, BUFF_SIZE);\r
+    SetStdioFilePos(fFrom, 0);\r
+    off_t BytesRemaining = FileSize;\r
+    while (BytesRemaining > 0)\r
+        {\r
+        off_t BytesToRead = BytesRemaining;\r
+        if (BytesToRead > BUFF_SIZE)\r
+            BytesToRead = BUFF_SIZE;\r
+        ReadStdioFile(fFrom, Buffer, (unsigned) BytesToRead);\r
+        WriteStdioFile(fTo, Buffer, (unsigned) BytesToRead);\r
+        BytesRemaining -= BytesToRead;\r
+        }\r
+    // Release the copy buffer; it used to be leaked on every call.\r
+    myfree(Buffer);\r
+    SetStdioFilePos(fFrom, SavedFromPos);\r
+    }\r
+\r
+// Rename FileNameFrom to FileNameTo; Die() with errno details on failure.\r
+void RenameStdioFile(const string &FileNameFrom, const string &FileNameTo)\r
+    {\r
+    const char *From = FileNameFrom.c_str();\r
+    const char *To = FileNameTo.c_str();\r
+    if (rename(From, To) == 0)\r
+        return;\r
+    Die("RenameStdioFile(%s,%s) failed, errno=%d %s",\r
+      From, To, errno, strerror(errno));\r
+    }\r
+\r
+void FlushStdioFile(FILE *f)\r
+ {\r
+ int Ok = fflush(f);\r
+ if (Ok != 0)\r
+ Die("fflush(%p)=%d,", f, Ok);\r
+ }\r
+\r
+// Close f (a null pointer is ignored) and release the I/O buffer recorded in\r
+// g_IOBuffers for its file descriptor. Die() when fclose fails.\r
+void CloseStdioFile(FILE *f)\r
+    {\r
+    if (f == 0)\r
+        return;\r
+    // The descriptor must be fetched while the stream is still open: after\r
+    // fclose() the FILE object is invalid and must not be passed to\r
+    // fileno(). The buffer itself has to outlive fclose(), which flushes\r
+    // through it, so it is released afterwards using the saved descriptor.\r
+    const int fd = fileno(f);\r
+    int Ok = fclose(f);\r
+    if (Ok != 0)\r
+        Die("fclose(%p)=%d", f, Ok);\r
+    if (fd >= 0 && fd < 256 && g_IOBuffers[fd] != 0)\r
+        {\r
+        myfree(g_IOBuffers[fd]);\r
+        g_IOBuffers[fd] = 0;\r
+        }\r
+    }\r
+\r
+// Current position of f as reported by ftello(); Die() on a negative result.\r
+off_t GetStdioFilePos(FILE *f)\r
+    {\r
+    const off_t Where = ftello(f);\r
+    if (Where < 0)\r
+        Die("ftello=%d", (int) Where);\r
+    return Where;\r
+    }\r
+\r
+// Size of f in bytes, found by seeking to the end. The original position is\r
+// restored before returning.\r
+off_t GetStdioFileSize(FILE *f)\r
+    {\r
+    const off_t Saved = GetStdioFilePos(f);\r
+\r
+    if (fseeko(f, 0, SEEK_END) < 0)\r
+        Die("fseek in GetFileSize");\r
+\r
+    const off_t Length = ftello(f);\r
+    if (Length < 0)\r
+        Die("ftello in GetFileSize");\r
+\r
+    SetStdioFilePos(f, Saved);\r
+    return Length;\r
+    }\r
+\r
+// Remove FileName; Die() with errno details on failure.\r
+void DeleteStdioFile(const string &FileName)\r
+    {\r
+    const char *Path = FileName.c_str();\r
+    if (remove(Path) == 0)\r
+        return;\r
+    Die("remove(%s) failed, errno=%d %s", Path, errno, strerror(errno));\r
+    }\r
+\r
+// printf-style formatting into Str from a va_list. Output is formatted into\r
+// a static buffer of MAX_FORMATTED_STRING_LENGTH bytes, so longer results\r
+// are truncated.\r
+void myvstrprintf(string &Str, const char *Format, va_list ArgList)\r
+    {\r
+    const unsigned LastIndex = MAX_FORMATTED_STRING_LENGTH - 1;\r
+    static char Text[MAX_FORMATTED_STRING_LENGTH];\r
+\r
+    vsnprintf(Text, LastIndex, Format, ArgList);\r
+    Text[LastIndex] = '\0';\r
+    Str = Text;\r
+    }\r
+\r
+// Variadic front end: gathers the trailing arguments into a va_list and\r
+// forwards them to the va_list overload.\r
+void myvstrprintf(string &Str, const char *Format, ...)\r
+	{\r
+	va_list Args;\r
+	va_start(Args, Format);\r
+	myvstrprintf(Str, Format, Args);\r
+	va_end(Args);\r
+	}\r
+\r
+// Log file handle written by Log(); 0 means no log file is open.\r
+FILE *g_fLog = 0;\r
+\r
+// Closes the current log file (if any) and opens FileName as the new log\r
+// through CreateStdioFile(). An empty name leaves logging switched off.\r
+void SetLogFileName(const string &FileName)\r
+	{\r
+	if (g_fLog != 0)\r
+		CloseStdioFile(g_fLog);\r
+	g_fLog = 0;\r
+	if (!FileName.empty())\r
+		g_fLog = CreateStdioFile(FileName);\r
+	}\r
+\r
+// printf-style write to the log file followed by a flush. Does nothing when\r
+// no log file is open or when a Log() call is already in progress.\r
+void Log(const char *Format, ...)\r
+	{\r
+	static bool Busy = false;\r
+	if (g_fLog == 0 || Busy)\r
+		return;\r
+\r
+	Busy = true;\r
+	va_list Args;\r
+	va_start(Args, Format);\r
+	vfprintf(g_fLog, Format, Args);\r
+	va_end(Args);\r
+	fflush(g_fLog);\r
+	Busy = false;\r
+	}\r
+\r
+// Fatal-error exit. Formats the message, echoes the command line, a\r
+// timestamp, the elapsed time and the message to stderr and/or the log,\r
+// then calls exit(1). A nested call (Die reached while already dying)\r
+// exits immediately.\r
+void Die(const char *Format, ...)\r
+	{\r
+	static bool Dying = false;\r
+	if (Dying)\r
+		exit(1);\r
+	Dying = true;\r
+	string Text;\r
+\r
+	// Switch the log to unbuffered mode.\r
+	if (g_fLog != 0)\r
+		setbuf(g_fLog, 0);\r
+	va_list Args;\r
+	va_start(Args, Format);\r
+	myvstrprintf(Text, Format, Args);\r
+	va_end(Args);\r
+\r
+	fprintf(stderr, "\n\n");\r
+	Log("\n");\r
+	time_t Now = time(0);\r
+	Log("%s", asctime(localtime(&Now)));\r
+\r
+	const unsigned ArgCount = (unsigned) g_Argv.size();\r
+	for (unsigned k = 0; k < ArgCount; k++)\r
+		{\r
+		const char *Fmt = (k == 0) ? "%s" : " %s";\r
+		const char *Arg = g_Argv[k].c_str();\r
+		fprintf(stderr, Fmt, Arg);\r
+		Log(Fmt, Arg);\r
+		}\r
+	fprintf(stderr, "\n");\r
+	Log("\n");\r
+\r
+	const unsigned ElapsedSeconds = unsigned(time(0) - g_StartTime);\r
+	Log("Elapsed time: %s\n", SecsToStr(ElapsedSeconds));\r
+\r
+	const char *Msg = Text.c_str();\r
+	fprintf(stderr, "\n---Fatal error---\n%s\n", Msg);\r
+	Log("\n---Fatal error---\n%s\n", Msg);\r
+\r
+#ifdef _MSC_VER\r
+	if (IsDebuggerPresent())\r
+		__debugbreak();\r
+	_CrtSetDbgFlag(0);\r
+#endif\r
+\r
+	exit(1);\r
+	}\r
+\r
+// Prints a printf-style warning to stderr and, when a log file other than\r
+// stdout is open, copies it to the log.\r
+void Warning(const char *Format, ...)\r
+	{\r
+	string Msg;\r
+\r
+	va_list ArgList;\r
+	va_start(ArgList, Format);\r
+	myvstrprintf(Msg, Format, ArgList);\r
+	va_end(ArgList);\r
+\r
+	const char *szStr = Msg.c_str();\r
+\r
+	fprintf(stderr, "\nWARNING: %s\n", szStr);\r
+	// With no log open, fflush(0) would flush every open output stream,\r
+	// so the log copy is skipped entirely in that case.\r
+	if (g_fLog != 0 && g_fLog != stdout)\r
+		{\r
+		Log("\nWARNING: %s\n", szStr);\r
+		fflush(g_fLog);\r
+		}\r
+	}\r
+\r
+#ifdef _MSC_VER\r
+// MSVC build: returns the working-set size of the current process and\r
+// raises g_PeakMemUseBytes when exceeded. Returns 1000000 if the query fails.\r
+double GetMemUseBytes()\r
+	{\r
+	PROCESS_MEMORY_COUNTERS Counters;\r
+	const BOOL Queried = GetProcessMemoryInfo(GetCurrentProcess(), &Counters,\r
+	  sizeof(Counters));\r
+	if (!Queried)\r
+		return 1000000;\r
+\r
+	const double WorkingSet = (double) Counters.WorkingSetSize;\r
+	if (WorkingSet > g_PeakMemUseBytes)\r
+		g_PeakMemUseBytes = WorkingSet;\r
+	return WorkingSet;\r
+	}\r
+#elif linux || __linux__\r
+// Linux build: reads /proc/<pid>/statm, takes the leading number as a page\r
+// count and multiplies it by the page size. Raises g_PeakMemUseBytes when\r
+// exceeded. Returns 1000000 when the file cannot be opened or read.\r
+double GetMemUseBytes()\r
+	{\r
+	static char StatmPath[64];\r
+	static int PageBytes = 1;\r
+	// First call only: cache the page size and the path of the statm file.\r
+	if (StatmPath[0] == 0)\r
+		{\r
+		PageBytes = sysconf(_SC_PAGESIZE);\r
+		pid_t Pid = getpid();\r
+		sprintf(StatmPath, "/proc/%d/statm", (int) Pid);\r
+		}\r
+\r
+	const int Handle = open(StatmPath, O_RDONLY);\r
+	if (Handle == -1)\r
+		return 1000000;\r
+\r
+	char Text[64];\r
+	const int BytesRead = read(Handle, Text, sizeof(Text) - 1);\r
+	close(Handle);\r
+	if (BytesRead <= 0)\r
+		return 1000000;\r
+\r
+	Text[BytesRead] = 0;\r
+	// atof stops at the first blank, so only the first field is used.\r
+	const double PageCount = atof(Text);\r
+	const double Bytes = PageCount*PageBytes;\r
+	if (Bytes > g_PeakMemUseBytes)\r
+		g_PeakMemUseBytes = Bytes;\r
+	return Bytes;\r
+	}\r
+#elif defined(__MACH__)\r
+#include <memory.h>\r
+#include <stdlib.h>\r
+#include <stdio.h>\r
+#include <unistd.h>\r
+#include <sys/types.h>\r
+#include <sys/sysctl.h>\r
+#include <sys/socket.h>\r
+#include <sys/gmon.h>\r
+#include <mach/vm_param.h>\r
+#include <netinet/in.h>\r
+#include <netinet/icmp6.h>\r
+#include <sys/vmmeter.h>\r
+#include <sys/proc.h>\r
+#include <mach/vm_statistics.h>\r
+#include <mach/task_info.h>\r
+#include <mach/task.h>\r
+#include <mach/mach_init.h>\r
+\r
+#define DEFAULT_MEM_USE 100000000.0\r
+\r
+// Mach build: queries TASK_BASIC_INFO for the current task and returns its\r
+// resident_size, raising g_PeakMemUseBytes when exceeded. Any task_info()\r
+// failure yields DEFAULT_MEM_USE.\r
+double GetMemUseBytes()\r
+	{\r
+	struct task_basic_info Info;\r
+	memset((void *) &Info, 0, sizeof(Info));\r
+	mach_msg_type_number_t InfoCount = TASK_BASIC_INFO_COUNT;\r
+	task_t Self = mach_task_self();\r
+	const kern_return_t Status = task_info(Self, TASK_BASIC_INFO,\r
+	  (task_info_t) &Info, &InfoCount);\r
+	if (Status != KERN_SUCCESS)\r
+		return DEFAULT_MEM_USE;\r
+\r
+	const double Resident = (double ) Info.resident_size;\r
+	if (Resident > g_PeakMemUseBytes)\r
+		g_PeakMemUseBytes = Resident;\r
+	return Resident;\r
+	}\r
+#else\r
+// Fallback for platforms without a memory probe: always reports zero.\r
+double GetMemUseBytes()\r
+	{\r
+	const double Unknown = 0;\r
+	return Unknown;\r
+	}\r
+#endif\r
+\r
+// Returns the largest value recorded so far in g_PeakMemUseBytes.\r
+double GetPeakMemUseBytes()\r
+	{\r
+	const double Peak = g_PeakMemUseBytes;\r
+	return Peak;\r
+	}\r
+\r
+// Formats a second count as "MM:SS", or "HH:MM:SS" when the hour part is\r
+// non-zero. Returns a pointer to a static buffer overwritten by each call.\r
+const char *SecsToHHMMSS(int Secs)\r
+	{\r
+	static char Str[16];\r
+	const int Hours = Secs/3600;\r
+	const int Minutes = (Secs - Hours*3600)/60;\r
+	const int Seconds = Secs%60;\r
+	if (Hours != 0)\r
+		sprintf(Str, "%02d:%02d:%02d", Hours, Minutes, Seconds);\r
+	else\r
+		sprintf(Str, "%02d:%02d", Minutes, Seconds);\r
+	return Str;\r
+	}\r
+\r
+// Formats a duration in seconds. Values of 10s or more go through\r
+// SecsToHHMMSS(); shorter ones are printed as seconds or milliseconds.\r
+// Returns a pointer to a static buffer overwritten by each call.\r
+const char *SecsToStr(double Secs)\r
+	{\r
+	if (Secs >= 10.0)\r
+		return SecsToHHMMSS((int) Secs);\r
+\r
+	static char Str[16];\r
+	const char *Fmt = "%.3fs";\r
+	double Shown = Secs;\r
+	if (Secs < 1e-6)\r
+		Fmt = "%.2gs";\r
+	else if (Secs < 1e-3)\r
+		{\r
+		Fmt = "%.2fms";\r
+		Shown = Secs*1e3;\r
+		}\r
+	sprintf(Str, Fmt, Shown);\r
+	return Str;\r
+	}\r
+\r
+// Formats a byte count with a kb/Mb/Gb suffix chosen by magnitude.\r
+// Returns a pointer to a static buffer overwritten by each call.\r
+const char *MemBytesToStr(double Bytes)\r
+	{\r
+	static char Str[32];\r
+\r
+	const char *Fmt;\r
+	double Scaled;\r
+	if (Bytes < 1e6)\r
+		{ Fmt = "%.1fkb"; Scaled = Bytes/1e3; }\r
+	else if (Bytes < 10e6)\r
+		{ Fmt = "%.1fMb"; Scaled = Bytes/1e6; }\r
+	else if (Bytes < 1e9)\r
+		{ Fmt = "%.0fMb"; Scaled = Bytes/1e6; }\r
+	else if (Bytes < 10e9)\r
+		{ Fmt = "%.1fGb"; Scaled = Bytes/1e9; }\r
+	else if (Bytes < 100e9)\r
+		{ Fmt = "%.0fGb"; Scaled = Bytes/1e9; }\r
+	else\r
+		{ Fmt = "%.3gb"; Scaled = Bytes; }\r
+	sprintf(Str, Fmt, Scaled);\r
+	return Str;\r
+	}\r
+\r
+// Formats an unsigned count compactly: exact below 10000, otherwise scaled\r
+// with a k/M/G suffix. Returns a pointer to a static buffer overwritten by\r
+// each call.\r
+const char *IntToStr(unsigned i)\r
+	{\r
+	static char Str[32];\r
+\r
+	if (i < 10000)\r
+		{\r
+		sprintf(Str, "%u", i);\r
+		return Str;\r
+		}\r
+\r
+	const double d = (double) i;\r
+	const char *Fmt;\r
+	double Scaled;\r
+	if (d < 1e6)\r
+		{ Fmt = "%.1fk"; Scaled = d/1e3; }\r
+	else if (d < 10e6)\r
+		{ Fmt = "%.1fM"; Scaled = d/1e6; }\r
+	else if (d < 1e9)\r
+		{ Fmt = "%.0fM"; Scaled = d/1e6; }\r
+	else if (d < 10e9)\r
+		{ Fmt = "%.1fG"; Scaled = d/1e9; }\r
+	else if (d < 100e9)\r
+		{ Fmt = "%.0fG"; Scaled = d/1e9; }\r
+	else\r
+		{ Fmt = "%.3g"; Scaled = d; }\r
+	sprintf(Str, Fmt, Scaled);\r
+	return Str;\r
+	}\r
+\r
+// Formats a floating-point value compactly, choosing precision and a k/M/G\r
+// suffix from the magnitude |d|. The sign of d is kept in every branch\r
+// (the two small-magnitude branches used to print |d| and so dropped it).\r
+// Returns a pointer to a static buffer overwritten by each call.\r
+const char *FloatToStr(double d)\r
+	{\r
+	static char Str[32];\r
+\r
+	double a = fabs(d);\r
+	if (a < 0.01)\r
+		sprintf(Str, "%.3g", d);\r
+	else if (a >= 0.01 && a < 1)\r
+		sprintf(Str, "%.3f", d);\r
+	else if (a <= 10 && a >= 1)\r
+		{\r
+		// Drop the decimal when the fractional part is negligible.\r
+		double intpart;\r
+		if (modf(a, &intpart) < 0.05)\r
+			sprintf(Str, "%.0f", d);\r
+		else\r
+			sprintf(Str, "%.1f", d);\r
+		}\r
+	else if (a > 10 && a < 10000)\r
+		sprintf(Str, "%.0f", d);\r
+	else if (a < 1e6)\r
+		sprintf(Str, "%.1fk", d/1e3);\r
+	else if (a < 10e6)\r
+		sprintf(Str, "%.1fM", d/1e6);\r
+	else if (a < 1e9)\r
+		sprintf(Str, "%.0fM", d/1e6);\r
+	else if (a < 10e9)\r
+		sprintf(Str, "%.1fG", d/1e9);\r
+	else if (a < 100e9)\r
+		sprintf(Str, "%.0fG", d/1e9);\r
+	else\r
+		sprintf(Str, "%.3g", d);\r
+	return Str;\r
+	}\r
+\r
+// Values of the built-in command-line options (registered in MyCmdLine).\r
+bool opt_quiet = false;\r
+bool opt_version = false;\r
+bool opt_logopts = false;\r
+bool opt_compilerinfo = false;\r
+bool opt_help = false;\r
+string opt_log = "";\r
+\r
+// Companion flags: the parser sets optset_X to true when option X appears\r
+// on the command line.\r
+bool optset_quiet = false;\r
+bool optset_version = false;\r
+bool optset_logopts = false;\r
+bool optset_compilerinfo = false;\r
+bool optset_help = false;\r
+bool optset_log = false;\r
+\r
+// State shared by Progress() / ProgressStep().\r
+static string g_CurrentProgressLine;\r
+static string g_ProgressDesc;\r
+static unsigned g_ProgressIndex;\r
+static unsigned g_ProgressCount;\r
+\r
+// Character counts of the progress line being written and of the previous\r
+// one; Progress() uses them to blank out leftovers when a line is rewritten.\r
+static unsigned g_CurrProgressLineLength;\r
+static unsigned g_LastProgressLineLength;\r
+// Throttling state used by ProgressStep().\r
+static unsigned g_CountsInterval;\r
+static unsigned g_StepCalls;\r
+static time_t g_TimeLastOutputStep;\r
+\r
+// Builds the prefix of a progress line in s: elapsed time, then (when the\r
+// reported memory use is positive) a 5-character memory field, then a blank.\r
+// Returns s.\r
+static string &GetProgressPrefixStr(string &s)\r
+	{\r
+	const double MemBytes = GetMemUseBytes();\r
+	const unsigned Elapsed = GetElapsedSecs();\r
+	s = string(SecsToHHMMSS(Elapsed));\r
+	if (MemBytes > 0)\r
+		{\r
+		char Field[32];\r
+		sprintf(Field, "%5.5s", MemBytesToStr(MemBytes));\r
+		s.push_back(' ');\r
+		s += string(Field);\r
+		}\r
+	s.push_back(' ');\r
+	return s;\r
+	}\r
+\r
+// Formats one message and sends it both to the log and to Progress().\r
+void ProgressLog(const char *Format, ...)\r
+	{\r
+	string Text;\r
+	va_list Args;\r
+	va_start(Args, Format);\r
+	myvstrprintf(Text, Format, Args);\r
+	va_end(Args);\r
+\r
+	const char *Msg = Text.c_str();\r
+	Log("%s", Msg);\r
+	Progress("%s", Msg);\r
+	}\r
+\r
+// Writes a printf-style progress message to stderr unless opt_quiet is set.\r
+// Each new line starts with the prefix from GetProgressPrefixStr(). On a\r
+// carriage return or newline the line is first padded with blanks up to the\r
+// length of the previous line, so a rewritten line leaves no stale text.\r
+void Progress(const char *Format, ...)\r
+	{\r
+	if (opt_quiet)\r
+		return;\r
+\r
+	string Text;\r
+	va_list Args;\r
+	va_start(Args, Format);\r
+	myvstrprintf(Text, Format, Args);\r
+	va_end(Args);\r
+\r
+#if 0\r
+	Log("Progress(");\r
+	for (unsigned i = 0; i < Text.size(); ++i)\r
+		{\r
+		char c = Text[i];\r
+		if (c == '\r')\r
+			Log("\\r");\r
+		else if (c == '\n')\r
+			Log("\\n");\r
+		else\r
+			Log("%c", c);\r
+		}\r
+	Log(")\n");\r
+#endif //0\r
+\r
+	for (string::const_iterator p = Text.begin(); p != Text.end(); ++p)\r
+		{\r
+		// At the start of a line: emit the prefix first.\r
+		if (g_CurrProgressLineLength == 0)\r
+			{\r
+			string Prefix;\r
+			GetProgressPrefixStr(Prefix);\r
+			for (string::const_iterator q = Prefix.begin(); q != Prefix.end(); ++q)\r
+				{\r
+				fputc(*q, stderr);\r
+				++g_CurrProgressLineLength;\r
+				}\r
+			}\r
+\r
+		const char c = *p;\r
+		const bool IsNewline = (c == '\n');\r
+		const bool IsReturn = (c == '\r');\r
+		if (!IsNewline && !IsReturn)\r
+			{\r
+			fputc(c, stderr);\r
+			++g_CurrProgressLineLength;\r
+			continue;\r
+			}\r
+\r
+		// End of line: blank out whatever the previous line left behind.\r
+		unsigned Column = g_CurrProgressLineLength;\r
+		while (Column < g_LastProgressLineLength)\r
+			{\r
+			fputc(' ', stderr);\r
+			++Column;\r
+			}\r
+		g_LastProgressLineLength = IsNewline ? 0 : g_CurrProgressLineLength;\r
+		g_CurrProgressLineLength = 0;\r
+		fputc(c, stderr);\r
+		}\r
+	}\r
+\r
+// Writes the end-of-run summary (finish time, elapsed time, peak memory)\r
+// to the log.\r
+void ProgressExit()\r
+	{\r
+	time_t Now = time(0);\r
+	const char *Stamp = asctime(localtime(&Now));\r
+	const unsigned Elapsed = GetElapsedSecs();\r
+\r
+	Log("\n");\r
+	Log("Finished %s", Stamp); // Stamp already ends with a newline\r
+	Log("Elapsed time %s\n", SecsToHHMMSS((int) Elapsed));\r
+	Log("Max memory %s\n", MemBytesToStr(g_PeakMemUseBytes));\r
+#if WIN32 && DEBUG\r
+// Skip exit(), which can be very slow in DEBUG build\r
+// VERY DANGEROUS practice, because it skips global destructors.\r
+// But if you know the rules, you can break 'em, right?\r
+	ExitProcess(0);\r
+#endif\r
+	}\r
+\r
+// Returns x as a percentage of y, formatted "%5.1f%%". With y == 0 the\r
+// result is "100%" when x is also 0 and "inf%" otherwise. A formatted\r
+// result lives in a static buffer overwritten by each call.\r
+const char *PctStr(double x, double y)\r
+	{\r
+	static char Str[16];\r
+	if (y != 0)\r
+		{\r
+		const double Percent = x*100.0/y;\r
+		sprintf(Str, "%5.1f%%", Percent);\r
+		return Str;\r
+		}\r
+	return (x == 0) ? "100%" : "inf%";\r
+	}\r
+\r
+// Builds the "<level> <description>" part of a progress line in s.\r
+// With a known count the level is a percentage; with Count == UINT_MAX it\r
+// is the raw index, or "100%" when the index is UINT_MAX too. Returns s.\r
+string &GetProgressLevelStr(string &s)\r
+	{\r
+	const unsigned Index = g_ProgressIndex;\r
+	const unsigned Count = g_ProgressCount;\r
+	if (Count != UINT_MAX)\r
+		s = string(PctStr(Index+1, Count));\r
+	else if (Index == UINT_MAX)\r
+		s = "100%";\r
+	else\r
+		{\r
+		char Digits[16];\r
+		sprintf(Digits, "%u", Index);\r
+		s = Digits;\r
+		}\r
+	s += string(" ") + g_ProgressDesc;\r
+	return s;\r
+	}\r
+\r
+// Reports step i of N of a long-running loop on stderr (no-op when\r
+// opt_quiet is set).\r
+//   i      - zero-based step index; 0 starts a new progress task, UINT_MAX\r
+//            marks the final step of a task.\r
+//   N      - total step count; i >= N is fatal unless i is UINT_MAX.\r
+//   Format - printf-style description shown after the level.\r
+// Intermediate steps are throttled: the clock is only consulted every\r
+// g_CountsInterval calls, and nothing is printed twice within one second.\r
+// The last step is always printed and ends the line.\r
+void ProgressStep(unsigned i, unsigned N, const char *Format, ...)\r
+	{\r
+	if (opt_quiet)\r
+		return;\r
+\r
+	if (i == 0)\r
+		{\r
+		string Str;\r
+		va_list ArgList;\r
+		va_start(ArgList, Format);\r
+		myvstrprintf(Str, Format, ArgList);\r
+		va_end(ArgList);\r
+		g_ProgressDesc = Str;\r
+		g_ProgressIndex = 0;\r
+		g_ProgressCount = N;\r
+		g_CountsInterval = 1;\r
+		g_StepCalls = 0;\r
+		g_TimeLastOutputStep = 0;\r
+		if (g_CurrProgressLineLength > 0)\r
+			Progress("\n");\r
+		}\r
+\r
+	if (i >= N && i != UINT_MAX)\r
+		Die("ProgressStep(%u,%u)", i, N);\r
+	bool IsLastStep = (i == UINT_MAX || i + 1 == N);\r
+	if (!IsLastStep)\r
+		{\r
+		++g_StepCalls;\r
+		if (g_StepCalls%g_CountsInterval != 0)\r
+			return;\r
+\r
+		time_t Now = time(0);\r
+		if (Now == g_TimeLastOutputStep)\r
+			{\r
+			// Still within the same second: check the clock less often.\r
+			if (g_CountsInterval < 128)\r
+				{\r
+				// (1*3)/2 == 1 in integer arithmetic, so force growth;\r
+				// otherwise the interval could never leave 1.\r
+				unsigned Grown = (g_CountsInterval*3)/2;\r
+				if (Grown == g_CountsInterval)\r
+					++Grown;\r
+				g_CountsInterval = Grown;\r
+				}\r
+			else\r
+				g_CountsInterval += 64;\r
+			return;\r
+			}\r
+		else\r
+			{\r
+			// More than a second between outputs: check more often.\r
+			time_t Secs = Now - g_TimeLastOutputStep;\r
+			if (Secs > 1)\r
+				g_CountsInterval = unsigned(g_CountsInterval/(Secs*8));\r
+			}\r
+\r
+		if (g_CountsInterval < 1)\r
+			g_CountsInterval = 1;\r
+\r
+		g_TimeLastOutputStep = Now;\r
+		}\r
+\r
+	g_ProgressIndex = i;\r
+\r
+	if (i > 0)\r
+		{\r
+		va_list ArgList;\r
+		va_start(ArgList, Format);\r
+		myvstrprintf(g_ProgressDesc, Format, ArgList);\r
+		// Every va_start must be matched by va_end.\r
+		va_end(ArgList);\r
+		}\r
+\r
+	string LevelStr;\r
+	GetProgressLevelStr(LevelStr);\r
+	Progress(" %s\r", LevelStr.c_str());\r
+\r
+	if (IsLastStep)\r
+		{\r
+		g_CountsInterval = 1;\r
+		fputc('\n', stderr);\r
+		}\r
+	}\r
+\r
+// Kinds of command-line option handled by the option parser in this file.\r
+enum OptType\r
+ {\r
+ // Boolean; set to true when the option is present.\r
+ OT_Flag,\r
+ // Boolean; --name sets true, --noname sets false.\r
+ OT_Tog,\r
+ // Signed integer value.\r
+ OT_Int,\r
+ // Unsigned integer value.\r
+ OT_Uns,\r
+ // String value.\r
+ OT_Str,\r
+ // Floating-point (double) value.\r
+ OT_Float,\r
+ // Named value mapped to an integer through OptInfo::EnumValues.\r
+ OT_Enum\r
+ };\r
+\r
+// Description of one command-line option: where its value is stored, its\r
+// type, limits, default and help text. Instances are kept in a set ordered\r
+// by LongName.\r
+struct OptInfo\r
+	{\r
+	void *Value;		// storage of the option value; real type depends on Type\r
+	bool *OptSet;		// flag set to true once the option is given\r
+	string LongName;	// name without the leading dashes\r
+	OptType Type;\r
+	int iMin;\r
+	int iMax;\r
+	unsigned uMin;\r
+	unsigned uMax;\r
+	double dMin;\r
+	double dMax;\r
+	map<string, unsigned> EnumValues;	// name -> value, OT_Enum only\r
+\r
+	bool bDefault;\r
+	int iDefault;\r
+	unsigned uDefault;\r
+	double dDefault;\r
+	string strDefault;\r
+\r
+	string Help;\r
+\r
+	// Gives every member a defined value. The Define*Opt helpers fill in\r
+	// only the fields relevant to one type, and the object is then copied\r
+	// into a set; copying indeterminate values must be avoided.\r
+	OptInfo() :\r
+	  Value(0), OptSet(0), Type(OT_Flag),\r
+	  iMin(0), iMax(0), uMin(0), uMax(0), dMin(0), dMax(0),\r
+	  bDefault(false), iDefault(0), uDefault(0), dDefault(0)\r
+		{\r
+		}\r
+\r
+	// Orders options by name so the set can be listed alphabetically.\r
+	bool operator<(const OptInfo &rhs) const\r
+		{\r
+		return LongName < rhs.LongName;\r
+		}\r
+	};\r
+\r
+// All options registered through AddOpt(), ordered by OptInfo::operator<.\r
+static set<OptInfo> g_Opts;\r
+\r
+// Prints the usage text followed by one entry per registered option (name,\r
+// value placeholder and help text) to stdout, then exits with status 0.\r
+void Help()\r
+	{\r
+	printf("\n");\r
+\r
+	void Usage();\r
+	Usage();\r
+\r
+	set<OptInfo>::const_iterator it = g_Opts.begin();\r
+	for (; it != g_Opts.end(); ++it)\r
+		{\r
+		printf("\n");\r
+		string Name = it->LongName.c_str();\r
+		if (it->Type == OT_Tog)\r
+			Name = string("[no]") + Name;\r
+		printf(" --%s ", Name.c_str());\r
+\r
+		// Placeholder describing the expected value; flags take none.\r
+		const char *Placeholder = "";\r
+		switch (it->Type)\r
+			{\r
+		case OT_Flag:\r
+		case OT_Tog:\r
+			break;\r
+		case OT_Int:\r
+			Placeholder = "<int>";\r
+			break;\r
+		case OT_Uns:\r
+			Placeholder = "<uint>";\r
+			break;\r
+		case OT_Str:\r
+			Placeholder = "<str>";\r
+			break;\r
+		case OT_Float:\r
+			Placeholder = "<float>";\r
+			break;\r
+		case OT_Enum:\r
+			Placeholder = "<enum>";\r
+			break;\r
+		default:\r
+			Placeholder = "??type";\r
+			break;\r
+			}\r
+		printf("%s", Placeholder);\r
+\r
+		printf(" ");\r
+		const string &Text = it->Help;\r
+		const size_t TextLength = Text.size();\r
+		for (size_t k = 0; k < TextLength; ++k)\r
+			{\r
+			const char c = Text[k];\r
+			if (c != '\n')\r
+				printf("%c", c);\r
+			else\r
+				printf("\n ");\r
+			}\r
+		printf("\n");\r
+		}\r
+	printf("\n");\r
+	exit(0);\r
+	}\r
+\r
+// Reports an invalid command line on stderr (printf-style detail message\r
+// plus a hint to use --help) and exits with status 1.\r
+void CmdLineErr(const char *Format, ...)\r
+	{\r
+	string Detail;\r
+	va_list Args;\r
+	va_start(Args, Format);\r
+	myvstrprintf(Detail, Format, Args);\r
+	va_end(Args);\r
+\r
+	FILE *Out = stderr;\r
+	fprintf(Out, "\n");\r
+	fprintf(Out, "Invalid command line\n");\r
+	fprintf(Out, "%s\n", Detail.c_str());\r
+	fprintf(Out, "For list of command-line options use --help.\n");\r
+	fprintf(Out, "\n");\r
+	exit(1);\r
+	}\r
+\r
+// Looks up an option by name. A toggle option also matches its "no"-prefixed\r
+// form. Returns g_Opts.end() when nothing matches; with ErrIfNotFound set,\r
+// CmdLineErr() is called first.\r
+static set<OptInfo>::iterator GetOptInfo(const string &LongName,\r
+  bool ErrIfNotFound)\r
+	{\r
+	set<OptInfo>::iterator it = g_Opts.begin();\r
+	while (it != g_Opts.end())\r
+		{\r
+		if (it->LongName == LongName)\r
+			return it;\r
+		const bool IsNegatedToggle =\r
+		  (it->Type == OT_Tog && "no" + it->LongName == LongName);\r
+		if (IsNegatedToggle)\r
+			return it;\r
+		++it;\r
+		}\r
+	if (ErrIfNotFound)\r
+		CmdLineErr("Option --%s is invalid", LongName.c_str());\r
+	return g_Opts.end();\r
+	}\r
+\r
+// Registers Opt in g_Opts; defining the same option name twice is fatal.\r
+static void AddOpt(const OptInfo &Opt)\r
+	{\r
+	const bool AlreadyDefined = (GetOptInfo(Opt.LongName, false) != g_Opts.end());\r
+	if (AlreadyDefined)\r
+		Die("Option --%s defined twice", Opt.LongName.c_str());\r
+	g_Opts.insert(Opt);\r
+	}\r
+\r
+#ifdef _MSC_VER\r
+#pragma warning(disable: 4505) // unreferenced local function\r
+#endif\r
+\r
+// Registers a flag option: *Value (a bool) starts as false.\r
+static void DefineFlagOpt(const string &LongName, const string &Help,\r
+  void *Value, bool *OptSet)\r
+	{\r
+	bool *Target = (bool *) Value;\r
+	*Target = false;\r
+\r
+	OptInfo Info;\r
+	Info.Type = OT_Flag;\r
+	Info.LongName = LongName;\r
+	Info.Help = Help;\r
+	Info.Value = Value;\r
+	Info.OptSet = OptSet;\r
+	Info.bDefault = false;\r
+	AddOpt(Info);\r
+	}\r
+\r
+// Registers a toggle option: *Value (a bool) starts as Default.\r
+static void DefineTogOpt(const string &LongName, bool Default, const string &Help,\r
+  void *Value, bool *OptSet)\r
+	{\r
+	bool *Target = (bool *) Value;\r
+	*Target = Default;\r
+\r
+	OptInfo Info;\r
+	Info.Type = OT_Tog;\r
+	Info.LongName = LongName;\r
+	Info.Help = Help;\r
+	Info.Value = Value;\r
+	Info.OptSet = OptSet;\r
+	Info.bDefault = Default;\r
+	AddOpt(Info);\r
+	}\r
+\r
+// Registers an int option with its default and [Min, Max] limits;\r
+// *Value (an int) starts as Default.\r
+static void DefineIntOpt(const string &LongName, int Default, int Min, int Max,\r
+  const string &Help, void *Value, bool *OptSet)\r
+	{\r
+	int *Target = (int *) Value;\r
+	*Target = Default;\r
+\r
+	OptInfo Info;\r
+	Info.Type = OT_Int;\r
+	Info.LongName = LongName;\r
+	Info.Help = Help;\r
+	Info.Value = Value;\r
+	Info.OptSet = OptSet;\r
+	Info.iDefault = Default;\r
+	Info.iMin = Min;\r
+	Info.iMax = Max;\r
+	AddOpt(Info);\r
+	}\r
+\r
+// Registers an unsigned option with its default and [Min, Max] limits;\r
+// *Value (an unsigned) starts as Default.\r
+static void DefineUnsOpt(const string &LongName, unsigned Default, unsigned Min,\r
+  unsigned Max, const string &Help, void *Value, bool *OptSet)\r
+	{\r
+	unsigned *Target = (unsigned *) Value;\r
+	*Target = Default;\r
+\r
+	OptInfo Info;\r
+	Info.Type = OT_Uns;\r
+	Info.LongName = LongName;\r
+	Info.Help = Help;\r
+	Info.Value = Value;\r
+	Info.OptSet = OptSet;\r
+	Info.uDefault = Default;\r
+	Info.uMin = Min;\r
+	Info.uMax = Max;\r
+	AddOpt(Info);\r
+	}\r
+\r
+// Registers a floating-point option with its default and [Min, Max]\r
+// limits; *Value (a double) starts as Default.\r
+static void DefineFloatOpt(const string &LongName, double Default, double Min,\r
+  double Max, const string &Help, void *Value, bool *OptSet)\r
+	{\r
+	double *Target = (double *) Value;\r
+	*Target = Default;\r
+\r
+	OptInfo Info;\r
+	Info.Type = OT_Float;\r
+	Info.LongName = LongName;\r
+	Info.Help = Help;\r
+	Info.Value = Value;\r
+	Info.OptSet = OptSet;\r
+	Info.dDefault = Default;\r
+	Info.dMin = Min;\r
+	Info.dMax = Max;\r
+	AddOpt(Info);\r
+	}\r
+\r
+// Registers a string option; *Value (a string) starts as Default, with a\r
+// null Default treated as the empty string.\r
+static void DefineStrOpt(const string &LongName, const char *Default,\r
+  const string &Help, void *Value, bool *OptSet)\r
+	{\r
+	const string Initial = (Default == 0 ? "" : string(Default));\r
+	string *Target = (string *) Value;\r
+	*Target = Initial;\r
+\r
+	OptInfo Info;\r
+	Info.Type = OT_Str;\r
+	Info.LongName = LongName;\r
+	Info.Help = Help;\r
+	Info.Value = Value;\r
+	Info.OptSet = OptSet;\r
+	Info.strDefault = Initial;\r
+	AddOpt(Info);\r
+	}\r
+\r
+// Parses an enum specification of the form "name=value|name=value|..."\r
+// into EnumValues (cleared first). White space is ignored. A duplicate\r
+// name, an empty name or an empty value is fatal.\r
+static void ParseEnumValues(const string &Values, map<string, unsigned> &EnumValues)\r
+	{\r
+	EnumValues.clear();\r
+\r
+	string Name;\r
+	string Value;\r
+	bool Eq = false;\r
+	for (string::const_iterator p = Values.begin(); ; ++p)\r
+		{\r
+		// The end of the string acts as a final '|' terminator.\r
+		char c = (p == Values.end() ? '|' : *p);\r
+		// isspace() requires a value representable as unsigned char;\r
+		// a plain char may be negative.\r
+		if (isspace((unsigned char) c))\r
+			;\r
+		else if (c == '|')\r
+			{\r
+			if (EnumValues.find(Name) != EnumValues.end())\r
+				Die("Invalid enum values, '%s' defined twice: '%s'",\r
+				  Name.c_str(), Values.c_str());\r
+			if (Name.empty() || Value.empty())\r
+				Die("Invalid enum values, empty name or value: '%s'",\r
+				  Values.c_str());\r
+\r
+			EnumValues[Name] = atoi(Value.c_str());\r
+			Name.clear();\r
+			Value.clear();\r
+			Eq = false;\r
+			}\r
+		else if (c == '=')\r
+			Eq = true;\r
+		else if (Eq)\r
+			Value.push_back(c);\r
+		else\r
+			Name.push_back(c);\r
+		if (p == Values.end())\r
+			return;\r
+		}\r
+	}\r
+\r
+// Registers an enum option: *Value (an int) starts as Default and Values\r
+// lists the accepted names ("name=value|...").\r
+// ShortName is accepted for interface compatibility but not used.\r
+static void DefineEnumOpt(const string &LongName, const string &ShortName,\r
+  int Default, const string &Values, const string &Help, void *Value)\r
+	{\r
+	(void) ShortName;\r
+	*(int *) Value = Default;\r
+\r
+	// This function receives no "option was set" flag, yet the parser\r
+	// writes through OptInfo::OptSet for every option it sees. Point it at\r
+	// a private dummy so that write cannot go through a wild pointer.\r
+	static bool UnusedOptSet = false;\r
+\r
+	OptInfo Opt;\r
+	Opt.Value = Value;\r
+	Opt.OptSet = &UnusedOptSet;\r
+	Opt.LongName = LongName;\r
+	Opt.iDefault = Default;\r
+	Opt.Help = Help;\r
+	Opt.Type = OT_Enum;\r
+	ParseEnumValues(Values, Opt.EnumValues);\r
+	AddOpt(Opt);\r
+	}\r
+// First expansion of myopts.h: each *_OPT(LongName, ...) entry becomes the\r
+// definition of the value variable opt_<LongName> and of its companion\r
+// flag optset_<LongName>.\r
+#undef FLAG_OPT\r
+#undef TOG_OPT\r
+#undef INT_OPT\r
+#undef UNS_OPT\r
+#undef FLT_OPT\r
+#undef STR_OPT\r
+#undef ENUM_OPT\r
+#define FLAG_OPT(LongName) bool opt_##LongName; bool optset_##LongName;\r
+#define TOG_OPT(LongName, Default) bool opt_##LongName; bool optset_##LongName;\r
+#define INT_OPT(LongName, Default, Min, Max) int opt_##LongName; bool optset_##LongName;\r
+#define UNS_OPT(LongName, Default, Min, Max) unsigned opt_##LongName; bool optset_##LongName;\r
+#define FLT_OPT(LongName, Default, Min, Max) double opt_##LongName; bool optset_##LongName;\r
+#define STR_OPT(LongName, Default) string opt_##LongName; bool optset_##LongName;\r
+#define ENUM_OPT(LongName, Values, Default) int opt_##LongName; bool optset_##LongName;\r
+#include "myopts.h"\r
+\r
+// Translates the textual value of an enum option into its integer value.\r
+// An unknown name is a command-line error that lists the valid names.\r
+static int EnumStrToInt(const OptInfo &Opt, const string &Value)\r
+	{\r
+	const map<string, unsigned> &Table = Opt.EnumValues;\r
+	const map<string, unsigned>::const_iterator Hit = Table.find(Value);\r
+	if (Hit != Table.end())\r
+		return Hit->second;\r
+\r
+	string Valid;\r
+	for (map<string, unsigned>::const_iterator it = Table.begin(); it != Table.end(); ++it)\r
+		Valid += " " + it->first;\r
+	CmdLineErr("--%s %s not recognized, valid are: %s",\r
+	  Opt.LongName.c_str(), Value.c_str(), Valid.c_str());\r
+	ureturn(-1);\r
+	}\r
+\r
+// Stores the textual Value into the variable behind a value-carrying option\r
+// (int, unsigned, float, string or enum) and marks the option as set.\r
+// Any other option type trips an assertion.\r
+static void SetOpt(OptInfo &Opt, const string &Value)\r
+	{\r
+	*Opt.OptSet = true;\r
+	const char *Text = Value.c_str();\r
+	const OptType Kind = Opt.Type;\r
+	if (Kind == OT_Int)\r
+		*(int *) Opt.Value = atoi(Text);\r
+	else if (Kind == OT_Uns)\r
+		{\r
+		unsigned Parsed = 0;\r
+		if (sscanf(Text, "%u", &Parsed) != 1)\r
+			CmdLineErr("Invalid value '%s' for --%s",\r
+			  Text, Opt.LongName.c_str());\r
+		*(unsigned *) Opt.Value = Parsed;\r
+		}\r
+	else if (Kind == OT_Float)\r
+		*(double *) Opt.Value = atof(Text);\r
+	else if (Kind == OT_Str)\r
+		*(string *) Opt.Value = Value;\r
+	else if (Kind == OT_Enum)\r
+		*(int *) Opt.Value = EnumStrToInt(Opt, Value);\r
+	else\r
+		asserta(false);\r
+	}\r
+\r
+// Writes one "name = value" line per registered option to the log.\r
+// A float option equal to FLT_MAX is shown as "*".\r
+void LogOpts()\r
+	{\r
+	set<OptInfo>::const_iterator it = g_Opts.begin();\r
+	for (; it != g_Opts.end(); ++it)\r
+		{\r
+		const OptType Kind = it->Type;\r
+		void *Slot = it->Value;\r
+		Log("%s = ", it->LongName.c_str());\r
+		if (Kind == OT_Flag)\r
+			Log("%s", (*(bool *) Slot) ? "yes" : "no");\r
+		else if (Kind == OT_Tog)\r
+			Log("%s", (*(bool *) Slot) ? "on" : "off");\r
+		else if (Kind == OT_Int)\r
+			Log("%d", *(int *) Slot);\r
+		else if (Kind == OT_Uns)\r
+			Log("%u", *(unsigned *) Slot);\r
+		else if (Kind == OT_Float)\r
+			{\r
+			const double Number = *(double *) Slot;\r
+			if (Number == FLT_MAX)\r
+				Log("*");\r
+			else\r
+				Log("%g", Number);\r
+			}\r
+		else if (Kind == OT_Str)\r
+			Log("%s", (*(string *) Slot).c_str());\r
+		else if (Kind == OT_Enum)\r
+			Log("%d", *(int *) Slot);\r
+		else\r
+			asserta(false);\r
+		Log("\n");\r
+		}\r
+	}\r
+\r
+// Prints _FILE_OFFSET_BITS (when defined) and the sizes of a few basic\r
+// types to stdout, then exits with status 0.\r
+static void CompilerInfo()\r
+	{\r
+#ifndef _FILE_OFFSET_BITS\r
+	printf("_FILE_OFFSET_BITS not defined\n");\r
+#else\r
+	printf("_FILE_OFFSET_BITS=%d\n", _FILE_OFFSET_BITS);\r
+#endif\r
+\r
+// Prints "sizeof(<type>) = <n>" for one type.\r
+#define PRINT_SIZE(t) printf("sizeof(" #t ") = %d\n", (int) sizeof(t));\r
+	PRINT_SIZE(int)\r
+	PRINT_SIZE(long)\r
+	PRINT_SIZE(float)\r
+	PRINT_SIZE(double)\r
+	PRINT_SIZE(void *)\r
+	PRINT_SIZE(off_t)\r
+#undef PRINT_SIZE\r
+	exit(0);\r
+	}\r
+\r
+// Splits Str into Fields (cleared first).\r
+//   Sep == 0 : split on runs of white space; empty fields are dropped.\r
+//   Sep != 0 : split on every occurrence of Sep; empty fields are kept,\r
+//              except an empty field after the last separator.\r
+void Split(const string &Str, vector<string> &Fields, char Sep)\r
+	{\r
+	Fields.clear();\r
+	const unsigned Length = (unsigned) Str.size();\r
+	string s;\r
+	for (unsigned i = 0; i < Length; ++i)\r
+		{\r
+		char c = Str[i];\r
+		// isspace() requires a value representable as unsigned char;\r
+		// a plain char may be negative for bytes above 0x7F.\r
+		if ((Sep == 0 && isspace((unsigned char) c)) || c == Sep)\r
+			{\r
+			if (!s.empty() || Sep != 0)\r
+				Fields.push_back(s);\r
+			s.clear();\r
+			}\r
+		else\r
+			s.push_back(c);\r
+		}\r
+	if (!s.empty())\r
+		Fields.push_back(s);\r
+	}\r
+\r
+// Reads arguments from a text file into Args (cleared first). Everything\r
+// from '#' to the end of a line is ignored; the rest of each line is split\r
+// with Split() and the fields are appended in order.\r
+static void GetArgsFromFile(const string &FileName, vector<string> &Args)\r
+	{\r
+	Args.clear();\r
+\r
+	FILE *f = OpenStdioFile(FileName);\r
+	for (string Line; ReadLineStdioFile(f, Line); )\r
+		{\r
+		const size_t Hash = Line.find('#');\r
+		if (Hash != string::npos)\r
+			Line = Line.substr(0, Hash);\r
+		vector<string> Tokens;\r
+		Split(Line, Tokens);\r
+		for (vector<string>::const_iterator t = Tokens.begin(); t != Tokens.end(); ++t)\r
+			Args.push_back(*t);\r
+		}\r
+	CloseStdioFile(f);\r
+	}\r
+\r
+// Parses the command line.\r
+//   argc, argv - as received by main(); argv[0] is the program name.\r
+// Registers the built-in options and those listed in myopts.h, copies the\r
+// arguments into g_Argv, then processes them:\r
+//   file: <name>       - appends the arguments read from <name>;\r
+//   -opt / --opt       - sets a flag or toggle option;\r
+//   -opt <value>       - sets a value-carrying option.\r
+// Anything else is a command-line error. At the outermost call the help,\r
+// compilerinfo, log and logopts options are then acted upon.\r
+void MyCmdLine(int argc, char **argv)\r
+	{\r
+	static unsigned RecurseDepth = 0;\r
+	++RecurseDepth;\r
+\r
+	DefineFlagOpt("compilerinfo", "Write info about compiler types and #defines to stdout.",\r
+	  (void *) &opt_compilerinfo, &optset_compilerinfo);\r
+	DefineFlagOpt("quiet", "Turn off progress messages.", (void *) &opt_quiet, &optset_quiet);\r
+	DefineFlagOpt("version", "Show version and exit.", (void *) &opt_version, &optset_version);\r
+	DefineFlagOpt("logopts", "Log options.", (void *) &opt_logopts, &optset_logopts);\r
+	DefineFlagOpt("help", "Display command-line options.", (void *) &opt_help, &optset_help);\r
+	DefineStrOpt("log", "", "Log file name.", (void *) &opt_log, &optset_log);\r
+\r
+// Second expansion of myopts.h: each entry becomes a registration call.\r
+#undef FLAG_OPT\r
+#undef TOG_OPT\r
+#undef INT_OPT\r
+#undef UNS_OPT\r
+#undef FLT_OPT\r
+#undef STR_OPT\r
+#undef ENUM_OPT\r
+#define FLAG_OPT(LongName) DefineFlagOpt(#LongName, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define TOG_OPT(LongName, Default) DefineTogOpt(#LongName, Default, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define INT_OPT(LongName, Default, Min, Max) DefineIntOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define UNS_OPT(LongName, Default, Min, Max) DefineUnsOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define FLT_OPT(LongName, Default, Min, Max) DefineFloatOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define STR_OPT(LongName, Default) DefineStrOpt(#LongName, Default, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#define ENUM_OPT(LongName, Values, Default) DefineEnumOpt(#LongName, Values, Default, "help", (void *) &opt_##LongName, &optset_##LongName);\r
+#include "myopts.h"\r
+\r
+	// RecurseDepth was incremented on entry, so the outermost call sees 1\r
+	// (a test against 0 could never succeed).\r
+	if (RecurseDepth == 1)\r
+		g_Argv.clear();\r
+\r
+	for (int i = 0; i < argc; ++i) {\r
+		g_Argv.push_back(string(argv[i]));\r
+	}\r
+\r
+	int i = 1;\r
+	for (;;)\r
+		{\r
+		if (i >= argc)\r
+			break;\r
+		// Copied, not referenced: g_Argv may grow (and reallocate) below.\r
+		const string Arg = g_Argv[i];\r
+\r
+		if (Arg.empty())\r
+			{\r
+			// Skip the empty argument; without advancing i the loop\r
+			// would never terminate.\r
+			++i;\r
+			continue;\r
+			}\r
+		else if (Arg == "file:" && i + 1 < argc)\r
+			{\r
+			const string FileName = g_Argv[i+1];\r
+			vector<string> Args;\r
+			GetArgsFromFile(FileName, Args);\r
+			for (vector<string>::const_iterator p = Args.begin();\r
+			  p != Args.end(); ++p)\r
+				{\r
+				g_Argv.push_back(*p);\r
+				++argc;\r
+				}\r
+			i += 2;\r
+			continue;\r
+			}\r
+		else if (Arg.size() > 1 && Arg[0] == '-')\r
+			{\r
+			// Accept both -name and --name.\r
+			string LongName = (Arg.size() > 2 && Arg[1] == '-' ? Arg.substr(2) : Arg.substr(1));\r
+			OptInfo Opt = *GetOptInfo(LongName, true);\r
+			*Opt.OptSet = true;\r
+			if (Opt.Type == OT_Flag)\r
+				{\r
+				g_Opts.erase(Opt);\r
+				*(bool *) Opt.Value = true;\r
+				g_Opts.insert(Opt);\r
+				++i;\r
+				continue;\r
+				}\r
+			else if (Opt.Type == OT_Tog)\r
+				{\r
+				g_Opts.erase(Opt);\r
+				if (string("no") + Opt.LongName == LongName)\r
+					*(bool *) Opt.Value = false;\r
+				else\r
+					{\r
+					asserta(Opt.LongName == LongName);\r
+					*(bool *) Opt.Value = true;\r
+					}\r
+				g_Opts.insert(Opt);\r
+				++i;\r
+				continue;\r
+				}\r
+\r
+			// Remaining option types take their value from the next argument.\r
+			++i;\r
+			if (i >= argc)\r
+				CmdLineErr("Missing value for option --%s", LongName.c_str());\r
+\r
+			string Value = g_Argv[i];\r
+			SetOpt(Opt, Value);\r
+\r
+			++i;\r
+			continue;\r
+			}\r
+		else\r
+			CmdLineErr("Expected -option_name or --option_name, got '%s'", Arg.c_str());\r
+		}\r
+\r
+	--RecurseDepth;\r
+	if (RecurseDepth > 0)\r
+		return;\r
+\r
+	if (opt_help)\r
+		Help();\r
+\r
+	if (opt_compilerinfo)\r
+		CompilerInfo();\r
+\r
+	SetLogFileName(opt_log);\r
+\r
+	if (opt_log != "")\r
+		{\r
+		for (int i = 0; i < argc; ++i)\r
+			Log("%s%s", i == 0 ? "" : " ", g_Argv[i].c_str());\r
+		Log("\n");\r
+		time_t Now = time(0);\r
+		struct tm *t = localtime(&Now);\r
+		const char *s = asctime(t);\r
+		Log("Started %s", s); // there is a newline in s\r
+		Log("Version " MY_VERSION ".%s\n", SVN_VERSION);\r
+		Log("\n");\r
+		}\r
+\r
+	if (opt_logopts)\r
+		LogOpts();\r
+	}\r
+\r
+// Returns x as a percentage of y, or 0 when y is zero.\r
+double Pct(double x, double y)\r
+	{\r
+	if (y != 0.0f)\r
+		return (x*100.0f)/y;\r
+	return 0.0f;\r
+	}\r
+\r
+// Rebuilds the command line in s: the entries of g_Argv joined by blanks.\r
+void GetCmdLine(string &s)\r
+	{\r
+	s.clear();\r
+	const unsigned ArgCount = SIZE(g_Argv);\r
+	for (unsigned k = 0; k < ArgCount; ++k)\r
+		{\r
+		if (k != 0)\r
+			s += " ";\r
+		s += g_Argv[k];\r
+		}\r
+	}\r
+\r
+// Returns a copy of the C string s (terminator included) in memory\r
+// obtained through myalloc.\r
+char *mystrsave(const char *s)\r
+	{\r
+	const unsigned Length = unsigned(strlen(s));\r
+	char *Copy = myalloc(char, Length+1);\r
+	memcpy(Copy, s, Length+1);\r
+	return Copy;\r
+	}\r
+\r
+// Logs prefixspaces blanks followed by u right-aligned in a field of\r
+// width w; UINT_MAX is shown as "*".\r
+void Logu(unsigned u, unsigned w, unsigned prefixspaces)\r
+	{\r
+	for (unsigned Left = prefixspaces; Left > 0; --Left)\r
+		Log(" ");\r
+	if (u != UINT_MAX)\r
+		Log("%*u", w, u);\r
+	else\r
+		Log("%*.*s", w, w, "*");\r
+	}\r
+\r
+// Logs prefixspaces blanks followed by x with two decimals, right-aligned\r
+// in a field of width w; FLT_MAX is shown as "*".\r
+void Logf(float x, unsigned w, unsigned prefixspaces)\r
+	{\r
+	for (unsigned Left = prefixspaces; Left > 0; --Left)\r
+		Log(" ");\r
+	if (x != FLT_MAX)\r
+		Log("%*.2f", w, x);\r
+	else\r
+		Log("%*.*s", w, w, "*");\r
+	}\r
+\r
+// Current state of the simple linear congruential generator below.\r
+static uint32 g_SLCG_state = 1;\r
+\r
+// Numerical values used by Microsoft C, according to wikipedia:\r
+// http://en.wikipedia.org/wiki/Linear_congruential_generator\r
+// Multiplier (a) and increment (c) of state = state*a + c.\r
+static uint32 g_SLCG_a = 214013;\r
+static uint32 g_SLCG_c = 2531011;\r
+\r
+// Simple Linear Congruential Generator\r
+// Bad properties; used just to initialize the better generator.\r
+static uint32 SLCG_rand()\r
+	{\r
+	// One generator step; uint32 arithmetic wraps around.\r
+	const uint32 Next = g_SLCG_state*g_SLCG_a + g_SLCG_c;\r
+	g_SLCG_state = Next;\r
+	return Next;\r
+	}\r
+\r
+// Seeds the simple generator and discards its first ten outputs.\r
+static void SLCG_srand(uint32 Seed)\r
+	{\r
+	g_SLCG_state = Seed;\r
+	int Discard = 10;\r
+	while (Discard-- > 0)\r
+		SLCG_rand();\r
+	}\r
+\r
+/***\r
+A multiply-with-carry random number generator, see:\r
+http://en.wikipedia.org/wiki/Multiply-with-carry\r
+\r
+The particular multipliers used here were found on\r
+the web where they are attributed to George Marsaglia.\r
+***/\r
+\r
+// Set by InitRand() once the generator state has been seeded.\r
+static bool g_InitRandDone = false;\r
+// Generator state used by RandInt32(): g_X[0..3] hold the most recent\r
+// outputs and g_X[4] the carry.\r
+static uint32 g_X[5];\r
+\r
+// Returns the next 32-bit value of the multiply-with-carry generator,\r
+// seeding it on first use through InitRand().\r
+uint32 RandInt32()\r
+	{\r
+	InitRand();\r
+\r
+	// 64-bit weighted sum of the four previous outputs plus the carry.\r
+	uint64 Sum = 2111111111*(uint64) g_X[3];\r
+	Sum += 1492*(uint64) g_X[2];\r
+	Sum += 1776*(uint64) g_X[1];\r
+	Sum += 5115*(uint64) g_X[0];\r
+	Sum += g_X[4];\r
+\r
+	// Shift the history; the low half is the new output, the high half\r
+	// the new carry.\r
+	g_X[3] = g_X[2];\r
+	g_X[2] = g_X[1];\r
+	g_X[1] = g_X[0];\r
+	g_X[0] = (uint32) Sum;\r
+	g_X[4] = (uint32) (Sum >> 32);\r
+	return g_X[0];\r
+	}\r
+\r
+// Same as RandInt32(), returned as unsigned.\r
+unsigned randu32()\r
+	{\r
+	const uint32 Value = RandInt32();\r
+	return (unsigned) Value;\r
+	}\r
+\r
+// Seeds the random number generator once. The seed is opt_randseed when\r
+// that option was given, otherwise time(0)*getpid(). The five state words\r
+// come from the simple generator, then 100 outputs are discarded.\r
+void InitRand()\r
+	{\r
+	if (g_InitRandDone)\r
+		return;\r
+// Must be set before RandInt32() is called below, which calls InitRand().\r
+	g_InitRandDone = true;\r
+\r
+	unsigned Seed = (optset_randseed ? opt_randseed : (unsigned) (time(0)*getpid()));\r
+	Log("RandSeed=%u\n", Seed);\r
+	SLCG_srand(Seed);\r
+\r
+	unsigned k = 0;\r
+	while (k < 5)\r
+		g_X[k++] = SLCG_rand();\r
+\r
+	for (k = 0; k < 100; k++)\r
+		RandInt32();\r
+	}\r
+\r
+// MUST COME AT END BECAUSE OF #undef\r
+#if RCE_MALLOC\r
+#undef mymalloc\r
+#undef myfree\r
+#undef myfree2\r
+// RCE_MALLOC build: forwards the allocation to rce_malloc().\r
+void *mymalloc(unsigned bytes, const char *FileName, int Line)\r
+	{\r
+	void *rce_malloc(unsigned bytes, const char *FileName, int Line);\r
+	void *Block = rce_malloc(bytes, FileName, Line);\r
+	return Block;\r
+	}\r
+\r
+// RCE_MALLOC build: forwards the release to rce_free().\r
+void myfree(void *p, const char *FileName, int Line)\r
+	{\r
+	extern void rce_free(void *p, const char *FileName, int Line);\r
+	rce_free(p, FileName, Line);\r
+	}\r
+\r
+// RCE_MALLOC build: forwards the release to rce_free(); bytes is not used.\r
+void myfree2(void *p, unsigned bytes, const char *FileName, int Line)\r
+	{\r
+	extern void rce_free(void *p, const char *FileName, int Line);\r
+	rce_free(p, FileName, Line);\r
+	}\r
+\r
+#else // RCE_MALLOC\r
+// Allocates bytes with malloc() while maintaining the allocation counters.\r
+// Running out of memory is fatal: the situation is reported on stderr,\r
+// LogAllocs() is called and the program ends through Die().\r
+void *mymalloc(unsigned bytes)\r
+	{\r
+	++g_NewCalls;\r
+	if (g_InitialMemUseBytes == 0)\r
+		g_InitialMemUseBytes = GetMemUseBytes();\r
+\r
+	g_TotalAllocBytes += bytes;\r
+	g_NetBytes += bytes;\r
+	if (g_NetBytes > g_MaxNetBytes)\r
+		{\r
+		if (g_NetBytes > g_MaxNetBytes + 10000000)\r
+			GetMemUseBytes();//to force update of peak\r
+		g_MaxNetBytes = g_NetBytes;\r
+		}\r
+\r
+	void *Block = malloc(bytes);\r
+	//void *Block = _malloc_dbg(bytes, _NORMAL_BLOCK, __FILE__, __LINE__);\r
+	if (Block != 0)\r
+		return Block;\r
+\r
+	const double InUse = GetMemUseBytes();\r
+	fprintf(stderr, "\nOut of memory mymalloc(%u), curr %.3g bytes",\r
+	  (unsigned) bytes, InUse);\r
+	void LogAllocs();\r
+	LogAllocs();\r
+#if DEBUG && defined(_MSC_VER)\r
+	asserta(_CrtCheckMemory());\r
+#endif\r
+	Die("Out of memory, mymalloc(%u), curr %.3g bytes\n",\r
+	  (unsigned) bytes, InUse);\r
+	return Block;\r
+	}\r
+\r
+// Releases p with free(); a null pointer is ignored.\r
+void myfree(void *p)\r
+	{\r
+	if (p != 0)\r
+		free(p);\r
+	//_free_dbg(p, _NORMAL_BLOCK);\r
+	}\r
+\r
+// Releases p with free() and records bytes in the free counters. The\r
+// counters are updated even when p is null.\r
+void myfree2(void *p, unsigned bytes)\r
+	{\r
+	++g_FreeCalls;\r
+	g_TotalFreeBytes += bytes;\r
+	g_NetBytes -= bytes;\r
+\r
+	if (p != 0)\r
+		free(p);\r
+	}\r
+#endif\r
--- /dev/null
+#ifndef myutils_h\r
+#define myutils_h\r
+\r
+#define RCE_MALLOC 0\r
+
+#include <stdio.h>\r
+#include <sys/types.h>\r
+#include <string>\r
+#include <string.h>\r
+#include <memory.h>\r
+#include <vector>\r
+#include <math.h>\r
+#include <stdarg.h>\r
+#include <cstdlib>\r
+#include <climits>\r
+\r
+#ifndef _MSC_VER\r
+#include <inttypes.h>\r
+#endif\r
+\r
+using namespace std;\r
+\r
+#ifdef _MSC_VER\r
+#include <crtdbg.h>\r
+#pragma warning(disable: 4996) // deprecated functions\r
+#define _CRT_SECURE_NO_DEPRECATE 1\r
+#endif\r
+\r
+#if defined(_DEBUG) && !defined(DEBUG)\r
+#define DEBUG 1\r
+#endif\r
+\r
+#if defined(DEBUG) && !defined(_DEBUG)\r
+#define _DEBUG 1\r
+#endif\r
+\r
+#ifndef NDEBUG\r
+#define DEBUG 1\r
+#define _DEBUG 1\r
+#endif\r
+\r
+// Short names for basic types.\r
+// NOTE(review): the sized names assume the usual widths of the underlying\r
+// built-in types; float32 is an alias of double here - confirm intent.\r
+typedef unsigned char byte;\r
+typedef unsigned short uint16;\r
+typedef unsigned uint32;\r
+typedef int int32;\r
+typedef double float32;\r
+typedef signed char int8;\r
+typedef unsigned char uint8;\r
+\r
+#ifdef _MSC_VER\r
+\r
+typedef __int64 int64;\r
+typedef unsigned __int64 uint64;\r
+\r
+#define INT64_PRINTF "lld"\r
+#define UINT64_PRINTF "llu"\r
+\r
+#define SIZE_T_PRINTF "u"\r
+#define OFF64_T_PRINTF "lld"\r
+\r
+#define INT64_PRINTFX "llx"\r
+#define UINT64_PRINTFX "llx"\r
+\r
+#define SIZE_T_PRINTFX "x"\r
+#define OFF64_T_PRINTFX "llx"\r
+\r
+#elif defined(__x86_64__)\r
+\r
+typedef long int64;\r
+typedef unsigned long uint64;\r
+\r
+#define INT64_PRINTF "ld"\r
+#define UINT64_PRINTF "lu"\r
+\r
+#define SIZE_T_PRINTF "lu"\r
+#define OFF64_T_PRINTF "ld"\r
+\r
+#define INT64_PRINTFX "lx"\r
+#define UINT64_PRINTFX "lx"\r
+\r
+#define SIZE_T_PRINTFX "lx"\r
+#define OFF64_T_PRINTFX "lx"\r
+\r
+#else\r
+\r
+typedef long long int64;\r
+typedef unsigned long long uint64;\r
+\r
+#define INT64_PRINTF "lld"\r
+#define UINT64_PRINTF "llu"\r
+\r
+#define SIZE_T_PRINTF "u"\r
+#define OFF64_T_PRINTF "lld"\r
+\r
+#define INT64_PRINTFX "llx"\r
+#define UINT64_PRINTFX "llx"\r
+\r
+#define SIZE_T_PRINTFX "x"\r
+#define OFF64_T_PRINTFX "llx"\r
+#endif\r
+\r
+#define d64 INT64_PRINTF\r
+#define u64 UINT64_PRINTF\r
+#define x64 UINT64_PRINTFX\r
+\r
+// const uint64 UINT64_MAX = (~((uint64) 0));\r
+\r
+// Assertion support. myassertfail() receives the failed expression text and\r
+// its source location.\r
+//   assert / myassert : active unless NDEBUG is defined, otherwise a no-op.\r
+//   asserta           : always active, regardless of NDEBUG.\r
+// Any assert macro defined earlier is replaced.\r
+void myassertfail(const char *Exp, const char *File, unsigned Line);\r
+#undef assert\r
+#ifdef NDEBUG\r
+#define assert(exp) ((void)0)\r
+#define myassert(exp) ((void)0)\r
+#else\r
+#define assert(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) )\r
+#define myassert(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) )\r
+#endif\r
+#define asserta(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) )\r
+\r
+#define ureturn(x) return (x)\r
+\r
+#define NotUsed(v) ((void *) &v)\r
+\r
+// One-character renderings of a boolean for log output:\r
+// pom = plus or minus, tof = true or false, yon = yes or no.\r
+static inline char pom(bool Plus)\r
+    {\r
+    if (Plus)\r
+        return '+';\r
+    return '-';\r
+    }\r
+static inline char tof(bool x)\r
+    {\r
+    if (x)\r
+        return 'T';\r
+    return 'F';\r
+    }\r
+static inline char yon(bool x)\r
+    {\r
+    if (x)\r
+        return 'Y';\r
+    return 'N';\r
+    }\r
+unsigned GetElapsedSecs();\r
+\r
+#if RCE_MALLOC\r
+\r
+void *rce_malloc(unsigned bytes, const char *FileName, int Line);\r
+void rce_free(void *p, const char *FileName, int LineNr);\r
+void rce_chkmem();\r
+\r
+void rce_dumpmem_(const char *FileName, int LineNr);\r
+#define rce_dumpmem() rce_dumpmem_(__FILE__, __LINE__)\r
+\r
+void rce_assertvalidptr_(void *p, const char *FileName, int LineNr);\r
+#define rce_assertvalidptr(p) rce_assertvalidptr_(p, __FILE__, __LINE__)\r
+\r
+void rce_dumpptr_(void *p, const char *FileName, int LineNr);\r
+#define rce_dumpptr(p) rce_dumpptr_(p, __FILE__, __LINE__)\r
+\r
+#define mymalloc(n) rce_malloc((n), __FILE__, __LINE__)\r
+#define myfree(p) rce_free(p, __FILE__, __LINE__)\r
+#define myfree2(p,n) rce_free(p, __FILE__, __LINE__)\r
+#define myalloc(t, n) (t *) rce_malloc((n)*sizeof(t), __FILE__, __LINE__)\r
+\r
+#else // RCE_MALLOC\r
+void *mymalloc(unsigned bytes);\r
+void myfree2(void *p, unsigned Bytes);\r
+void myfree(void *p);\r
+#define rce_chkmem() /* empty */\r
+#define myalloc(t, n) (t *) mymalloc((n)*sizeof(t))\r
+#endif // RCE_MALLOC\r
+\r
+#define SIZE(c) unsigned((c).size())\r
+\r
+bool myisatty(int fd);\r
+\r
+#ifdef _MSC_VER\r
+#define off_t __int64\r
+#endif\r
+\r
+FILE *OpenStdioFile(const string &FileName);\r
+FILE *CreateStdioFile(const string &FileName);\r
+bool CanSetStdioFilePos(FILE *f);\r
+void CloseStdioFile(FILE *f);\r
+void SetStdioFilePos(FILE *f, off_t Pos);\r
+void ReadStdioFile(FILE *f, off_t Pos, void *Buffer, unsigned Bytes);\r
+void ReadStdioFile(FILE *f, void *Buffer, unsigned Bytes);\r
+void WriteStdioFile(FILE *f, off_t Pos, const void *Buffer, unsigned Bytes);\r
+void WriteStdioFile(FILE *f, const void *Buffer, unsigned Bytes);\r
+bool ReadLineStdioFile(FILE *f, char *Line, unsigned Bytes);\r
+bool ReadLineStdioFile(FILE *f, string &Line);\r
+byte *ReadAllStdioFile(FILE *f, off_t &FileSize);\r
+byte *ReadAllStdioFile(const string &FileName, off_t &FileSize);\r
+void AppendStdioFileToFile(FILE *fFrom, FILE *fTo);\r
+void FlushStdioFile(FILE *f);\r
+bool StdioFileExists(const string &FileName);\r
+off_t GetStdioFilePos(FILE *f);\r
+off_t GetStdioFileSize(FILE *f);\r
+void LogStdioFileState(FILE *f);\r
+void RenameStdioFile(const string &FileNameFrom, const string &FileNameTo);\r
+void DeleteStdioFile(const string &FileName);\r
+\r
+void myvstrprintf(string &Str, const char *szFormat, va_list ArgList);\r
+void myvstrprintf(string &Str, const char *szFormat, ...);\r
+\r
+void SetLogFileName(const string &FileName);\r
+void Log(const char *szFormat, ...);\r
+\r
+void Die(const char *szFormat, ...);\r
+void Warning(const char *szFormat, ...);\r
+\r
+void ProgressStep(unsigned i, unsigned N, const char *Format, ...);\r
+void Progress(const char *szFormat, ...);\r
+void Progress(const string &Str);\r
+void ProgressLog(const char *szFormat, ...);\r
+void ProgressExit();\r
+\r
+char *mystrsave(const char *s);\r
+\r
+double GetPeakMemUseBytes();\r
+\r
+// Approximate floating-point equality.\r
+// The default tolerance is `epsilon`; when |x| exceeds 10000 the tolerance\r
+// becomes |x|/10000 instead (relative comparison based on x only).\r
+const double epsilon = 0.01;\r
+inline bool feq(double x, double y, double epsilon)\r
+    {\r
+    const double Magnitude = fabs(x);\r
+    const double Tolerance = (Magnitude > 10000) ? Magnitude/10000 : epsilon;\r
+    // Written as a negated '>' so that NaN differences still compare "equal",\r
+    // exactly as the two-step test did.\r
+    return !(fabs(x - y) > Tolerance);\r
+    }\r
+\r
+// Two-argument form: uses the global epsilon, and additionally treats any two\r
+// values that are both below -1e6 as equal.\r
+inline bool feq(double x, double y)\r
+    {\r
+    const bool BothVeryNegative = (x < -1e6 && y < -1e6);\r
+    if (BothVeryNegative)\r
+        return true;\r
+    return feq(x, y, epsilon);\r
+    }\r
+\r
+#define asserteq(x, y) assert(feq(x, y))\r
+#define assertaeq(x, y) asserta(feq(x, y))\r
+\r
+// Zero-fills the first n elements of array/pointer a.\r
+// Both arguments are parenthesized so that expression arguments such as\r
+// zero(p, n + 1) or zero(base + k, n) clear the intended number of bytes.\r
+#define zero(a, n) memset((a), 0, (n)*sizeof((a)[0]))\r
+\r
+void InitRand();\r
+unsigned randu32();\r
+void Split(const string &Str, vector<string> &Fields, char Sep = 0);\r
+double Pct(double x, double y);\r
+double GetMemUseBytes();\r
+const char *MemBytesToStr(double Bytes);\r
+const char *IntToStr(unsigned i);\r
+const char *FloatToStr(double d);\r
+const char *SecsToStr(double Secs);\r
+void Logu(unsigned u, unsigned w, unsigned prefixspaces = 2);\r
+void Logf(float x, unsigned w, unsigned prefixspaces = 2);\r
+const char *SecsToHHMMSS(int Secs);\r
+\r
+void MyCmdLine(int argc, char **argv);\r
+void CmdLineErr(const char *Format, ...);\r
+void Help();\r
+void GetCmdLine(string &s);\r
+\r
+#define FLAG_OPT(LongName) extern bool opt_##LongName; extern bool optset_##LongName;\r
+#define TOG_OPT(LongName, Default) extern bool opt_##LongName; extern bool optset_##LongName;\r
+#define INT_OPT(LongName, Default, Min, Max) extern int opt_##LongName; extern bool optset_##LongName;\r
+#define UNS_OPT(LongName, Default, Min, Max) extern unsigned opt_##LongName; extern bool optset_##LongName;\r
+#define FLT_OPT(LongName, Default, Min, Max) extern double opt_##LongName; extern bool optset_##LongName;\r
+#define STR_OPT(LongName, Default) extern string opt_##LongName; extern bool optset_##LongName;\r
+#define ENUM_OPT(LongName, Default, Values) extern int opt_##LongName; extern bool optset_##LongName;\r
+#include "myopts.h"\r
+#undef FLAG_OPT\r
+#undef TOG_OPT\r
+#undef INT_OPT\r
+#undef UNS_OPT\r
+#undef FLT_OPT\r
+#undef STR_OPT\r
+#undef ENUM_OPT\r
+\r
+extern const char *SVN_VERSION;\r
+extern const char *SVN_MODS;\r
+extern bool opt_quiet;
+extern bool opt_version;
+extern FILE *g_fLog;
+\r
+#endif // myutils_h\r
--- /dev/null
+#ifndef orf_h\r
+#define orf_h\r
+\r
+#include "alpha.h"\r
+\r
+// Describes one ORF: a nucleotide sequence, an amino-acid sequence, a frame\r
+// and nucleotide coordinates, plus helpers that convert between nucleotide\r
+// and amino-acid positions. Member functions are defined elsewhere.\r
+struct ORFData\r
+ {\r
+ const byte *NucSeq; // nucleotide sequence (not owned here as far as this header shows)\r
+ const byte *AminoSeq; // amino-acid sequence\r
+ int Frame; // reading frame -- presumably signed to encode strand; TODO confirm\r
+ unsigned NucL; // presumably length of NucSeq\r
+ unsigned AminoL; // presumably length of AminoSeq\r
+ unsigned NucLo; // presumably low nucleotide coordinate of the ORF\r
+ unsigned NucHi; // presumably high nucleotide coordinate of the ORF\r
+ ORFData *Next; // pointer to another ORFData; presumably a linked list\r
+\r
+ // Coordinate conversion helpers (semantics defined by their implementations).\r
+ unsigned GetNucPosFirstBase() const;\r
+ unsigned GetAAPos(unsigned NucPos) const;\r
+ unsigned GetCodex(unsigned NucPos) const;\r
+ unsigned GetNucLo(unsigned AALo, unsigned AAHi) const;\r
+ unsigned GetNucHi(unsigned AALo, unsigned AAHi) const;\r
+ unsigned GetAALo(unsigned NucLo, unsigned NucHi) const;\r
+ unsigned GetAAHi(unsigned NucLo, unsigned NucHi) const;\r
+ unsigned GetNucPosFirstBaseInCodon(unsigned AAPos) const;\r
+ unsigned GetNucPosLastBaseInCodon(unsigned AAPos) const;\r
+ unsigned RoundToCodonLo(unsigned NucPos) const;\r
+ unsigned RoundToCodonHi(unsigned NucPos) const;\r
+ // Logging helpers.\r
+ void LogMe() const;\r
+ void LogMe2() const;\r
+ };\r
+\r
+const byte ORFEND = '.';\r
+\r
+void GetORFs(const byte *NucSeq, unsigned NucL, vector<ORFData> &ORFs,\r
+ unsigned ORFStyle, int FindFrame, int Sign);\r
+\r
+#endif // orf_h\r
--- /dev/null
+#ifndef out_h\r
+#define out_h\r
+\r
+#include "seq.h"\r
+#include "hsp.h"\r
+#include "orf.h"\r
+#include "path.h"\r
+#include <float.h>\r
+\r
+// One alignment between two sequences (SA and SB), with its HSP, path\r
+// string and score fields, passed to the output routines.\r
+struct AlnData\r
+ {\r
+/***\r
+SA.Seq and SB.Seq align.\r
+Reverse strand stuff for nucleotides is handled like this:\r
+ SA.RevComp must be false.\r
+ If SB.RevComp is true, then SB.Seq is r.c.'d relative to the sequence in\r
+ the input file (query or db). If so, coordinates in HSP refer to SB.Seq\r
+ so are also r.c.'d relative to the original sequence.\r
+***/\r
+ SeqData SA;\r
+ SeqData SB;\r
+ HSPData HSP;\r
+ const char *Path; // alignment path string; not owned by this struct as far as shown here\r
+ char IdDesc[256];\r
+\r
+ float FractId;\r
+ float RawScore;\r
+ float BitScore;\r
+ float Evalue;\r
+\r
+ // Writes a one-line summary (HSP plus both labels) to the log.\r
+ void LogMe() const\r
+ {\r
+ Log("AD: ");\r
+ HSP.LogMe();\r
+ Log(" %s,%s\n", SA.Label, SB.Label);\r
+ }\r
+ };\r
+\r
+bool OnDerepHit(const SeqData &SA, const SeqData &SB);\r
+\r
+bool OnLocalUngappedHit(const SeqData &SA, const SeqData &SB,\r
+ const HSPData &HSP, float &Evalue, float &FractId);\r
+\r
+bool OnLocalGappedHit(const SeqData &SA, const SeqData &SB,\r
+ const HSPData &HSP, const PathData &PD, float &Evalue, float &FractId);\r
+\r
+bool OnGlobalHit(const SeqData &SA, const SeqData &SB, const PathData &PD,\r
+ float &FractId);\r
+\r
+void OnReject(const SeqData &SA, const SeqData &SB, double FractId,\r
+ const char *Path);\r
+\r
+void OnNotMatched(const char *Label, unsigned L);\r
+void OnNewCluster(unsigned ClusterIndex, const char *Label, unsigned L);\r
+void OnNewLibCluster(unsigned ClusterIndex, const char *Label, unsigned L);\r
+void OnLibCluster(unsigned ClusterIndex, unsigned Size, double AvgId,\r
+ const char *Label);\r
+void OnNewCluster(unsigned ClusterIndex, unsigned Size, double AvgId,\r
+ const char *Label);\r
+void OnChainCov(const SeqData &NucleoSD, const SeqData &TargetSD,\r
+ float Score, float ChainCov);\r
+\r
+void SetUserFieldIndexes(const string &s);\r
+\r
+void BlastOut(FILE *f, const AlnData &AD);\r
+void Blast6Out(FILE *f, const AlnData &AD);\r
+void FastaPairOut(FILE *f, const AlnData &AD);\r
+void UserOut(FILE *f, const AlnData &AD);\r
+\r
+void BlastOutORF(FILE *f, const AlnData &AD);\r
+\r
+void OpenOutputFiles();\r
+void CloseOutputFiles();\r
+void SetLibSeedCount(unsigned DBSeqCount);\r
+const char *UserFieldIndexToStr(unsigned i);\r
+\r
+extern float **g_SubstMx;\r
+\r
+// Annotation characters for nucleotide alignments. Being static in a header,\r
+// each translation unit that includes this file gets its own copy.\r
+static char g_IdChar = '|';\r
+static char g_DiffChar = ' ';\r
+\r
+// Annotation symbol for a nucleotide column: g_IdChar when the two letters\r
+// match case-insensitively, g_DiffChar otherwise.\r
+static inline char GetSymN(byte Letter1, byte Letter2)\r
+    {\r
+    const bool Identical = (toupper(Letter1) == toupper(Letter2));\r
+    return Identical ? g_IdChar : g_DiffChar;\r
+    }\r
+\r
+// Annotation symbol for an amino-acid column:\r
+//   '|' identical letters (case-insensitive)\r
+//   ':' substitution score >= 2\r
+//   '.' substitution score > 0\r
+//   ' ' otherwise\r
+// Scores are read from g_SubstMx, indexed by the upper-cased letters.\r
+static inline char GetSymA(byte Letter1, byte Letter2)\r
+    {\r
+    const byte Upper1 = (byte) toupper(Letter1);\r
+    const byte Upper2 = (byte) toupper(Letter2);\r
+    if (Upper1 == Upper2)\r
+        return '|';\r
+\r
+    const float Score = g_SubstMx[Upper1][Upper2];\r
+    return (Score >= 2.0f) ? ':' : ((Score > 0.0f) ? '.' : ' ');\r
+    }\r
+\r
+// Dispatches to the nucleotide or amino-acid annotation routine.\r
+static inline char GetSym(byte Letter1, byte Letter2, bool Nucleo)\r
+    {\r
+    return Nucleo ? GetSymN(Letter1, Letter2) : GetSymA(Letter1, Letter2);\r
+    }\r
+\r
+// Field width needed to print n in decimal: exact for n < 1,000,000 and a\r
+// fixed width of 10 for anything larger.\r
+static unsigned GetNDig(unsigned n)\r
+    {\r
+    unsigned Limit = 10;\r
+    for (unsigned Digits = 1; Digits <= 6; ++Digits)\r
+        {\r
+        if (n < Limit)\r
+            return Digits;\r
+        Limit *= 10;\r
+        }\r
+    return 10;\r
+    }\r
+\r
+extern unsigned *g_UserFieldIndexes;\r
+extern unsigned g_UserFieldCount;\r
+\r
+#endif // out_h\r
--- /dev/null
+#include "myutils.h"\r
+#include "path.h"\r
+#include "timing.h"\r
+\r
+#define TRACE 0\r
+\r
+const unsigned PathMagic = 0x9A783A16;\r
+\r
+// One slot of the path buffer pool managed by AllocBuffer()/FreeBuffer().\r
+struct PathBuffer\r
+ {\r
+ unsigned Magic; // set to PathMagic on creation; checked by AllocBuffer()\r
+ char *Buffer; // storage, or 0 while the slot has never been given a buffer\r
+ unsigned Size; // number of chars allocated for Buffer\r
+ bool InUse; // true while the buffer is handed out to a caller\r
+ };\r
+\r
+static PathBuffer **g_PathBuffers;\r
+static unsigned g_PathBufferSize;\r
+\r
+// Returns a buffer of at least Size chars from the pool, or 0 when Size is 0.\r
+// Search order:\r
+//   1. the first free slot that is either large enough or has no buffer yet\r
+//      (the latter gets a new buffer of Size + 1024 chars);\r
+//   2. otherwise the slot table is grown by 1024 slots and the first new slot\r
+//      is used.\r
+// Free slots whose existing buffer is too small are skipped, not resized.\r
+// The returned buffer must be handed back with FreeBuffer().\r
+static char *AllocBuffer(unsigned Size)\r
+ {\r
+ if (Size == 0)\r
+ return 0;\r
+\r
+// Is there a free buffer that is big enough?\r
+ for (unsigned i = 0; i < g_PathBufferSize; ++i)\r
+ {\r
+ PathBuffer *PB = g_PathBuffers[i];\r
+ asserta(PB->Magic == PathMagic);\r
+ if (!PB->InUse)\r
+ {\r
+ if (PB->Size >= Size)\r
+ {\r
+ PB->InUse = true;\r
+ return PB->Buffer;\r
+ }\r
+ // Slot has never been given storage: allocate with 1024 chars of slack.\r
+ if (PB->Buffer == 0)\r
+ {\r
+ unsigned Size2 = Size + 1024;\r
+ PB->Buffer = MYALLOC(char, Size2, Path);\r
+ PB->Size = Size2;\r
+ PB->InUse = true;\r
+ return PB->Buffer;\r
+ }\r
+ }\r
+ }\r
+\r
+// No available buffer, must expand g_PathBuffers[]\r
+ unsigned NewPathBufferSize = g_PathBufferSize + 1024;\r
+ PathBuffer **NewPathBuffers = MYALLOC(PathBuffer *, NewPathBufferSize, Path);\r
+ \r
+ // Existing slots are carried over by pointer, so buffers already handed out stay valid.\r
+ for (unsigned i = 0; i < g_PathBufferSize; ++i)\r
+ NewPathBuffers[i] = g_PathBuffers[i];\r
+\r
+ // Create the new, empty slots.\r
+ for (unsigned i = g_PathBufferSize; i < NewPathBufferSize; ++i)\r
+ {\r
+ PathBuffer *PB = MYALLOC(PathBuffer, 1, Path);\r
+ PB->Magic = PathMagic;\r
+ PB->Buffer = 0;\r
+ PB->Size = 0;\r
+ PB->InUse = false;\r
+ NewPathBuffers[i] = PB;\r
+ }\r
+\r
+ // First of the freshly created slots; picked before the globals are replaced.\r
+ PathBuffer *PB = NewPathBuffers[g_PathBufferSize];\r
+\r
+ MYFREE(g_PathBuffers, g_PathBufferSize, Path);\r
+ g_PathBuffers = NewPathBuffers;\r
+ g_PathBufferSize = NewPathBufferSize;\r
+\r
+ asserta(!PB->InUse && PB->Buffer == 0);\r
+\r
+ unsigned Size2 = Size + 1024;\r
+ PB->Buffer = MYALLOC(char, Size2, Path);\r
+ PB->Size = Size2;\r
+ PB->InUse = true;\r
+ return PB->Buffer;\r
+ }\r
+\r
+// Hands a buffer obtained from AllocBuffer() back to the pool. The storage is\r
+// kept for reuse; only the slot's InUse flag is cleared. A null pointer is\r
+// ignored; a pointer that is not in the pool is a fatal error.\r
+static void FreeBuffer(char *Buffer)\r
+    {\r
+    if (Buffer == 0)\r
+        return;\r
+\r
+    for (unsigned Slot = 0; Slot < g_PathBufferSize; ++Slot)\r
+        {\r
+        PathBuffer &PB = *g_PathBuffers[Slot];\r
+        if (PB.Buffer != Buffer)\r
+            continue;\r
+\r
+        asserta(PB.InUse);\r
+        PB.InUse = false;\r
+        return;\r
+        }\r
+\r
+    Die("FreeBuffer, not found");\r
+    }\r
+\r
+// Ensures the path owns a buffer with room for MaxLen characters plus a\r
+// terminator. When MaxLen < Bytes the function returns immediately and keeps\r
+// the current state (Front is not checked in that case). Otherwise any current\r
+// buffer is returned to the pool, a new one is taken, Back is set to the last\r
+// byte and Start is reset to 0.\r
+void PathData::Alloc(unsigned MaxLen)\r
+ {\r
+ if (MaxLen < Bytes)\r
+ return;\r
+\r
+ StartTimer(PathAlloc);\r
+ if (Bytes > 0)\r
+ {\r
+ FreeBuffer(Front);\r
+ }\r
+\r
+ // One extra byte beyond MaxLen.\r
+ Bytes = MaxLen + 1;\r
+ Front = AllocBuffer(Bytes);\r
+ Back = Front + Bytes - 1;\r
+ Start = 0;\r
+ EndTimer(PathAlloc);\r
+ }\r
+\r
+// Returns the buffer (if any) to the pool and resets the object to the\r
+// "no buffer" state.\r
+void PathData::Free()\r
+    {\r
+    FreeBuffer(Front);\r
+    Front = 0;\r
+    Start = 0;\r
+    Back = 0;\r
+    // Bytes must drop to zero together with the buffer: Alloc() returns early\r
+    // whenever MaxLen < Bytes, so a stale size would leave Front null after a\r
+    // later Alloc() and FromStr()/Copy() would then write through it.\r
+    Bytes = 0;\r
+    }\r
+\r
+// Makes this path a copy of rhs.\r
+//  - Copying an object onto itself is a no-op.\r
+//  - An empty source (no buffer or no Start) leaves this path empty instead\r
+//    of dereferencing a null pointer.\r
+//  - The whole source buffer (rhs.Bytes bytes) is copied rather than a C\r
+//    string starting at Front: the text begins at Start, which need not equal\r
+//    Front, so bytes before Start cannot be assumed to be part of the string.\r
+void PathData::Copy(const PathData &rhs)\r
+    {\r
+    if (this == &rhs)\r
+        return;\r
+\r
+    if (rhs.Front == 0 || rhs.Start == 0)\r
+        {\r
+        Start = 0;\r
+        return;\r
+        }\r
+\r
+    // Alloc() guarantees room for at least rhs.Bytes + 1 bytes here (or keeps\r
+    // an existing buffer that is already larger than rhs.Bytes).\r
+    Alloc(rhs.Bytes);\r
+    memcpy(Front, rhs.Front, rhs.Bytes);\r
+    Start = Front + (rhs.Start - rhs.Front);\r
+    }\r
+\r
+// Loads the path from a null-terminated string; afterwards Start == Front.\r
+// PathStr must not be null.\r
+void PathData::FromStr(const char *PathStr)\r
+    {\r
+    asserta(PathStr != 0);\r
+    // Length including the terminating null.\r
+    const unsigned NeededBytes = (unsigned) strlen(PathStr) + 1;\r
+    Alloc(NeededBytes);\r
+    memcpy(Front, PathStr, NeededBytes);\r
+    Start = Front;\r
+    }\r
+\r
+// Logs the number of pool slots and the total size of their buffers.\r
+void LogPathStats()\r
+    {\r
+    Log("\n");\r
+    unsigned TotalBytes = 0;\r
+    unsigned Slot = 0;\r
+    while (Slot < g_PathBufferSize)\r
+        {\r
+        TotalBytes += g_PathBuffers[Slot]->Size;\r
+        ++Slot;\r
+        }\r
+    Log("%u paths allocated, total memory %u bytes\n", g_PathBufferSize, TotalBytes);\r
+    }\r
--- /dev/null
+#ifndef path_h\r
+#define path_h\r
+\r
+// An alignment path stored as a null-terminated character string inside a\r
+// buffer managed by Alloc()/Free(). Copy construction and assignment are\r
+// declared private, so objects cannot be copied implicitly; use Copy().\r
+struct PathData\r
+ {\r
+private:\r
+ PathData(PathData &);\r
+ PathData &operator=(PathData &);\r
+\r
+public:\r
+ char *Start; // first character of the path, or 0 when the path is empty\r
+ char *Front; // first byte of the buffer, or 0 when there is no buffer\r
+ char *Back; // last byte of the buffer (Front + Bytes - 1, see Alloc)\r
+ unsigned Bytes; // buffer size recorded by Alloc()\r
+\r
+public:\r
+ PathData()\r
+ {\r
+ Clear(true);\r
+ }\r
+ ~PathData()\r
+ {\r
+ Free();\r
+ }\r
+ void Free();\r
+ void Alloc(unsigned MaxLen);\r
+ // ctor == true: initialise all members without touching any buffer (used by\r
+ // the constructor, where the members are still uninitialised).\r
+ // ctor == false: mark the path empty and release the buffer via Free().\r
+ void Clear(bool ctor = false)\r
+ {\r
+ Start = 0;\r
+ if (ctor)\r
+ {\r
+ Front = 0;\r
+ Back = 0;\r
+ Bytes = 0;\r
+ }\r
+ else\r
+ Free();\r
+ }\r
+ void Copy(const PathData &rhs);\r
+ void FromStr(const char *PathStr);\r
+ // Reverses the path string in place; the path must not be empty.\r
+ void Reverse()\r
+ {\r
+ asserta(Start != 0);\r
+ unsigned L = (unsigned) strlen(Start);\r
+ for (unsigned k = 0; k < L/2; ++k)\r
+ {\r
+ char c = Start[k];\r
+ Start[k] = Start[L-k-1];\r
+ Start[L-k-1] = c;\r
+ }\r
+ }\r
+ // Marks the path empty without releasing the buffer.\r
+ void SetEmpty()\r
+ {\r
+ Start = 0;\r
+ }\r
+\r
+ bool IsEmpty() const\r
+ {\r
+ return Start == 0;\r
+ }\r
+ };\r
+\r
+#endif // path_h\r
--- /dev/null
+#include "myutils.h"\r
+#include "ultra.h"\r
+#include "chime.h"\r
+#include "uc.h"\r
+#include "dp.h"\r
+#include <set>\r
+#include <algorithm>\r
+\r
+#define TRACE 0\r
+\r
+extern FILE *g_fUChime;\r
+\r
+void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
+ vector<unsigned> &Parents);\r
+\r
+void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
+ const string &PathQA, const string &PathQB, ChimeHit2 &Hit);\r
+\r
+double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);\r
+\r
+// Computes, for every query position, a windowed count of identical columns\r
+// in the alignment of query QSD against parent PSD.\r
+//   Path  : alignment path; 'M' consumes one letter of each sequence, 'D'\r
+//           consumes a query letter only, 'I' a parent letter only.\r
+//   d     : window parameter.\r
+//   IdVec : output, one entry per query position (size QSD.L).\r
+// For positions below d the entry is the running count of identities so far;\r
+// from position d on it counts identities over positions QPos-d .. QPos.\r
+// If the query is not longer than d, IdVec is filled with zeros.\r
+// Letters are compared case-insensitively; 'D' columns count as not identical.\r
+static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,\r
+ vector<unsigned> &IdVec, unsigned d)\r
+ {\r
+ IdVec.clear();\r
+ const unsigned ColCount = SIZE(Path);\r
+\r
+ const byte *Q = QSD.Seq;\r
+ const byte *P = PSD.Seq;\r
+\r
+ const unsigned QL = QSD.L;\r
+ // NOTE(review): PL is not used below.\r
+ const unsigned PL = PSD.L;\r
+\r
+ if (QL <= d)\r
+ {\r
+ IdVec.resize(QSD.L, 0);\r
+ return;\r
+ }\r
+\r
+ unsigned QPos = 0;\r
+ unsigned PPos = 0;\r
+\r
+ // Pass 1: one identity flag per query position, in query order.\r
+ vector<bool> SameVec;\r
+ SameVec.reserve(QL);\r
+ for (unsigned Col = 0; Col < ColCount; ++Col)\r
+ {\r
+ char c = Path[Col];\r
+\r
+ bool Same = false;\r
+ if (c == 'M')\r
+ {\r
+ byte q = Q[QPos];\r
+ byte p = P[PPos];\r
+ Same = (toupper(q) == toupper(p));\r
+ }\r
+\r
+ if (c == 'M' || c == 'D')\r
+ {\r
+ ++QPos;\r
+ SameVec.push_back(Same);\r
+ }\r
+\r
+ if (c == 'M' || c == 'I')\r
+ ++PPos;\r
+ }\r
+\r
+ // The path must consume exactly the whole query.\r
+ asserta(SIZE(SameVec) == QL);\r
+\r
+ // Pass 2a: leading positions, plain running count.\r
+ unsigned n = 0;\r
+ for (unsigned QPos = 0; QPos < d; ++QPos)\r
+ {\r
+ if (SameVec[QPos])\r
+ ++n;\r
+ IdVec.push_back(n);\r
+ }\r
+\r
+ // Pass 2b: sliding window; the position leaving the window is subtracted\r
+ // only after the current value has been stored.\r
+ for (unsigned QPos = d; QPos < QL; ++QPos)\r
+ {\r
+ if (SameVec[QPos])\r
+ ++n;\r
+ IdVec.push_back(n);\r
+ if (SameVec[QPos-d])\r
+ --n;\r
+ }\r
+ asserta(SIZE(IdVec) == QL);\r
+\r
+#if TRACE\r
+ {\r
+ // Debug dump of every match column with its smoothed value.\r
+ Log("\n");\r
+ Log("GetSmoothedIdVec\n");\r
+ unsigned QPos = 0;\r
+ unsigned PPos = 0;\r
+ Log("Q P Same Id\n");\r
+ Log("- - ---- -------\n");\r
+ for (unsigned Col = 0; Col < ColCount; ++Col)\r
+ {\r
+ char c = Path[Col];\r
+\r
+ bool Same = false;\r
+ if (c == 'M')\r
+ {\r
+ byte q = Q[QPos];\r
+ byte p = P[PPos];\r
+ Same = (toupper(q) == toupper(p));\r
+ Log("%c %c %4c %7d\n", q, p, tof(Same), IdVec[QPos]);\r
+ }\r
+\r
+ if (c == 'M' || c == 'D')\r
+ ++QPos;\r
+ if (c == 'M' || c == 'I')\r
+ ++PPos;\r
+ }\r
+ }\r
+#endif\r
+ }\r
+\r
+// Searches for a chimeric model of query QSD.\r
+//   1. Collect candidate parents (GetCandidateParents).\r
+//   2. Globally align the query to each candidate and compute its smoothed\r
+//      identity vector; track the per-position maximum over all candidates.\r
+//   3. Greedily pick up to opt_maxp "best" parents: in each round the parent\r
+//      that attains the current maximum at the most positions wins, and the\r
+//      positions it covers are then retired.\r
+//   4. Evaluate every pair of best parents with AlignChime; the highest\r
+//      scoring model is stored in Hit.\r
+// Returns true when at least one evaluated pair is accepted (Hit2.Accept()).\r
+//\r
+// The smoothed identity vector of each parent is computed once and cached;\r
+// it depends only on the query, the parent and its path, none of which\r
+// change between selection rounds.\r
+bool SearchChime(Ultra &U, const SeqData &QSD, float QAb,\r
+  const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
+  float MinFractId, ChimeHit2 &Hit)\r
+    {\r
+    Hit.Clear();\r
+    Hit.QLabel = QSD.Label;\r
+\r
+    if (opt_verbose)\r
+        {\r
+        Log("\n");\r
+        Log("SearchChime()\n");\r
+        Log("Query>%s\n", QSD.Label);\r
+        }\r
+\r
+    vector<unsigned> Parents;\r
+    GetCandidateParents(U, QSD, QAb, Parents);\r
+\r
+    unsigned ParentCount = SIZE(Parents);\r
+    if (ParentCount <= 1)\r
+        {\r
+        // A chimera needs at least two distinct parents.\r
+        if (opt_verbose)\r
+            Log("%u candidate parents, done.\n", ParentCount);\r
+        return false;\r
+        }\r
+\r
+    if (opt_fastalign)\r
+        HF.SetA(QSD);\r
+    // NOTE(review): ptrHF is null when opt_fastalign is off and is dereferenced\r
+    // in the GlobalAlign call below; confirm GlobalAlign never touches the\r
+    // finder in that mode.\r
+    HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);\r
+\r
+    // The chunk information is not used further in this function; the call is\r
+    // kept because GetChunkInfo is defined elsewhere.\r
+    unsigned ChunkLength;\r
+    vector<unsigned> ChunkLos;\r
+    GetChunkInfo(QSD.L, ChunkLength, ChunkLos);\r
+\r
+    // PSDs, Paths and IdVecs are parallel to Parents. An empty path marks a\r
+    // parent that was rejected (no alignment, or a self hit under opt_selfid).\r
+    vector<SeqData> PSDs;\r
+    vector<string> Paths;\r
+    vector<vector<unsigned> > IdVecs;\r
+    double TopPctId = 0.0;\r
+    unsigned QL = QSD.L;\r
+    vector<unsigned> MaxIdVec(QL, 0);\r
+    for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
+        {\r
+        unsigned ParentSeqIndex = Parents[ParentIndex];\r
+\r
+        SeqData PSD;\r
+        U.GetSeqData(ParentSeqIndex, PSD);\r
+        PSDs.push_back(PSD);\r
+\r
+        if (opt_fastalign)\r
+            HF.SetB(PSD);\r
+\r
+        PathData PD;\r
+\r
+        float HSPId;\r
+        bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);\r
+        if (!Found)\r
+            {\r
+            Paths.push_back("");\r
+            IdVecs.push_back(vector<unsigned>());\r
+            continue;\r
+            }\r
+\r
+        double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);\r
+        if (opt_selfid && PctId == 100.0)\r
+            {\r
+            Paths.push_back("");\r
+            IdVecs.push_back(vector<unsigned>());\r
+            continue;\r
+            }\r
+\r
+        if (PctId > TopPctId)\r
+            {\r
+            TopPctId = PctId;\r
+            if (TopPctId >= 100.0 - opt_mindiv)\r
+                {\r
+                // NOTE(review): this early exit only happens when opt_verbose is\r
+                // set, so the result of the search depends on the logging level.\r
+                // Left unchanged because moving the return would alter the\r
+                // default (non-verbose) results; confirm the intended behaviour.\r
+                if (opt_verbose)\r
+                    {\r
+                    Log(" %.1f%% >%s\n", TopPctId, PSD.Label);\r
+                    Log(" Top hit exceeds ctl threshold, done.\n");\r
+                    return false;\r
+                    }\r
+                }\r
+            }\r
+\r
+        string Path = PD.Start;\r
+        Paths.push_back(Path);\r
+\r
+        // Compute the smoothed identity vector once and keep it for step 3.\r
+        IdVecs.push_back(vector<unsigned>());\r
+        vector<unsigned> &IdVec = IdVecs.back();\r
+        GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
+\r
+        for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+            if (IdVec[QPos] > MaxIdVec[QPos])\r
+                MaxIdVec[QPos] = IdVec[QPos];\r
+        }\r
+\r
+    vector<unsigned> BestParents;\r
+    for (unsigned k = 0; k < opt_maxp; ++k)\r
+        {\r
+        unsigned BestParent = UINT_MAX;\r
+        unsigned BestCov = 0;\r
+        for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
+            {\r
+            if (Paths[ParentIndex] == "")\r
+                continue;\r
+\r
+            const vector<unsigned> &IdVec = IdVecs[ParentIndex];\r
+\r
+            // Coverage = number of positions where this parent attains the\r
+            // maximum that is still unclaimed.\r
+            unsigned Cov = 0;\r
+            for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+                if (IdVec[QPos] == MaxIdVec[QPos])\r
+                    ++Cov;\r
+\r
+            if (Cov > BestCov)\r
+                {\r
+                BestParent = ParentIndex;\r
+                BestCov = Cov;\r
+                }\r
+            }\r
+\r
+        if (BestParent == UINT_MAX)\r
+            break;\r
+\r
+        BestParents.push_back(BestParent);\r
+\r
+        // Retire the positions covered by the chosen parent so that later\r
+        // rounds reward parents that explain the remaining positions.\r
+        const vector<unsigned> &BestIdVec = IdVecs[BestParent];\r
+        for (unsigned QPos = 0; QPos < QL; ++QPos)\r
+            if (BestIdVec[QPos] == MaxIdVec[QPos])\r
+                MaxIdVec[QPos] = UINT_MAX;\r
+        }\r
+\r
+    unsigned BestParentCount = SIZE(BestParents);\r
+\r
+    if (opt_verbose)\r
+        {\r
+        Log("%u/%u best parents\n", BestParentCount, ParentCount);\r
+        for (unsigned k = 0; k < BestParentCount; ++k)\r
+            {\r
+            unsigned i = BestParents[k];\r
+            Log(" %s\n", PSDs[i].Label);\r
+            }\r
+        }\r
+\r
+    // Evaluate every unordered pair of best parents.\r
+    bool Found = false;\r
+    for (unsigned k1 = 0; k1 < BestParentCount; ++k1)\r
+        {\r
+        unsigned i1 = BestParents[k1];\r
+        asserta(i1 < ParentCount);\r
+\r
+        const SeqData &PSD1 = PSDs[i1];\r
+        const string &Path1 = Paths[i1];\r
+\r
+        for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)\r
+            {\r
+            unsigned i2 = BestParents[k2];\r
+            asserta(i2 < ParentCount);\r
+            asserta(i2 != i1);\r
+\r
+            const SeqData &PSD2 = PSDs[i2];\r
+            const string &Path2 = Paths[i2];\r
+\r
+            ChimeHit2 Hit2;\r
+            AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);\r
+            Hit2.PctIdQT = TopPctId;\r
+\r
+            if (Hit2.Accept())\r
+                Found = true;\r
+\r
+            // Keep the best-scoring model, accepted or not.\r
+            if (Hit2.Score > Hit.Score)\r
+                Hit = Hit2;\r
+\r
+            if (opt_verbose)\r
+                Hit2.LogMe();\r
+            }\r
+        }\r
+\r
+    return Found;\r
+    }\r
--- /dev/null
+#ifndef seq_h\r
+#define seq_h\r
+\r
+struct ORFData;\r
+\r
+// Lightweight, non-owning description of one sequence: pointers to its label\r
+// and letters plus length, index and orientation flags. The struct has no\r
+// destructor and never frees Label or Seq.\r
+struct SeqData\r
+ {\r
+ const char *Label; // sequence label\r
+ const byte *Seq; // sequence letters\r
+ unsigned L; // sequence length\r
+ unsigned Index; // index of the sequence in its source; UINT_MAX when unset\r
+\r
+// RevComp means that SeqData.Seq is reverse-complemented relative\r
+// to the sequence in the input file (query or db). Coordinates in\r
+// a hit (e.g., AlnData) will be relative to SeqData.Seq, so both\r
+// the sequence and the coordinates should be r.c.'d for output.\r
+ bool RevComp;\r
+ bool Nucleo; // true for a nucleotide sequence\r
+ const ORFData *ORFParent; // ORF this sequence belongs to, or 0\r
+\r
+ SeqData()\r
+ {\r
+ Clear();\r
+ }\r
+\r
+ // Resets every member to its "unset" value.\r
+ void Clear()\r
+ {\r
+ Label = 0;\r
+ Seq = 0;\r
+ L = 0;\r
+ Index = UINT_MAX;\r
+ RevComp = false;\r
+ Nucleo = false;\r
+ ORFParent = 0;\r
+ }\r
+ };\r
+\r
+#endif // seq_h\r
--- /dev/null
+#include "myutils.h"\r
+#include "seqdb.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+#include "sfasta.h"\r
+#include "seq.h"\r
+\r
+// Writes one sequence in FASTA format, wrapped at 80 letters per line.\r
+// The ">Label" header line is written only when Label is non-null.\r
+void SeqToFasta(FILE *f, const char *Label, const byte *Seq, unsigned L)\r
+    {\r
+    const unsigned ROWLEN = 80;\r
+    if (Label != 0)\r
+        fprintf(f, ">%s\n", Label);\r
+\r
+    for (unsigned From = 0; From < L; From += ROWLEN)\r
+        {\r
+        // Letters left from this row onwards; a full row unless this is the tail.\r
+        const unsigned Remaining = L - From;\r
+        const unsigned RowLength = (Remaining < ROWLEN) ? Remaining : ROWLEN;\r
+        fwrite(Seq + From, 1, RowLength, f);\r
+        fputc('\n', f);\r
+        }\r
+    }\r
+\r
+// Releases all sequences and labels through Clear().\r
+SeqDB::~SeqDB()\r
+ {\r
+ Clear();\r
+ }\r
+\r
+// Creates an empty database. Clear(true) initialises the members without\r
+// trying to free the (still uninitialised) pointers.\r
+SeqDB::SeqDB()\r
+ {\r
+ Clear(true);\r
+ }\r
+\r
+// Resets the database to the empty state.\r
+//   ctor == true  : only initialise the members (called from the constructor,\r
+//                   where the pointers do not yet hold valid values).\r
+//   ctor == false : first release every label, every sequence and the three\r
+//                   index arrays.\r
+void SeqDB::Clear(bool ctor)\r
+    {\r
+    if (!ctor)\r
+        {\r
+        for (unsigned i = 0; i < m_SeqCount; ++i)\r
+            {\r
+            // AddSeq() allocates strlen(Label) + 1 chars for each label, so\r
+            // the same count (including the terminator) is released here.\r
+            unsigned n = (unsigned) strlen(m_Labels[i]) + 1;\r
+            MYFREE(m_Labels[i], n, SeqDB);\r
+            MYFREE(m_Seqs[i], m_SeqLengths[i], SeqDB);\r
+            }\r
+        MYFREE(m_Labels, m_Size, SeqDB);\r
+        MYFREE(m_Seqs, m_Size, SeqDB);\r
+        MYFREE(m_SeqLengths, m_Size, SeqDB);\r
+        }\r
+\r
+    m_FileName.clear();\r
+    m_SeqCount = 0;\r
+    m_Size = 0;\r
+\r
+    m_Labels = 0;\r
+    m_Seqs = 0;\r
+    m_SeqLengths = 0;\r
+\r
+    m_Aligned = false;\r
+    m_IsNucleo = false;\r
+    m_IsNucleoSet = false;\r
+    }\r
+\r
+// Empties the database and fixes its alphabet explicitly, so IsNucleo() can\r
+// be called without SetIsNucleo() having sampled any sequence.\r
+void SeqDB::InitEmpty(bool Nucleo)\r
+ {\r
+ Clear();\r
+ m_IsNucleo = Nucleo;\r
+ m_IsNucleoSet = true;\r
+ }\r
+\r
+// Replaces the database contents with the sequences read from a FASTA file.\r
+//   FileName  : file to read (opened through SFasta).\r
+//   AllowGaps : forwarded to the reader's m_AllowGaps flag.\r
+// Progress is reported in tenths of a percent. After loading, the alphabet is\r
+// determined with SetIsNucleo().\r
+void SeqDB::FromFasta(const string &FileName, bool AllowGaps)\r
+ {\r
+ Clear();\r
+ m_FileName = FileName;\r
+ SFasta SF;\r
+\r
+ SF.Open(FileName);\r
+ SF.m_AllowGaps = AllowGaps;\r
+\r
+ ProgressStep(0, 1000, "Reading %s", FileName.c_str());\r
+ for (;;)\r
+ {\r
+ unsigned QueryPctDoneX10 = SF.GetPctDoneX10();\r
+ ProgressStep(QueryPctDoneX10, 1000, "Reading %s", FileName.c_str());\r
+ // A null sequence pointer ends the loop.\r
+ const byte *Seq = SF.GetNextSeq();\r
+ if (Seq == 0)\r
+ break;\r
+\r
+ const char *Label = SF.GetLabel();\r
+ unsigned L = SF.GetSeqLength();\r
+ // AddSeq copies both label and letters.\r
+ AddSeq(Label, Seq, L);\r
+ }\r
+ ProgressStep(999, 1000, "Reading %s", FileName.c_str());\r
+\r
+ SetIsNucleo();\r
+\r
+ Progress("%s sequences\n", IntToStr(GetSeqCount()));\r
+ }\r
+\r
+// Writes every sequence of the database to a newly created FASTA file.\r
+void SeqDB::ToFasta(const string &FileName) const\r
+    {\r
+    FILE *Out = CreateStdioFile(FileName);\r
+    unsigned SeqIndex = 0;\r
+    while (SeqIndex < GetSeqCount())\r
+        {\r
+        ToFasta(Out, SeqIndex);\r
+        ++SeqIndex;\r
+        }\r
+    CloseStdioFile(Out);\r
+    }\r
+\r
+// Writes sequence SeqIndex wrapped at 80 letters per line, preceded by its\r
+// ">label" line when WithLabel is true. The formatting is done by the\r
+// free function ::SeqToFasta, which skips the header for a null label.\r
+void SeqDB::SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel) const\r
+    {\r
+    const char *Label = WithLabel ? GetLabel(SeqIndex) : 0;\r
+    const unsigned L = GetSeqLength(SeqIndex);\r
+    const byte *Seq = GetSeq(SeqIndex);\r
+    ::SeqToFasta(f, Label, Seq, L);\r
+    }\r
+\r
+// Writes one FASTA record (header line plus wrapped sequence) to f.\r
+// The header is printed here, so SeqToFasta is called without a label.\r
+void SeqDB::ToFasta(FILE *f, unsigned SeqIndex) const\r
+ {\r
+ asserta(SeqIndex < m_SeqCount);\r
+ fprintf(f, ">%s\n", GetLabel(SeqIndex));\r
+ SeqToFasta(f, SeqIndex);\r
+ }\r
+\r
+// Length of the longest label in the database (0 when it is empty).\r
+unsigned SeqDB::GetMaxLabelLength() const\r
+    {\r
+    unsigned Longest = 0;\r
+    const unsigned Count = GetSeqCount();\r
+    for (unsigned i = 0; i != Count; ++i)\r
+        {\r
+        const unsigned Len = (unsigned) strlen(m_Labels[i]);\r
+        Longest = (Len > Longest) ? Len : Longest;\r
+        }\r
+    return Longest;\r
+    }\r
+\r
+// Length of the longest sequence in the database (0 when it is empty).\r
+unsigned SeqDB::GetMaxSeqLength() const\r
+    {\r
+    unsigned Longest = 0;\r
+    const unsigned Count = GetSeqCount();\r
+    for (unsigned i = 0; i != Count; ++i)\r
+        {\r
+        const unsigned Len = m_SeqLengths[i];\r
+        Longest = (Len > Longest) ? Len : Longest;\r
+        }\r
+    return Longest;\r
+    }\r
+\r
+// Dumps the database to the log: a summary line followed by one row per\r
+// sequence (index, label truncated to 16 characters, length, letters).\r
+void SeqDB::LogMe() const\r
+ {\r
+ Log("\n");\r
+ const unsigned SeqCount = GetSeqCount();\r
+ Log("SeqDB %u seqs, aligned=%c\n", SeqCount, tof(m_Aligned));\r
+ if (SeqCount == 0)\r
+ return;\r
+\r
+ Log("Index Label Length Seq\n");\r
+ Log("----- ---------------- ------ ---\n");\r
+ for (unsigned Index = 0; Index < SeqCount; ++Index)\r
+ {\r
+ Log("%5u", Index);\r
+ Log(" %16.16s", m_Labels[Index]);\r
+ unsigned L = m_SeqLengths[Index];\r
+ Log(" %6u", L);\r
+ // Width and precision are both L: sequences are stored without a\r
+ // terminating null (AddSeq copies exactly L bytes).\r
+ Log(" %*.*s", L, L, m_Seqs[Index]);\r
+ Log("\n");\r
+ }\r
+ }\r
+\r
+// Fills Buffer with a non-owning view of sequence Id. The pointers refer to\r
+// storage owned by this database. The alphabet flag comes from IsNucleo(),\r
+// which asserts that the alphabet has been set.\r
+void SeqDB::GetSeqData(unsigned Id, SeqData &Buffer) const\r
+    {\r
+    asserta(Id < m_SeqCount);\r
+\r
+    Buffer.Index = Id;\r
+    Buffer.Label = m_Labels[Id];\r
+    Buffer.Seq = m_Seqs[Id];\r
+    Buffer.L = m_SeqLengths[Id];\r
+\r
+    // Database sequences are in file orientation and have no parent ORF.\r
+    Buffer.RevComp = false;\r
+    Buffer.ORFParent = 0;\r
+    Buffer.Nucleo = IsNucleo();\r
+    }\r
+\r
+// Guesses the alphabet by sampling 100 random letters: the database is\r
+// treated as nucleotide when more than 80 of them are nucleotide characters\r
+// according to g_IsNucleoChar.\r
+// An empty database, or a sampled sequence of length zero, contributes no\r
+// nucleotide hits instead of evaluating rand() % 0 (division by zero).\r
+void SeqDB::SetIsNucleo()\r
+    {\r
+    const unsigned SeqCount = GetSeqCount();\r
+    unsigned N = 0;\r
+    if (SeqCount > 0)\r
+        {\r
+        for (unsigned i = 0; i < 100; ++i)\r
+            {\r
+            unsigned SeqIndex = unsigned(rand()%SeqCount);\r
+            unsigned L = GetSeqLength(SeqIndex);\r
+            if (L == 0)\r
+                continue;\r
+\r
+            const byte *Seq = GetSeq(SeqIndex);\r
+            const unsigned Pos = unsigned(rand()%L);\r
+            byte c = Seq[Pos];\r
+\r
+            if (g_IsNucleoChar[c])\r
+                ++N;\r
+            }\r
+        }\r
+    m_IsNucleo = (N > 80);\r
+    m_IsNucleoSet = true;\r
+    }\r
+\r
+// Sum of the lengths of all sequences (unsigned arithmetic).\r
+unsigned SeqDB::GetTotalLength() const\r
+    {\r
+    unsigned Sum = 0;\r
+    const unsigned Count = GetSeqCount();\r
+    unsigned Id = 0;\r
+    while (Id < Count)\r
+        {\r
+        Sum += GetSeqLength(Id);\r
+        ++Id;\r
+        }\r
+    return Sum;\r
+    }\r
+\r
+// Appends a sequence and returns its index.\r
+//   Label : null-terminated label; copied including the terminator.\r
+//   Seq   : L letters; exactly L bytes are copied (no terminator is added).\r
+// The index arrays grow to 1.5x their size plus 1024 entries when full.\r
+// m_Aligned stays true only while every sequence has the same length as the\r
+// first one.\r
+unsigned SeqDB::AddSeq(const char *Label, const byte *Seq, unsigned L)\r
+ {\r
+ StartTimer(AddSeq);\r
+ if (m_SeqCount >= m_Size)\r
+ {\r
+ // Grow the three parallel arrays and carry the existing entries over.\r
+ unsigned NewSize = unsigned(m_Size*1.5) + 1024;\r
+ char **NewLabels = MYALLOC(char *, NewSize, SeqDB);\r
+ byte **NewSeqs = MYALLOC(byte *, NewSize, SeqDB);\r
+ unsigned *NewSeqLengths = MYALLOC(unsigned, NewSize, SeqDB);\r
+\r
+ for (unsigned i = 0; i < m_SeqCount; ++i)\r
+ {\r
+ NewLabels[i] = m_Labels[i];\r
+ NewSeqs[i] = m_Seqs[i];\r
+ NewSeqLengths[i] = m_SeqLengths[i];\r
+ }\r
+\r
+ // NOTE(review): the old arrays were allocated with m_Size entries but are\r
+ // released with m_SeqCount; the two are presumably equal here -- confirm.\r
+ MYFREE(m_Labels, m_SeqCount, SeqDB);\r
+ MYFREE(m_Seqs, m_SeqCount, SeqDB);\r
+ MYFREE(m_SeqLengths, m_SeqCount, SeqDB);\r
+\r
+ m_Labels = NewLabels;\r
+ m_Seqs = NewSeqs;\r
+ m_SeqLengths = NewSeqLengths;\r
+ m_Size = NewSize;\r
+ }\r
+\r
+ unsigned Index = m_SeqCount++;\r
+ m_Seqs[Index] = MYALLOC(byte, L, SeqDB);\r
+ memcpy(m_Seqs[Index], Seq, L);\r
+\r
+ // n includes the terminating null.\r
+ unsigned n = strlen(Label) + 1;\r
+ m_Labels[Index] = MYALLOC(char, n, SeqDB);\r
+ memcpy(m_Labels[Index], Label, n);\r
+\r
+ if (Index == 0)\r
+ m_Aligned = true;\r
+ else\r
+ m_Aligned = (m_Aligned && L == m_SeqLengths[0]);\r
+\r
+ m_SeqLengths[Index] = L;\r
+\r
+ EndTimer(AddSeq);\r
+ return Index;\r
+ }\r
+\r
+// Returns the index of the first sequence whose label equals Label exactly\r
+// (linear search). A missing label is reported through Die(); UINT_MAX is\r
+// the nominal return value on that path.\r
+unsigned SeqDB::GetIndex(const char *Label) const\r
+    {\r
+    unsigned i = 0;\r
+    while (i < m_SeqCount)\r
+        {\r
+        if (strcmp(Label, m_Labels[i]) == 0)\r
+            return i;\r
+        ++i;\r
+        }\r
+    Die("SeqDB::GetIndex(%s), not found", Label);\r
+    return UINT_MAX;\r
+    }\r
+\r
+// Rebuilds LabelToIndex as a label -> sequence index map for the whole\r
+// database. A label that occurs twice is reported through Die().\r
+void SeqDB::MakeLabelToIndex(map<string, unsigned> &LabelToIndex)\r
+    {\r
+    LabelToIndex.clear();\r
+    for (unsigned i = 0; i < m_SeqCount; ++i)\r
+        {\r
+        const string Label(GetLabel(i));\r
+        pair<map<string, unsigned>::iterator, bool> Result =\r
+          LabelToIndex.insert(pair<const string, unsigned>(Label, i));\r
+        if (!Result.second)\r
+            {\r
+            Die("Duplicate label: %s", Label.c_str());\r
+            // Same end state as an overwriting assignment, should Die() return.\r
+            Result.first->second = i;\r
+            }\r
+        }\r
+    }\r
--- /dev/null
+#ifndef seqdb_h\r
+#define seqdb_h\r
+\r
+#include <vector>\r
+#include <map>\r
+\r
+struct SeqData;\r
+\r
+using namespace std;\r
+\r
+// In-memory sequence database: parallel arrays of labels, sequences and\r
+// lengths, all indexed by sequence index. The database owns copies of the\r
+// labels and letters (see AddSeq / Clear). Copy construction and assignment\r
+// are declared private, so a SeqDB cannot be copied.\r
+struct SeqDB\r
+ {\r
+private:\r
+ SeqDB(const SeqDB &rhs);\r
+ SeqDB &operator=(const SeqDB &rhs);\r
+\r
+public:\r
+ string m_FileName; // file the database was read from (set by FromFasta)\r
+ char **m_Labels; // label of each sequence\r
+ byte **m_Seqs; // letters of each sequence\r
+ unsigned *m_SeqLengths; // length of each sequence\r
+ unsigned m_SeqCount; // number of sequences stored\r
+ unsigned m_Size; // allocated capacity of the three arrays\r
+\r
+ bool m_Aligned; // true while all sequences have the same length\r
+ bool m_IsNucleo; // alphabet flag; valid only when m_IsNucleoSet\r
+ bool m_IsNucleoSet;\r
+\r
+public:\r
+ SeqDB();\r
+ ~SeqDB();\r
+ void Clear(bool ctor = false);\r
+ void InitEmpty(bool Nucleo);\r
+\r
+ unsigned AddSeq(const char *Label, const byte *Seq, unsigned L);\r
+\r
+ // Accessors; each asserts that SeqIndex is in range.\r
+ byte *GetSeq(unsigned SeqIndex) const\r
+ {\r
+ asserta(SeqIndex < m_SeqCount);\r
+ return m_Seqs[SeqIndex];\r
+ }\r
+\r
+ const char *GetLabel(unsigned SeqIndex) const\r
+ {\r
+ asserta(SeqIndex < m_SeqCount);\r
+ return m_Labels[SeqIndex];\r
+ }\r
+\r
+ unsigned GetSeqLength(unsigned SeqIndex) const\r
+ {\r
+ asserta(SeqIndex < m_SeqCount);\r
+ return m_SeqLengths[SeqIndex];\r
+ }\r
+\r
+ unsigned GetSeqCount() const\r
+ {\r
+ return m_SeqCount;\r
+ }\r
+\r
+ // Number of unordered pairs of distinct sequences, n*(n-1)/2.\r
+ unsigned GetPairCount() const\r
+ {\r
+ unsigned SeqCount = GetSeqCount();\r
+ return (SeqCount*(SeqCount - 1))/2;\r
+ }\r
+\r
+ // Index of an unordered pair in a triangular layout: with hi >= lo the\r
+ // result is hi*(hi-1)/2 + lo, independent of argument order.\r
+ unsigned GetPairIndex(unsigned SeqIndex1, unsigned SeqIndex2) const\r
+ {\r
+ if (SeqIndex1 > SeqIndex2)\r
+ return (SeqIndex1*(SeqIndex1 - 1))/2 + SeqIndex2;\r
+ return (SeqIndex2*(SeqIndex2 - 1))/2 + SeqIndex1;\r
+ }\r
+\r
+ // Common sequence length; dies unless the database is aligned and non-empty.\r
+ unsigned GetColCount() const\r
+ {\r
+ if (!m_Aligned)\r
+ Die("SeqDB::GetColCount, not aligned");\r
+ if (m_SeqCount == 0)\r
+ Die("SeqDB::GetColCount, empty");\r
+ return m_SeqLengths[0];\r
+ }\r
+\r
+ // Asserts that the alphabet has been set (SetIsNucleo or InitEmpty).\r
+ bool IsNucleo() const\r
+ {\r
+ asserta(m_IsNucleoSet);\r
+ return m_IsNucleo;\r
+ }\r
+\r
+ void GetSeqData(unsigned Id, SeqData &Buffer) const;\r
+\r
+ unsigned GetMaxLabelLength() const;\r
+ unsigned GetMaxSeqLength() const;\r
+ void SetIsNucleo();\r
+ unsigned GetIndex(const char *Label) const;\r
+ void MakeLabelToIndex(map<string, unsigned> &LabelToIndex);\r
+\r
+ void LogMe() const;\r
+ void FromFasta(const string &FileName, bool AllowGaps = false);\r
+\r
+ void ToFasta(const string &FileName) const;\r
+ void ToFasta(FILE *f, unsigned SeqIndex) const;\r
+ void SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel = false) const;\r
+\r
+ unsigned GetTotalLength() const;\r
+ };\r
+\r
+bool isgap(byte c);\r
+\r
+#endif\r
--- /dev/null
+#include "myutils.h"
+#include "mx.h"
+
+Mx<float> g_SubstMxf;
+float **g_SubstMx;
+
+static const char Alphabet[] = "ACGTU";
+
+// Fills the global 256x256 substitution matrix used for nucleotide scoring.\r
+//   Match     score for two identical letters; Die() unless > 0.\r
+//   Mismatch  score for two different letters; Die() unless < 0.\r
+// The work is done at most once: the Done flag is raised before the\r
+// arguments are checked, and every later call returns immediately,\r
+// whatever scores it passes.\r
+// All entries start at 0. Letters of Alphabet get Match/Mismatch in every\r
+// upper/lower case combination; 'N'/'n' against any letter of Alphabet is\r
+// written as 0.\r
+void SetNucSubstMx(double Match, double Mismatch)\r
+    {\r
+    static bool Done = false;\r
+    if (Done)\r
+        return;\r
+    Done = true;\r
+\r
+    if (Match <= 0.0)\r
+        Die("Match score should be +ve");\r
+    if (Mismatch >= 0.0)\r
+        Die("Mismatch score should be -ve");\r
+\r
+    const unsigned LetterCount = unsigned(strlen(Alphabet));\r
+\r
+    g_SubstMxf.Alloc("NUCMX", 256, 256);\r
+    strcpy(g_SubstMxf.m_Alpha, "ACGT");\r
+    g_SubstMxf.Init(0);\r
+    g_SubstMx = g_SubstMxf.GetData();\r
+\r
+    // Both (a, b) and (b, a) are visited and the score is symmetric, so\r
+    // writing the four case combinations of [a][b] covers the whole table.\r
+    for (unsigned a = 0; a < LetterCount; ++a)\r
+        {\r
+        const byte UpperA = (byte) toupper(Alphabet[a]);\r
+        const byte LowerA = (byte) tolower(UpperA);\r
+        for (unsigned b = 0; b < LetterCount; ++b)\r
+            {\r
+            const byte UpperB = (byte) toupper(Alphabet[b]);\r
+            const byte LowerB = (byte) tolower(UpperB);\r
+            const float Score = float(a == b ? Match : Mismatch);\r
+\r
+            g_SubstMx[UpperA][UpperB] = Score;\r
+            g_SubstMx[UpperA][LowerB] = Score;\r
+            g_SubstMx[LowerA][UpperB] = Score;\r
+            g_SubstMx[LowerA][LowerB] = Score;\r
+            }\r
+        }\r
+\r
+    // Wildcard N: zero score against every letter, in both index orders\r
+    // and all case combinations.\r
+    const byte UpperN = (byte) 'N';\r
+    const byte LowerN = (byte) 'n';\r
+    const float Neutral = 0.0f;\r
+    for (unsigned b = 0; b < LetterCount; ++b)\r
+        {\r
+        const byte UpperB = (byte) toupper(Alphabet[b]);\r
+        const byte LowerB = (byte) tolower(UpperB);\r
+\r
+        g_SubstMx[UpperN][UpperB] = Neutral;\r
+        g_SubstMx[UpperN][LowerB] = Neutral;\r
+        g_SubstMx[LowerN][UpperB] = Neutral;\r
+        g_SubstMx[LowerN][LowerB] = Neutral;\r
+\r
+        g_SubstMx[UpperB][UpperN] = Neutral;\r
+        g_SubstMx[UpperB][LowerN] = Neutral;\r
+        g_SubstMx[LowerB][UpperN] = Neutral;\r
+        g_SubstMx[LowerB][LowerN] = Neutral;\r
+        }\r
+    }\r
--- /dev/null
+#include "sfasta.h"\r
+#include "orf.h"\r
+#include "alpha.h"\r
+#include "timing.h"\r
+\r
+// True for the two characters treated as alignment gaps: '-' and '.'.\r
+static inline bool isgap(byte c)\r
+    {\r
+    switch (c)\r
+        {\r
+    case '-':\r
+    case '.':\r
+        return true;\r
+    default:\r
+        return false;\r
+        }\r
+    }\r
+\r
+const unsigned BufferSize = 16*1024*1024;\r
+\r
+// Returns the length of the longest homopolymer, i.e. the longest run of\r
+// identical consecutive bytes, in Seq[0..L-1].\r
+// Returns 0 for an empty sequence (L == 0); Seq is not read in that case.\r
+//\r
+// The earlier version measured a run only when it ended, as i - Start.\r
+// A run that reached the end of the sequence was therefore counted one\r
+// short (e.g. "AAAA" gave 3). It also compared a plain char against a byte,\r
+// which can never match for values above 127 where char is signed.\r
+static unsigned GetMaxPoly(const byte *Seq, unsigned L)\r
+    {\r
+    if (L == 0)\r
+        return 0;\r
+\r
+    unsigned MaxLen = 1;\r
+    unsigned RunLen = 1;\r
+    for (unsigned i = 1; i < L; ++i)\r
+        {\r
+        if (Seq[i] == Seq[i-1])\r
+            {\r
+            ++RunLen;\r
+            if (RunLen > MaxLen)\r
+                MaxLen = RunLen;\r
+            }\r
+        else\r
+            RunLen = 1;\r
+        }\r
+    return MaxLen;\r
+    }\r
+\r
+// Puts every member into the same idle state that Clear() establishes.\r
+// Nothing is released here because nothing is owned yet.\r
+// m_SeqIndex, m_AllowGaps and m_TooPolyCount were previously left\r
+// uninitialized; GetNextSeq() reads m_TooPolyCount at end of input, which\r
+// can happen on an object that was never opened.\r
+SFasta::SFasta()\r
+    {\r
+    m_FileName = "";\r
+    m_File = 0;\r
+    m_Buffer = 0;\r
+    m_BufferSize = 0;\r
+    m_BufferOffset = 0;\r
+    m_BufferBytes = 0;\r
+    m_FilePos = 0;\r
+    m_FileSize = 0;\r
+    m_Label = 0;\r
+    m_SeqLength = 0;\r
+    m_SeqIndex = UINT_MAX;\r
+    m_AllowGaps = false;\r
+    m_TooShortCount = 0;\r
+    m_TooLongCount = 0;\r
+    m_ShortestLength = 0;\r
+    m_LongestLength = 0;\r
+    m_TooPolyCount = 0;\r
+    m_IsNucleo = false;\r
+    m_IsNucleoSet = false;\r
+    }\r
+\r
+// Releases the cache buffer and closes the file, if any, via Clear().\r
+SFasta::~SFasta()\r
+ {\r
+ Clear();\r
+ }\r
+\r
+// Frees the cache buffer, closes the file if one is open, and returns all\r
+// members to their idle values. m_SeqIndex becomes UINT_MAX, which\r
+// GetNextSeqLo() treats as "no sequence read yet".\r
+void SFasta::Clear()\r
+    {\r
+    // Resources must be released before the members describing them are reset.\r
+    MYFREE(m_Buffer, m_BufferSize, SFasta);\r
+    if (m_File != 0)\r
+        CloseStdioFile(m_File);\r
+    m_Buffer = 0;\r
+    m_File = 0;\r
+\r
+    // File identity and position.\r
+    m_FileName.clear();\r
+    m_FileSize = 0;\r
+    m_FilePos = 0;\r
+\r
+    // Cache bookkeeping.\r
+    m_BufferSize = 0;\r
+    m_BufferBytes = 0;\r
+    m_BufferOffset = 0;\r
+\r
+    // Current record.\r
+    m_Label = 0;\r
+    m_SeqLength = 0;\r
+    m_SeqIndex = UINT_MAX;\r
+\r
+    // Options and alphabet detection.\r
+    m_AllowGaps = false;\r
+    m_IsNucleoSet = false;\r
+    m_IsNucleo = false;\r
+\r
+    // Statistics on discarded sequences.\r
+    m_TooShortCount = 0;\r
+    m_TooLongCount = 0;\r
+    m_TooPolyCount = 0;\r
+    m_ShortestLength = 0;\r
+    m_LongestLength = 0;\r
+    }\r
+\r
+// Writes the reader's file and cache state to the log, one field per line.\r
+// File size and position are cast to unsigned for printing.\r
+void SFasta::LogMe() const\r
+    {\r
+    const unsigned FileSize = (unsigned) m_FileSize;\r
+    const unsigned FilePos = (unsigned) m_FilePos;\r
+\r
+    Log("\n");\r
+    Log("SFasta::LogMe()\n");\r
+    Log("FileName=%s\n", m_FileName.c_str());\r
+    Log("FileSize=%u\n", FileSize);\r
+    Log("FilePos=%u\n", FilePos);\r
+    Log("BufferSize=%u\n", m_BufferSize);\r
+    Log("BufferPos=%u\n", m_BufferOffset);\r
+    Log("BufferBytes=%u\n", m_BufferBytes);\r
+    if (m_Label != 0)\r
+        Log("Label=%s\n", m_Label);\r
+    else\r
+        Log("Label=NULL\n");\r
+    Log("SeqLength=%u\n", m_SeqLength);\r
+    }\r
+\r
+// Returns the next sequence that passes the length filters, or 0 at end\r
+// of input.\r
+// Sequences shorter than opt_minlen, or longer than opt_maxlen when\r
+// opt_maxlen is non-zero, are skipped and counted. When the input is\r
+// exhausted, one warning is issued per non-zero discard counter.\r
+const byte *SFasta::GetNextSeq()\r
+    {\r
+    const byte *Seq = GetNextSeqLo();\r
+    while (Seq != 0)\r
+        {\r
+        const bool TooShort = (m_SeqLength < opt_minlen);\r
+        const bool TooLong = (opt_maxlen != 0 && m_SeqLength > opt_maxlen);\r
+\r
+        if (TooShort)\r
+            {\r
+            ++m_TooShortCount;\r
+            if (m_ShortestLength == 0 || m_SeqLength < m_ShortestLength)\r
+                m_ShortestLength = m_SeqLength;\r
+            }\r
+        else if (TooLong)\r
+            {\r
+            ++m_TooLongCount;\r
+            if (m_LongestLength == 0 || m_SeqLength > m_LongestLength)\r
+                m_LongestLength = m_SeqLength;\r
+            }\r
+        else\r
+            return Seq;\r
+\r
+        Seq = GetNextSeqLo();\r
+        }\r
+\r
+    // End of input: summarize what was discarded.\r
+    if (m_TooShortCount > 0)\r
+        Warning("%u short sequences (--minlen %u, shortest %u) discarded from %s",\r
+          m_TooShortCount, opt_minlen, m_ShortestLength, m_FileName.c_str());\r
+    if (m_TooLongCount > 0)\r
+        Warning("%u long sequences (--maxlen %u, longest %u) discarded from %s",\r
+          m_TooLongCount, opt_maxlen, m_LongestLength, m_FileName.c_str());\r
+    if (m_TooPolyCount > 0)\r
+        Warning("%u sequences with long homopolymers discarded (--maxpoly %u)",\r
+          m_TooPolyCount, opt_maxpoly);\r
+    return 0;\r
+    }\r
+\r
+// Parses the next FASTA record from the cache, refilling the cache from\r
+// the file when it has been consumed.\r
+// Returns a pointer into m_Buffer to the sequence letters, or 0 when both\r
+// cache and file are exhausted. On success:\r
+//   m_Label      points at the label text inside m_Buffer, null-terminated\r
+//                in place (at the first white space if opt_trunclabels is\r
+//                set, otherwise at the end of line with tabs turned into\r
+//                spaces);\r
+//   m_SeqLength  is the number of letters kept;\r
+//   m_SeqIndex   is 0 for the first record and is incremented afterwards.\r
+// The sequence is compacted in place: letters are kept, gaps are kept only\r
+// if m_AllowGaps, line ends are dropped silently, and any other byte is\r
+// dropped with a warning that is issued once per process.\r
+const byte *SFasta::GetNextSeqLo()\r
+    {\r
+// End of cache?\r
+    if (m_BufferOffset == m_BufferBytes)\r
+        {\r
+    // End of file?\r
+        if (m_FilePos == m_FileSize)\r
+            return 0;\r
+        FillCache();\r
+        }\r
+\r
+    StartTimer(SF_GetNextSeq);\r
+    asserta(m_Buffer[m_BufferOffset] == '>');\r
+    m_Label = (char *) (m_Buffer + m_BufferOffset + 1);\r
+\r
+// Scan to end-of-line of the label.\r
+// Only the first m_BufferBytes bytes of the buffer hold file data, so the\r
+// scan stops there. It used to run up to m_BufferSize, which let it read\r
+// bytes beyond the data when the last label line had no line terminator.\r
+    byte *ptr = 0;\r
+    for (unsigned i = m_BufferOffset; i < m_BufferBytes; ++i)\r
+        {\r
+        char c = m_Buffer[i];\r
+        if (c == '\n' || c == '\r')\r
+            {\r
+            ptr = m_Buffer + i;\r
+            break;\r
+            }\r
+        }\r
+    asserta(ptr != 0);\r
+\r
+// Terminate the label in place. Both loops stop no later than the line\r
+// end found above, because '\n' and '\r' satisfy isspace().\r
+    if (opt_trunclabels)\r
+        {\r
+        for (char *p = m_Label; *p; ++p)\r
+            if (isspace(*p))\r
+                {\r
+                *p = 0;\r
+                break;\r
+                }\r
+        }\r
+    else\r
+        {\r
+        for (char *p = m_Label; *p; ++p)\r
+            {\r
+            if (*p == '\t')\r
+                *p = ' ';\r
+            else if (*p == '\r' || *p == '\n')\r
+                {\r
+                *p = 0;\r
+                char NextChar = *(p+1);\r
+                if (NextChar == '\r' || NextChar == '\n')\r
+                    ++p;\r
+                break;\r
+                }\r
+            }\r
+        }\r
+\r
+// ptr points to end-of-line.\r
+// Move to start of sequence data.\r
+    byte *Seq = ++ptr;\r
+\r
+// Delete white space in-place\r
+    byte *To = ptr;\r
+    m_BufferOffset = (unsigned) (ptr - m_Buffer);\r
+    while (m_BufferOffset < m_BufferBytes)\r
+        {\r
+        byte c = m_Buffer[m_BufferOffset];\r
+        if (c == '>')\r
+            {\r
+        // A '>' starts the next record only at the beginning of a line.\r
+            char prevc = '\n';\r
+            if (m_BufferOffset > 0)\r
+                prevc = m_Buffer[m_BufferOffset-1];\r
+            if (prevc == '\n' || prevc == '\r')\r
+                break;\r
+            }\r
+        ++m_BufferOffset;\r
+        if (isalpha(c) || (isgap(c) && m_AllowGaps))\r
+            *To++ = c;\r
+        else if (c == '\n' || c == '\r')\r
+            continue;\r
+        else\r
+            {\r
+            const char *Label = (m_Label == 0 ? "" : m_Label);\r
+            static bool WarningDone = false;\r
+            if (!WarningDone)\r
+                {\r
+                if (isgap(c))\r
+                    Warning("Ignoring gaps in FASTA file '%s'",\r
+                        m_FileName.c_str());\r
+                else if (isprint(c))\r
+                    Warning("Invalid FASTA file '%s', non-letter '%c' in sequence >%s",\r
+                      m_FileName.c_str(), c, Label);\r
+                else\r
+                    Warning("Invalid FASTA file '%s', non-printing byte (hex %02x) in sequence >%s",\r
+                      m_FileName.c_str(), c, Label);\r
+                WarningDone = true;\r
+                }\r
+            continue;\r
+            }\r
+        }\r
+    m_SeqLength = unsigned(To - Seq);\r
+\r
+// UINT_MAX marks "no sequence read yet" (set by Clear()).\r
+    if (m_SeqIndex == UINT_MAX)\r
+        m_SeqIndex = 0;\r
+    else\r
+        ++m_SeqIndex;\r
+\r
+    EndTimer(SF_GetNextSeq);\r
+    return Seq;\r
+    }\r
+\r
+// Resets the reader, opens FileName, allocates the cache buffer of\r
+// BufferSize bytes and records the file size. No data is read yet.\r
+void SFasta::Open(const string &FileName)\r
+    {\r
+    Clear();\r
+\r
+    m_FileName = FileName;\r
+    m_File = OpenStdioFile(FileName);\r
+\r
+    m_BufferSize = BufferSize;\r
+    m_Buffer = MYALLOC(byte, m_BufferSize, SFasta);\r
+\r
+    m_FileSize = GetStdioFileSize(m_File);\r
+    }\r
+\r
+// Restarts reading from the beginning of the file: the cache is marked\r
+// empty and the next read position is set to 0. No other member is\r
+// changed, so m_SeqIndex and the discard counters keep their values.\r
+void SFasta::Rewind()\r
+    {\r
+    m_FilePos = 0;\r
+    m_BufferBytes = 0;\r
+    m_BufferOffset = 0;\r
+    }\r
+\r
+// Decides whether the file holds nucleotide sequences by sampling letters\r
+// from the start of the file, then rewinds.\r
+// Sequences are read until more than 256 letters have been seen or the\r
+// input ends. The file is nucleotide if more than 90% of the sampled\r
+// letters are flagged in g_IsNucleoChar; a sample with no letters counts\r
+// as nucleotide. Stores and returns the result.\r
+// Calls Die() if the read position is not at the start of the file.\r
+bool SFasta::SetIsNucleo()\r
+    {\r
+    if (m_FilePos != 0)\r
+        Die("SFasta::IsNucleo, not at BOF");\r
+\r
+    unsigned SampledLetters = 0;\r
+    unsigned NucleoLetters = 0;\r
+    const byte *Seq;\r
+    while ((Seq = GetNextSeq()) != 0)\r
+        {\r
+        const unsigned L = GetSeqLength();\r
+        for (unsigned Pos = 0; Pos != L; ++Pos)\r
+            {\r
+            if (g_IsNucleoChar[Seq[Pos]])\r
+                ++NucleoLetters;\r
+            }\r
+        SampledLetters += L;\r
+        if (SampledLetters > 256)\r
+            break;\r
+        }\r
+    Rewind();\r
+\r
+    if (SampledLetters == 0)\r
+        m_IsNucleo = true;\r
+    else\r
+        m_IsNucleo = double(NucleoLetters)/SampledLetters > 0.9;\r
+    m_IsNucleoSet = true;\r
+    return m_IsNucleo;\r
+    }\r
+\r
+// Loads the next chunk of the file into m_Buffer, starting at m_FilePos.\r
+// At most m_BufferSize bytes are read. If the chunk is not the last one in\r
+// the file, it is cut back to end just before the last '>' that begins a\r
+// line, so that a record is never split across two chunks; m_FilePos is\r
+// advanced by the bytes kept, and the cut-off part is read again next time.\r
+// Calls Die() if the chunk does not start with '>', or if a non-final\r
+// chunk contains no later line-initial '>'.\r
+void SFasta::FillCache()\r
+ {\r
+ StartTimer(SF_FillCache);\r
+ asserta(m_FilePos < m_FileSize);\r
+\r
+// off_t may be larger type than unsigned, e.g. 64- vs. 32-bit.\r
+ off_t otBytesToRead = m_FileSize - m_FilePos;\r
+\r
+// Clamp the read to the buffer size; if clamped, more data follows.\r
+ bool FinalBuffer = true;\r
+ if (otBytesToRead > (off_t) m_BufferSize)\r
+ {\r
+ FinalBuffer = false;\r
+ otBytesToRead = m_BufferSize;\r
+ }\r
+\r
+ unsigned BytesToRead = unsigned(otBytesToRead);\r
+ asserta(BytesToRead > 0);\r
+ asserta(BytesToRead <= m_BufferSize);\r
+\r
+ SetStdioFilePos(m_File, m_FilePos);\r
+ ReadStdioFile(m_File, m_Buffer, BytesToRead);\r
+// Every chunk must begin at the start of a record.\r
+ if (m_Buffer[0] != '>')\r
+ {\r
+ if (m_FilePos == 0)\r
+ Die("Input is not FASTA file");\r
+ else\r
+ Die("SFasta::FillCache() failed, expected '>'");\r
+ }\r
+\r
+ m_BufferOffset = 0;\r
+\r
+// If last buffer in file, done\r
+ if (FinalBuffer)\r
+ {\r
+ m_BufferBytes = BytesToRead;\r
+ m_FilePos += BytesToRead;\r
+ EndTimer(SF_FillCache);\r
+ return;\r
+ }\r
+\r
+// If not last buffer, truncate any partial sequence\r
+// at end of buffer. Search backwards to find last '>'.\r
+ byte *ptr = m_Buffer + BytesToRead - 1;\r
+ while (ptr > m_Buffer)\r
+ {\r
+ if (ptr[0] == '>' && (ptr[-1] == '\n' || ptr[-1] == '\r'))\r
+ break;\r
+ --ptr;\r
+ }\r
+\r
+// Reaching the start of the buffer means no second record start was found.\r
+ if (ptr == m_Buffer)\r
+ {\r
+ LogMe();\r
+ if (*ptr != '>')\r
+ {\r
+ // No '>' found.\r
+ // This might technically be legal FASTA if the entire\r
+ // buffer is white space, but strange if not the last buffer\r
+ // in the file, so quit anyway.\r
+ Die("Failed to find '>' (pos=%u, bytes=%u)",\r
+ (unsigned) m_FilePos, BytesToRead);\r
+ }\r
+ else\r
+ {\r
+ // Entire buffer is one sequence which may be truncated.\r
+ Die("Sequence too long (pos=%u, bytes=%u)",\r
+ (unsigned) m_FilePos, BytesToRead);\r
+ }\r
+ }\r
+\r
+ asserta(*ptr == '>');\r
+\r
+// Keep only the complete records; the next fill starts at the cut point.\r
+ m_BufferBytes = unsigned(ptr - m_Buffer);\r
+ m_FilePos += m_BufferBytes;\r
+\r
+ EndTimer(SF_FillCache);\r
+ }\r
+\r
+// Progress through the file in tenths of a percent.\r
+// Returns 0 before anything has been read or for an empty file. Otherwise\r
+// the value is kept away from both ends: 0 is reported as 1 and anything\r
+// from 999 upwards as 998.\r
+unsigned SFasta::GetPctDoneX10() const\r
+    {\r
+    if (m_FileSize == 0 || m_FilePos == 0)\r
+        return 0;\r
+\r
+    assert(m_FilePos >= (off_t) m_BufferBytes);\r
+    // m_FilePos is where the next fill starts, so the cached chunk begins\r
+    // m_BufferBytes earlier.\r
+    const off_t ChunkStart = m_FilePos - m_BufferBytes;\r
+    const off_t CurrentPos = ChunkStart + m_BufferOffset;\r
+\r
+    const unsigned Tenths = unsigned(10.0*double(CurrentPos)*100.0/double(m_FileSize));\r
+    if (Tenths >= 999)\r
+        return 998;\r
+    return Tenths == 0 ? 1 : Tenths;\r
+    }\r
+\r
+// Progress through the file as a percentage of the file size.\r
+// Returns 0 before anything has been read or for an empty file.\r
+double SFasta::GetPctDone() const\r
+    {\r
+    const bool NothingRead = (m_FilePos == 0);\r
+    const bool EmptyFile = (m_FileSize == 0);\r
+    if (NothingRead || EmptyFile)\r
+        return 0;\r
+\r
+    assert(m_FilePos >= (off_t) m_BufferBytes);\r
+    const off_t ChunkStart = m_FilePos - m_BufferBytes;\r
+    const off_t CurrentPos = ChunkStart + m_BufferOffset;\r
+\r
+    return double(CurrentPos)*100.0/double(m_FileSize);\r
+    }\r
+\r
+// Reads the next sequence into SD.\r
+// Returns false at end of input; in that case only SD.Seq has been\r
+// written (set to 0). Otherwise fills label, length, index and alphabet\r
+// from the reader, clears ORFParent and RevComp, and returns true.\r
+bool SFasta::GetNextSD(SeqData &SD)\r
+    {\r
+    if ((SD.Seq = GetNextSeq()) == 0)\r
+        return false;\r
+\r
+    // Taken from the reader's current record.\r
+    SD.Label = GetLabel();\r
+    SD.L = GetSeqLength();\r
+    SD.Index = GetSeqIndex();\r
+\r
+    // Fixed values for a sequence read straight from file.\r
+    SD.ORFParent = 0;\r
+    SD.Nucleo = GetIsNucleo();\r
+    SD.RevComp = false;\r
+\r
+    return true;\r
+    }\r
+\r
+#if TEST\r
+// Test driver, compiled only when TEST is non-zero: reads every sequence\r
+// from opt_input through SFasta, reports progress, and prints the number\r
+// of sequences and letters. With opt_verbose each record is also logged.\r
+void TestSFasta()\r
+ {\r
+ SFasta SF;\r
+ SF.Open(opt_input);\r
+\r
+ if (opt_verbose)\r
+ {\r
+ Log(" Index Length Label\n");\r
+ Log("------- ------- -----\n");\r
+ }\r
+\r
+ unsigned Index = 0;\r
+ unsigned SeqCount = 0;\r
+// Letters are summed in a double.\r
+ double LetterCount = 0.0;\r
+ ProgressStep(0, 1000, "Reading");\r
+ for (;;)\r
+ {\r
+ const byte *Seq = SF.GetNextSeq();\r
+ if (Seq == 0)\r
+ break;\r
+ ProgressStep(SF.GetPctDoneX10(), 1000, "Reading");\r
+ const char *Label = SF.GetLabel();\r
+ unsigned L = SF.GetSeqLength();\r
+ ++SeqCount;\r
+ LetterCount += L;\r
+\r
+ if (opt_verbose)\r
+ {\r
+ Log(">%7u %7u '%s'\n", Index, L, Label);\r
+ // Seq is not null-terminated, so its length is passed as width and precision.\r
+ Log("+%7.7s %7.7s \"%*.*s\"\n", "", "", L, L, Seq);\r
+ }\r
+\r
+ ++Index;\r
+ }\r
+ ProgressStep(999, 1000, "Reading");\r
+\r
+ Progress("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));\r
+ Log("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));\r
+ }\r
+#endif // TEST\r
--- /dev/null
+#ifndef sfasta_h\r
+#define sfasta_h\r
+\r
+#include "myutils.h"\r
+#include "seq.h"\r
+\r
+typedef void (*ON_START_XSEQ)(const SeqData &SD);\r
+typedef void (*ON_END_XSEQ)(const SeqData &SD);\r
+\r
+// Sequential reader for FASTA file format.\r
+// Serves sequences in file order to save memory.\r
+// Caches biggish chunks to compromise memory vs. speed.\r
+// FASTA reader that hands out one record at a time from a cached chunk of\r
+// the file. Pointers returned for label and sequence point into m_Buffer,\r
+// whose contents are rewritten by later reads.\r
+class SFasta\r
+ {\r
+public:\r
+ string m_FileName;\r
+ FILE *m_File;\r
+// When false, gap characters in sequence data are dropped by the reader.\r
+ bool m_AllowGaps;\r
+\r
+ off_t m_FileSize;\r
+\r
+// Position to start next read\r
+ off_t m_FilePos;\r
+\r
+// Cached data.\r
+ byte *m_Buffer;\r
+\r
+// Bytes allocated to m_Buffer\r
+ unsigned m_BufferSize;\r
+\r
+// Current position in buffer, normally points to '>'\r
+ unsigned m_BufferOffset;\r
+\r
+// File data in buffer <= m_BufferSize\r
+ unsigned m_BufferBytes;\r
+\r
+// Current label\r
+// Points into m_Buffer, not a separate buffer.\r
+ char *m_Label;\r
+\r
+// Current sequence length\r
+ unsigned m_SeqLength;\r
+\r
+// Current seq index\r
+// UINT_MAX until the first sequence has been read.\r
+ unsigned m_SeqIndex;\r
+\r
+// Statistics on sequences discarded by GetNextSeq(): shortest and longest\r
+// discarded length, and the number discarded per reason.\r
+ unsigned m_ShortestLength;\r
+ unsigned m_LongestLength;\r
+ unsigned m_TooShortCount;\r
+ unsigned m_TooLongCount;\r
+ unsigned m_TooPolyCount;\r
+\r
+private:\r
+// m_IsNucleo is valid only after SetIsNucleo() has set m_IsNucleoSet.\r
+ bool m_IsNucleoSet;\r
+ bool m_IsNucleo;\r
+\r
+public:\r
+ SFasta();\r
+ ~SFasta();\r
+\r
+// Release buffer and file, reset all state.\r
+ void Clear();\r
+// Clear(), then open FileName and allocate the cache.\r
+ void Open(const string &FileName);\r
+// Restart reading at the beginning of the file.\r
+ void Rewind();\r
+// Sample the start of the file to detect the alphabet, then rewind.\r
+ bool SetIsNucleo();\r
+ bool GetIsNucleo() const { asserta(m_IsNucleoSet); return m_IsNucleo; };\r
+\r
+// Get next sequence.\r
+// Returns zero on end-of-file\r
+ const byte *GetNextSeq();\r
+\r
+// Get next sequence as SeqData object, return false on end-of-file.\r
+ bool GetNextSD(SeqData &SD);\r
+\r
+// Length of most recent sequence returned by GetNextSeq().\r
+ unsigned GetSeqLength() const { return m_SeqLength; }\r
+\r
+// Label of most recent sequence returned by GetNextSeq().\r
+ const char *GetLabel() const { return m_Label; }\r
+\r
+// Index of most recent sequence returned by GetNextSeq().\r
+ unsigned GetSeqIndex() const { return m_SeqIndex; }\r
+\r
+// Progress through the file, in tenths of a percent and in percent.\r
+ unsigned GetPctDoneX10() const;\r
+ double GetPctDone() const;\r
+\r
+ void LogMe() const;\r
+\r
+private:\r
+// Load the next chunk of the file into m_Buffer.\r
+ void FillCache();\r
+// Parse the next record without applying the length filters.\r
+ const byte *GetNextSeqLo();\r
+ };\r
+\r
+#endif // sfasta_h\r
--- /dev/null
+"Path: .\n"
+"URL: file:///public/svn/usearch\n"
+"Repository Root: file:///public/svn/usearch\n"
+"Repository UUID: 58640331-1837-4c17-bc3e-636dc59aced1\n"
+"Revision: 34\n"
+"Node Kind: directory\n"
+"Schedule: normal\n"
+"Last Changed Author: bob\n"
+"Last Changed Rev: 34\n"
+"Last Changed Date: 2011-05-01 08:29:04 -0700 (Sun, 01 May 2011)\n"
+"\n"
+"? mk\n"
+"! svnmods.h\n"
+"M ungappedblastid.cpp\n"
+"M chaindisjointhits.cpp\n"
--- /dev/null
+T(MxBase_Alloc)\r
+T(MxBase_FreeData)\r
+T(MxBase_AllocData)\r
+T(SortSeqIndexes)\r
+T(Alloc_Vectors)\r
+T(MainLoop_NotNW)\r
+T(WriteOutput)\r
+T(NWB)\r
+T(ReadAllStdioFile)\r
+T(Windex_Init)\r
+T(Windex_SetSeqIndex)\r
+T(SeqToWords)\r
+T(SeqToWordsStep)\r
+T(SeqToShortWords)\r
+T(SeqToShortWordsA)\r
+T(SeqToShortWordsB)\r
+T(GetFractIdB)\r
+T(Windex_UniqueWordsAlloc)\r
+T(Windex_UniqueWords)\r
+T(GetPctId)\r
+T(Windex_Reset)\r
+T(GetSig)\r
+T(NWEditDist)\r
+T(EditDist_Myers)\r
+T(EditDist_BlockTarget)\r
+T(NWBand)\r
+T(WordCounting)\r
+T(NWAff)\r
+T(NWAffBand)\r
+T(NWSimple)\r
+T(NWSimpleB)\r
+T(BandWrap)\r
+T(IncIdCounts)\r
+T(GetBestDiagB)\r
+T(GetBestDiagB1)\r
+T(GetBestDiagB2)\r
+T(ClusterInit)\r
+T(ClusterPrep)\r
+T(HotSort1)\r
+T(HotSort2)\r
+T(SortA)\r
+T(SortB)\r
+T(CountSort)\r
+T(AddWords)\r
+T(ClusterWindex)\r
+T(MainInit)\r
+T(Output)\r
+T(WindexTail)\r
+T(WindexExit)\r
+T(Sort)\r
+T(U_AllocSeqLength)\r
+T(U_AllocSeedCount)\r
+T(U_AddSeed)\r
+T(AddSeq)\r
+T(U_SetWordCounts)\r
+T(U_SetWordCountsHash)\r
+T(U_SetWordScores)\r
+T(U_SetHotHits)\r
+T(U_SetHotHitsHash)\r
+T(U_SetHotHitsScores)\r
+T(U_Search)\r
+T(U_SearchExact)\r
+T(WF_SeqToWords)\r
+T(WF_SeqToWordsA)\r
+T(WF_SeqToWordsB)\r
+T(WF_AllocLA)\r
+T(WF_AllocLB)\r
+T(WF_AllocDiags)\r
+T(WF_SetA)\r
+T(WF_SetA_Nb)\r
+T(WF_SetAZero)\r
+T(WF_SetA2)\r
+T(WF_SetB)\r
+T(WF_GetCommonWordCount)\r
+T(WF_GetBestDiag)\r
+T(GetFractIdGivenPath)\r
+T(WX_GetUniqueWords)\r
+T(CompressPath)\r
+T(GetHSPs1)\r
+T(GetHSPs2)\r
+T(AlignHSPs)\r
+T(WF_ResolveHSPs)\r
+T(WX_SetExcludes)\r
+T(ViterbiFast)\r
+T(ViterbiFastBand)\r
+T(ViterbiFastBand0)\r
+T(ViterbiFastBand1)\r
+T(ViterbiFastBand2)\r
+T(ViterbiFastBand3)\r
+T(ViterbiFastBand4)\r
+T(TraceBackBit)\r
+T(TraceBackBitSW)\r
+T(SF_GetNextSeq)\r
+T(SF_FillCache)\r
+T(OnGlobalAccept)\r
+T(UngappedBlast)\r
+T(UngappedBlastId)\r
+T(UngappedBlast2Hit)\r
+T(LogHSPs)\r
+T(BlastOutput)\r
+T(BlastLeft)\r
+T(BlastRight)\r
+T(Blast1)\r
+T(Blast2)\r
+T(Blast3)\r
+T(Blast4)\r
+T(GetBestSeg)\r
+T(SWLinearDP)\r
+T(SWLinearTB)\r
+T(SWLinearDP2)\r
+T(SWLinearTB2)\r
+T(Chain)\r
+T(XlatSeq)\r
+T(XlatSeqToLetters)\r
+T(XDropFwdSimple)\r
+T(XDropFwdFast)\r
+T(XDropFwdFastTB)\r
+T(XDropBwd)\r
+T(SWSimple)\r
+T(PathAlloc)\r
+T(SubPath)\r
+T(SWUngapped)\r
+T(SWFast)\r
+T(SWFastNTB)\r
+T(SWAT_CacheQuery)\r
+T(SWAT_AlignTarget)\r
+T(SWAT_CacheQueryNW)\r
+T(SWAT_AlignTargetNW)\r
+T(SeqDB_FromFasta)\r
+T(LocalUngappedHitToAD)\r
+T(LocalGappedHitToAD)\r
+T(GlobalHitToAD)\r
+T(ResolveOverlaps)\r
+T(GetORFs)\r
+T(ChainCov_AddHit)\r
+T(ChainCov_EndQuery)\r
+T(ChainCov_DoTarget)\r
+T(BuildNb)\r
+T(MakeIntSubstMx)\r
+T(UngappedExtendLeft)\r
+T(UngappedExtendRight)\r
+T(AlignSP)\r
+T(AlignHSP)\r
+\r
+// Background\r
+T(Bg_SearchLoop)\r
+T(Bg_MainInit)\r
+T(Bg_MainTerm)\r
+T(Bg_Other)\r
+T(Bg_1)\r
+T(Bg_2)\r
+T(Bg_3)\r
+T(Bg_4)\r
+T(Bg_5)\r
+T(Bg_6)\r
+T(Bg_7)\r
+T(Bg_8)\r
+T(Bg_9)\r
+T(Bg_XFrame2)\r
+T(Bg_Usearch1)\r
+T(Bg_Usearch2)\r
+T(Bg_Usearch3)\r
+T(Bg_Usearch4)\r
+T(Bg_Hot)\r
+\r
+// For Timer2\r
+T(Search_2)\r
+T(Search_Loop_2)\r
+T(Search_InnerLoop_2)\r
+T(OnHit_2)\r
+T(UngappedBlast_2)\r
+T(MainInit_2)\r
+T(MainTerm_2)\r
--- /dev/null
+#define TIMING 0
+#ifndef timing_h
+#define timing_h
+
+#define BG_TIMING 0
+
+#if !TIMING
+#undef BG_TIMING
+#define BG_TIMING 0
+#endif
+
+#if UCHIMES
+#undef TIMING
+#define TIMING 0
+#endif
+
+#if TIMING
+
+enum TIMER
+ {
+ TIMER_None,
+#define T(x) TIMER_##x,
+#include "timers.h"
+#undef T
+ };
+
+const unsigned TimerCount =
+ 1 // TIMER_None
+#define T(x) +1
+#include "timers.h"
+#undef T
+ ;
+
+enum COUNTER
+ {
+#define C(x) COUNTER_##x,
+#include "counters.h"
+#undef C
+ };
+
+enum ALLOCER
+ {
+#define A(x) ALLOCER_##x,
+#include "allocs.h"
+#undef A
+ };
+
+const unsigned CounterCount =
+#define C(x) +1
+#include "counters.h"
+#undef C
+ ;
+
+const unsigned AllocerCount =
+#define A(x) +1
+#include "allocs.h"
+#undef A
+ ;
+
+#ifdef _MSC_VER
+
+typedef unsigned __int64 TICKS;
+
+#pragma warning(disable:4035)
+// MSVC variant: returns the CPU time-stamp counter.
+// The two emitted bytes 0x0f 0x31 are the encoding of the rdtsc
+// instruction, which leaves its 64-bit result in edx:eax. There is no
+// return statement, which is why warning 4035 is disabled above.
+inline TICKS GetClockTicks()
+ {
+ _asm
+ {
+ _emit 0x0f
+ _emit 0x31
+ }
+ }
+
+#else // ifdef _MSC_VER
+
+typedef uint64_t TICKS;
+// GCC-style variant: executes rdtsc and combines the two 32-bit halves
+// (low half from eax, high half from edx) into one 64-bit tick count.
+__inline__ uint64_t GetClockTicks()
+ {
+ uint32_t lo, hi;
+ /* We cannot use "=A", since this would use %rax on x86_64 */
+ __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+ return (uint64_t)hi << 32 | lo;
+ }
+
+#endif // ifdef _MSC_VER
+
+//void AddTicks(const string &Name, TICKS Ticks1, TICKS Ticks2);
+//void AddBytes(const string &Name, double Bytes);
+//#define SubBytes(Name, Bytes) AddBytes(Name, -double(Bytes))
+
+const char *TimerToStr(TIMER t);
+
+extern TICKS g_BeginTicks[TimerCount];
+extern double g_TotalTicks[TimerCount];
+extern double g_TotalCounts[TimerCount];
+extern double g_Counters[CounterCount];
+extern unsigned g_AllocNewCount[AllocerCount];
+extern unsigned g_AllocFreeCount[AllocerCount];
+extern double g_AllocNewBytes[AllocerCount];
+extern double g_AllocFreeBytes[AllocerCount];
+extern double g_AllocNetBytes[AllocerCount];
+extern double g_AllocPeakBytes[AllocerCount];
+extern bool g_Timer2[TimerCount];
+extern TIMER g_CurrTimer;
+#if BG_TIMING
+extern TIMER g_BackgroundTimer;
+#endif
+
+// Instrumented allocation macros (TIMING builds): sizes are charged to the
+// ALLOCER_<Name> counter. Arguments are parenthesized so that expressions
+// such as MYFREE(p, a+b, X) compute (a+b)*sizeof, not a+b*sizeof, and the
+// MYALLOC expansion is wrapped so it can be used inside larger expressions.
+#define MYALLOC(Type, N, Name)		((Type *) MyAlloc_((N)*sizeof(Type), ALLOCER_##Name, __FILE__, __LINE__))
+#define MYFREE(Array, N, Name)		MyFree_((Array), (N)*sizeof((Array)[0]), ALLOCER_##Name, __FILE__, __LINE__)
+
+// Allocates Bytes bytes with mymalloc() and charges them to allocator
+// counter a: allocation count, total bytes, net bytes and peak net bytes.
+// FileName and Line are accepted but not used.
+inline void *MyAlloc_(unsigned Bytes, unsigned a, const char *FileName, int Line)
+    {
+    ++g_AllocNewCount[a];
+    g_AllocNewBytes[a] += Bytes;
+
+    double &Net = g_AllocNetBytes[a];
+    double &Peak = g_AllocPeakBytes[a];
+    Net += Bytes;
+    if (Net > Peak)
+        Peak = Net;
+
+    return mymalloc(Bytes);
+    }
+
+// Releases p through myfree2() and records Bytes as freed against
+// allocator counter a. FileName and Line are accepted but not used.
+inline void MyFree_(void *p, unsigned Bytes, unsigned a, const char *FileName, int Line)
+    {
+    g_AllocNetBytes[a] -= Bytes;
+    g_AllocFreeBytes[a] += Bytes;
+    ++g_AllocFreeCount[a];
+    myfree2(p, Bytes);
+    }
+
+#if BG_TIMING
+// Switches the background timer to Timer. If the previous background
+// timer has a non-zero start tick, its elapsed ticks and one count are
+// added to its totals first. Timer then starts at the current tick.
+inline void SetBackgroundTimer_(TIMER Timer)
+    {
+    const TICKS Now = GetClockTicks();
+    const TIMER Previous = g_BackgroundTimer;
+    const TICKS PreviousBegin = g_BeginTicks[Previous];
+    if (PreviousBegin != 0)
+        {
+        ++g_TotalCounts[Previous];
+        g_TotalTicks[Previous] += double(Now - PreviousBegin);
+        }
+    g_BackgroundTimer = Timer;
+    g_BeginTicks[Timer] = Now;
+    }
+#else
+#define SetBackgroundTimer_(Timer) /* empty */
+#endif
+
+// Starts Timer as the single current foreground timer.
+// Calls Die() if another timer is already current (timers do not nest).
+// With BG_TIMING, time accrued by the background timer up to now is
+// added to its totals before Timer starts.
+inline void StartTimer_(TIMER Timer)
+    {
+    const TIMER Running = g_CurrTimer;
+    if (Running != TIMER_None)
+        Die("StartTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(Running));
+
+    const TICKS Now = GetClockTicks();
+#if BG_TIMING
+    const TICKS BackgroundBegin = g_BeginTicks[g_BackgroundTimer];
+    if (BackgroundBegin != 0)
+        {
+        ++g_TotalCounts[g_BackgroundTimer];
+        g_TotalTicks[g_BackgroundTimer] += double(Now - BackgroundBegin);
+        }
+#endif
+    g_BeginTicks[Timer] = Now;
+    g_CurrTimer = Timer;
+    }
+
+// Suspends the current timer: adds the ticks elapsed since it was started
+// to its total, without incrementing its count, and leaves no timer
+// current. Calls Die() if Timer is not the current timer.
+inline void PauseTimer_(TIMER Timer)
+    {
+    const TIMER Running = g_CurrTimer;
+    if (Timer != Running)
+        Die("PauseTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(Running));
+
+    const TICKS Now = GetClockTicks();
+    const TICKS Elapsed = Now - g_BeginTicks[Timer];
+    g_TotalTicks[Timer] += double(Elapsed);
+    g_BeginTicks[Timer] = Now;
+    g_CurrTimer = TIMER_None;
+    }
+
+// Stops the current timer: adds the elapsed ticks and one count to its
+// totals and leaves no timer current. Calls Die() if Timer is not the
+// current timer. With BG_TIMING the background timer is restarted at the
+// current tick before Timer's elapsed time is computed.
+inline void EndTimer_(TIMER Timer)
+    {
+    const TIMER Running = g_CurrTimer;
+    if (Timer != Running)
+        Die("EndTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(Running));
+
+    const TICKS Now = GetClockTicks();
+#if BG_TIMING
+    g_BeginTicks[g_BackgroundTimer] = Now;
+#endif
+    const TICKS Elapsed = Now - g_BeginTicks[Timer];
+    g_TotalTicks[Timer] += double(Elapsed);
+    ++g_TotalCounts[Timer];
+    g_CurrTimer = TIMER_None;
+    }
+
+// Starts a "Timer2" timer: flags Timer in g_Timer2 and records its start
+// tick. g_CurrTimer is neither checked nor changed.
+inline void StartTimer2_(TIMER Timer)
+    {
+    g_Timer2[Timer] = true;
+    const TICKS Now = GetClockTicks();
+    g_BeginTicks[Timer] = Now;
+    }
+
+// Stops a "Timer2" timer: adds the ticks since its recorded start and one
+// count to its totals. g_CurrTimer is neither checked nor changed.
+inline void EndTimer2_(TIMER Timer)
+    {
+    const TICKS Elapsed = GetClockTicks() - g_BeginTicks[Timer];
+    g_TotalTicks[Timer] += double(Elapsed);
+    ++g_TotalCounts[Timer];
+    }
+
+#define AddCounter(x, N) g_Counters[COUNTER_##x] += N
+#define IncCounter(x) ++(g_Counters[COUNTER_##x])
+#define StartTimer(x) StartTimer_(TIMER_##x)
+#define PauseTimer(x) PauseTimer_(TIMER_##x)
+#define EndTimer(x) EndTimer_(TIMER_##x)
+#define StartTimer2(x) StartTimer2_(TIMER_##x)
+#define EndTimer2(x) EndTimer2_(TIMER_##x)
+
+#if BG_TIMING
+#define SetBackgroundTimer(x) SetBackgroundTimer_(TIMER_##x)
+#else
+#define SetBackgroundTimer(x) /* empty */
+#endif
+
+#else // if TIMING
+
+#define AddCounter(x, N) /* empty */
+#define IncCounter(x) /* empty */
+#define StartTimer(x) /* empty */
+#define PauseTimer(x) /* empty */
+#define EndTimer(x) /* empty */
+#define StartTimer2(x) /* empty */
+#define PauseTimer2(x) /* empty */
+#define EndTimer2(x) /* empty */
+#define SetBackgroundTimer(x) /* empty */
+#define MYALLOC(Type, N, Name) myalloc(Type, N)
+#define MYFREE(Array, N, Name) myfree(Array)
+
+#endif // if TIMING
+
+void LogMemStats();
+void LogTickStats();
+void LogStats();
+void LogAllocs();
+
+#define AddBytes(x, n) /* empty */
+#define SubBytes(x, n) /* empty */
+
+#endif // ifndef timing_h
--- /dev/null
+#include "dp.h"
+
+#define TRACE 0
+
+Mx<byte> g_Mx_TBBit;
+byte **g_TBBit;
+float *g_DPRow1;
+float *g_DPRow2;
+static float *g_DPBuffer1;
+static float *g_DPBuffer2;
+
+static unsigned g_CacheLB;
+
+// Prepares the traceback matrix for an LA x LB alignment and makes sure
+// the two DP row buffers can hold a row for LB.
+// The matrix is (re)allocated as (LA+1) x (LB+1) on every call. The row
+// buffers are only replaced when LB exceeds the cached capacity; the new
+// capacity is LB + 128 and each buffer gets 3 extra floats. g_DPRow1 and
+// g_DPRow2 point one element into their buffers so that index -1 is valid.
+void AllocBit(unsigned LA, unsigned LB)
+    {
+    g_Mx_TBBit.Alloc("TBBit", LA+1, LB+1);
+    g_TBBit = g_Mx_TBBit.GetData();
+
+    // Row buffers already large enough: nothing more to do.
+    if (LB <= g_CacheLB)
+        return;
+
+    MYFREE(g_DPBuffer1, g_CacheLB, AllocBit);
+    MYFREE(g_DPBuffer2, g_CacheLB, AllocBit);
+
+    g_CacheLB = LB + 128;
+
+    // Allow use of [-1]
+    g_DPBuffer1 = MYALLOC(float, g_CacheLB+3, AllocBit);
+    g_DPRow1 = g_DPBuffer1 + 1;
+    g_DPBuffer2 = MYALLOC(float, g_CacheLB+3, AllocBit);
+    g_DPRow2 = g_DPBuffer2 + 1;
+    }
+
+// Reconstructs a global alignment path from the traceback bit matrix
+// g_TBBit.
+//   LA, LB  starting cell of the walk (row LA, column LB).
+//   State   state ('M', 'D' or 'I') of the last path position.
+//   PD      receives the path; it is allocated for LA+LB characters,
+//           written backwards from PD.Back (which holds the terminating
+//           zero), and PD.Start is set to the first character.
+// The walk continues until cell (0, 0) is reached. Calls Die() on any
+// other state character.
+void TraceBackBit(unsigned LA, unsigned LB, char State, PathData &PD)
+ {
+ PD.Alloc(LA+LB);
+
+ StartTimer(TraceBackBit);
+ char *PathPtr = PD.Back;
+ *PathPtr = 0;
+
+ byte **TB = g_TBBit;
+
+#if TRACE
+ Log("\n");
+ Log("TraceBackBit\n");
+#endif
+
+ size_t i = LA;
+ size_t j = LB;
+ for (;;)
+ {
+#if TRACE
+ Log("i=%3d j=%3d state=%c\n", (int) i, (int) j, State);
+#endif
+ if (i == 0 && j == 0)
+ break;
+
+// Emit the current state, then step to the predecessor cell and state.
+ --PathPtr;
+ *PathPtr = State;
+
+ byte t;
+ switch (State)
+ {
+// M consumes one position in both sequences; the bits of the diagonal
+// predecessor select the previous state, M being the default.
+ case 'M':
+ asserta(i > 0 && j > 0);
+ t = TB[i-1][j-1];
+ if (t & TRACEBITS_DM)
+ State = 'D';
+ else if (t & TRACEBITS_IM)
+ State = 'I';
+ else
+ State = 'M';
+ --i;
+ --j;
+ break;
+// D steps back one row; it stays in D unless the MD bit is set.
+ case 'D':
+ asserta(i > 0);
+ t = TB[i-1][j];
+ if (t & TRACEBITS_MD)
+ State = 'M';
+ else
+ State = 'D';
+ --i;
+ break;
+
+// I steps back one column; it stays in I unless the MI bit is set.
+ case 'I':
+ asserta(j > 0);
+ t = TB[i][j-1];
+ if (t & TRACEBITS_MI)
+ State = 'M';
+ else
+ State = 'I';
+ --j;
+ break;
+
+ default:
+ Die("TraceBackBit, invalid state %c", State);
+ }
+ }
+ PD.Start = PathPtr;
+ EndTimer(TraceBackBit);
+ }
+
+// Reconstructs a local alignment path from the traceback bit matrix
+// g_TBBit.
+//   LA, LB        used only to size the path buffer (LA+LB characters).
+//   Besti, Bestj  cell where the walk starts, in state 'M'.
+//   Leni, Lenj    receive Besti - i + 1 and Bestj - j + 1 for the cell
+//                 (i, j) at which the walk stops.
+//   PD            receives the path, written backwards from PD.Back;
+//                 PD.Start is set to the first character.
+// The only exit from the loop is in state M, when the predecessor cell
+// has the TRACEBITS_SM bit set and neither the DM nor the IM bit.
+// Calls Die() on an invalid state character.
+void TraceBackBitSW(unsigned LA, unsigned LB, unsigned Besti, unsigned Bestj,
+ unsigned &Leni, unsigned &Lenj, PathData &PD)
+ {
+ PD.Alloc(LA+LB);
+
+ StartTimer(TraceBackBitSW);
+ char *PathPtr = PD.Back;
+ *PathPtr = 0;
+
+ byte **TB = g_TBBit;
+
+#if TRACE
+ Log("\n");
+ Log("TraceBackBitSW\n");
+#endif
+
+ unsigned i = Besti;
+ unsigned j = Bestj;
+ char State = 'M';
+ for (;;)
+ {
+#if TRACE
+ Log("i=%3d j=%3d state=%c\n", (int) i, (int) j, State);
+#endif
+// Emit the current state, then step to the predecessor cell and state.
+ --PathPtr;
+ *PathPtr = State;
+
+ byte t;
+ switch (State)
+ {
+ case 'M':
+ asserta(i > 0 && j > 0);
+ t = TB[i-1][j-1];
+ if (t & TRACEBITS_DM)
+ State = 'D';
+ else if (t & TRACEBITS_IM)
+ State = 'I';
+ else if (t & TRACEBITS_SM)
+ {
+ // Walk ends here; i and j have not been decremented for this M.
+ Leni = Besti - i + 1;
+ Lenj = Bestj - j + 1;
+ PD.Start = PathPtr;
+ EndTimer(TraceBackBitSW);
+ return;
+ }
+ else
+ State = 'M';
+ --i;
+ --j;
+ break;
+// D steps back one row; it stays in D unless the MD bit is set.
+ case 'D':
+ asserta(i > 0);
+ t = TB[i-1][j];
+ if (t & TRACEBITS_MD)
+ State = 'M';
+ else
+ State = 'D';
+ --i;
+ break;
+
+// I steps back one column; it stays in I unless the MI bit is set.
+ case 'I':
+ asserta(j > 0);
+ t = TB[i][j-1];
+ if (t & TRACEBITS_MI)
+ State = 'M';
+ else
+ State = 'I';
+ --j;
+ break;
+
+ default:
+ Die("TraceBackBitSW, invalid state %c", State);
+ }
+ }
+ }
--- /dev/null
+#ifndef uc_h\r
+#define uc_h\r
+\r
+#include "seqdb.h"\r
+#include "seq.h"\r
+#include "path.h"\r
+\r
+struct AlnData;\r
+\r
+// Declaration of the UCFile class; all member functions are defined\r
+// elsewhere.\r
+// NOTE(review): the vectors below look like parallel per-record columns\r
+// (one entry per record, see GetRecordCount()), and the const char *\r
+// entries presumably point into m_Data -- confirm against the definitions.\r
+class UCFile\r
+ {\r
+public:\r
+ FILE *m_File;\r
+ byte *m_Data;\r
+ vector<char> m_RecTypes;\r
+ vector<float> m_PctIds;\r
+ vector<const char *> m_Labels;\r
+ vector<const char *> m_SeedLabels;\r
+ vector<unsigned> m_SeedIndexes;\r
+ vector<const char *> m_CompressedPaths;\r
+ vector<unsigned> m_SeqLengths;\r
+ vector<unsigned> m_SortOrder;\r
+ vector<char> m_Strands;\r
+ vector<unsigned> m_Los;\r
+ vector<unsigned> m_SeedLos;\r
+\r
+public:\r
+ UCFile();\r
+ void Clear(bool ctor = false);\r
+ void Close();\r
+ void FromFile(const string &FileName);\r
+ void FromClstr(const string &FileName);\r
+ void ToFile(const string &FileName);\r
+ unsigned GetRecordCount() const;\r
+ void LogMe() const;\r
+ void ToClstr(const string &FileName);\r
+ void ToFasta(const string &FileName, const SeqDB &Input, bool Reformat);\r
+ void Create(const string &FileName);\r
+ void Sort();\r
+ void Flush() const;\r
+\r
+// Record writers. Three overloads of WriteHit exist: from two SeqData\r
+// objects plus a path, from explicit field values, and from an AlnData.\r
+ void WriteNotMatched(unsigned L, const char *Label) const;\r
+ void WriteLibSeed(unsigned SeedIndex, unsigned L, const char *Label) const;\r
+ void WriteNewSeed(unsigned SeedIndex, unsigned L, const char *Label) const;\r
+ void WriteHit(const SeqData &SA, const SeqData &SB, double FractId,\r
+ const PathData &PD) const;\r
+ void WriteReject(const SeqData &SA, const SeqData &SB, double FractId,\r
+ const char *Path) const;\r
+ void WriteHit(unsigned SeedIndex, unsigned L, double PctId,\r
+ const char *CompressedPath, char Strand, unsigned Lo, unsigned SeedLo,\r
+ const char *Label, const char *SeedLabel) const;\r
+ void WriteHit(const AlnData &AD);\r
+ void WriteLibCluster(unsigned SeedIndex, unsigned Size, double AvgId,\r
+ const char *Label) const;\r
+ void WriteNewCluster(unsigned SeedIndex, unsigned Size, double AvgId,\r
+ const char *Label) const;\r
+ void WriteSeqX(FILE *f, const byte *Seq, unsigned L, const char *CompressedPath) const;\r
+ };\r
+\r
+#endif // uc_h\r
--- /dev/null
+#include "myutils.h"\r
+#include "chime.h"\r
+#include "seqdb.h"\r
+#include "dp.h"\r
+#include "ultra.h"\r
+#include "hspfinder.h"\r
+#include <algorithm>\r
+#include <set>\r
+\r
+bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
+ const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
+ float MinFractId, ChimeHit2 &Hit);\r
+\r
+FILE *g_fUChime;\r
+FILE *g_fUChimeAlns;\r
+const vector<float> *g_SortVecFloat;\r
+bool g_UchimeDeNovo = false;\r
+\r
+// Prints the program banner followed by the help text to stdout.\r
+// The help text is whatever help.h expands to, spliced in as the sole\r
+// argument of the final printf().\r
+// NOTE(review): that text is used as the format string, so presumably any\r
+// literal '%' in help.h has to be written as '%%' -- confirm.\r
+void Usage()\r
+ {\r
+ printf("\n");\r
+ printf("UCHIME %s by Robert C. Edgar\n", MY_VERSION);\r
+ printf("http://www.drive5.com/uchime\n");\r
+ printf("\n");\r
+ printf("This software is donated to the public domain\n");\r
+ printf("\n");\r
+\r
+ printf(\r
+#include "help.h"\r
+ );\r
+ }\r
+\r
+// Stub: not implemented in this program; calling it reports that via Die().\r
+void SetBLOSUM62()\r
+ {\r
+ Die("SetBLOSUM62 not implemented");\r
+ }\r
+\r
+// Stub: not implemented in this program; both arguments are ignored and\r
+// calling it reports that via Die().\r
+void ReadSubstMx(const string &/*FileName*/, Mx<float> &/*Mxf*/)\r
+ {\r
+ Die("ReadSubstMx not implemented");\r
+ }\r
+\r
+// Intentionally does nothing in this program.\r
+void LogAllocs()\r
+ {\r
+ /*empty*/\r
+ }\r
+\r
+// Ordering predicate for sort(): index i comes before index j when its\r
+// value in the vector pointed to by g_SortVecFloat is strictly greater.\r
+static bool CmpDescVecFloat(unsigned i, unsigned j)\r
+    {\r
+    const vector<float> &Values = *g_SortVecFloat;\r
+    return Values[j] < Values[i];\r
+    }\r
+\r
+// Replaces the contents of v with 0, 1, ..., N-1.\r
+void Range(vector<unsigned> &v, unsigned N)\r
+    {\r
+    v.clear();\r
+    v.reserve(N);\r
+    unsigned Next = 0;\r
+    while (Next != N)\r
+        {\r
+        v.push_back(Next);\r
+        ++Next;\r
+        }\r
+    }\r
+\r
+// Fills Order with the indexes of Values arranged so that the referenced\r
+// values are in descending order. Values itself is not modified.\r
+// The comparison goes through the global g_SortVecFloat, which is left\r
+// pointing at Values afterwards.\r
+void SortDescending(const vector<float> &Values, vector<unsigned> &Order)\r
+    {\r
+    StartTimer(Sort);\r
+    g_SortVecFloat = &Values;\r
+    const unsigned ValueCount = SIZE(Values);\r
+    Range(Order, ValueCount);\r
+    sort(Order.begin(), Order.end(), CmpDescVecFloat);\r
+    EndTimer(Sort);\r
+    }\r
+\r
+// Extract the abundance annotation from a sequence label.\r
+// The label is split on '/'; the first field beginning with "ab=" supplies\r
+// the value (parsed with atof). When no such field exists the result is 0,\r
+// except in de novo mode (g_UchimeDeNovo) where the omission is reported\r
+// through Die().\r
+float GetAbFromLabel(const string &Label)\r
+    {\r
+    vector<string> Fields;\r
+    Split(Label, Fields, '/');\r
+\r
+    for (vector<string>::const_iterator It = Fields.begin(); It != Fields.end(); ++It)\r
+        {\r
+        if (It->compare(0, 3, "ab=") != 0)\r
+            continue;\r
+        const string Value = It->substr(3, string::npos);\r
+        return (float) atof(Value.c_str());\r
+        }\r
+\r
+    if (g_UchimeDeNovo)\r
+        Die("Missing abundance /ab=xx/ in label >%s", Label.c_str());\r
+    return 0.0f;\r
+    }\r
+\r
+// Program entry point for the stand-alone UCHIME tool.\r
+//\r
+// Parses the command line, loads the query sequences (--input, or --uchime\r
+// as a fallback) and runs SearchChime on each of them, writing one record\r
+// per query with WriteChimeHit.\r
+//\r
+// Two modes, selected by whether --db was given:\r
+//   reference mode  queries are searched against the sequences read from --db;\r
+//   de novo mode    (no --db) queries are processed in order of decreasing\r
+//                   abundance taken from their labels, and every query that\r
+//                   is not reported as a hit is added to the database used\r
+//                   for the queries that follow.\r
+//\r
+// Always returns 0; fatal conditions are reported through Die().\r
+int main(int argc, char *argv[])\r
+ {\r
+ \r
+ MyCmdLine(argc, argv);\r
+\r
+ if (argc < 2)\r
+ {\r
+ Usage();\r
+ return 0;\r
+ }\r
+\r
+ if (opt_version)\r
+ {\r
+ printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION);\r
+ return 0;\r
+ }\r
+\r
+ printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION);\r
+ printf("by Robert C. Edgar\n");\r
+ printf("http://drive5.com/uchime\n");\r
+ printf("This code is donated to the public domain.\n");\r
+ printf("\n");\r
+// Defaults applied only when the option was not given on the command line.\r
+ if (!optset_w)\r
+ opt_w = 8;\r
+ \r
+ float MinFractId = 0.95f;\r
+ if (optset_id)\r
+ MinFractId = (float) opt_id;\r
+\r
+// Record the effective scoring parameters in the log.\r
+ Log("%8.2f minh\n", opt_minh);\r
+ Log("%8.2f xn\n", opt_xn);\r
+ Log("%8.2f dn\n", opt_dn);\r
+ Log("%8.2f xa\n", opt_xa);\r
+ Log("%8.2f mindiv\n", opt_mindiv);\r
+ Log("%8u maxp\n", opt_maxp);\r
+\r
+// --uchime is accepted as an alternative spelling of --input.\r
+ if (opt_input == "" && opt_uchime != "")\r
+ opt_input = opt_uchime;\r
+\r
+ if (opt_input == "")\r
+ Die("Missing --input");\r
+\r
+// No reference database means de novo mode.\r
+ g_UchimeDeNovo = (opt_db == "");\r
+\r
+ if (opt_uchimeout != "")\r
+ g_fUChime = CreateStdioFile(opt_uchimeout);\r
+\r
+ if (opt_uchimealns != "")\r
+ g_fUChimeAlns = CreateStdioFile(opt_uchimealns);\r
+\r
+ SeqDB Input;\r
+ SeqDB DB;\r
+\r
+ Input.FromFasta(opt_input);\r
+ if (!Input.IsNucleo())\r
+ Die("Input contains amino acid sequences");\r
+\r
+// Default processing order is input order; de novo mode replaces it below.\r
+ const unsigned QuerySeqCount = Input.GetSeqCount();\r
+ vector<unsigned> Order;\r
+ for (unsigned i = 0; i < QuerySeqCount; ++i)\r
+ Order.push_back(i);\r
+\r
+ if (g_UchimeDeNovo)\r
+ {\r
+ // Process the most abundant sequences first.\r
+ vector<float> Abs;\r
+ for (unsigned i = 0; i < QuerySeqCount; ++i)\r
+ {\r
+ const char *Label = Input.GetLabel(i);\r
+ float Ab = GetAbFromLabel(Label);\r
+ Abs.push_back(Ab);\r
+ }\r
+ SortDescending(Abs, Order);\r
+ // DB starts empty, so its alphabet is set by hand instead of being detected.\r
+ DB.m_IsNucleoSet = true;\r
+ DB.m_IsNucleo = true;\r
+ }\r
+ else\r
+ {\r
+ DB.FromFasta(opt_db);\r
+ if (!DB.IsNucleo())\r
+ Die("Database contains amino acid sequences");\r
+ }\r
+\r
+ vector<ChimeHit2> Hits;\r
+ unsigned HitCount = 0;\r
+ for (unsigned i = 0; i < QuerySeqCount; ++i)\r
+ {\r
+ unsigned QuerySeqIndex = Order[i];\r
+\r
+ SeqData QSD;\r
+ Input.GetSeqData(QuerySeqIndex, QSD);\r
+\r
+ // Abundance is only meaningful in de novo mode; -1 otherwise.\r
+ float QAb = -1.0;\r
+ if (g_UchimeDeNovo)\r
+ QAb = GetAbFromLabel(QSD.Label);\r
+\r
+ ChimeHit2 Hit;\r
+ // NOTE(review): these bind references to a null pointer, which is\r
+ // undefined behavior. Presumably SearchChime never touches AP, AH or HF\r
+ // in this build - confirm, and pass real objects if it does.\r
+ AlnParams &AP = *(AlnParams *) 0;\r
+ AlnHeuristics &AH = *(AlnHeuristics *) 0;\r
+ HSPFinder &HF = *(HSPFinder *) 0;\r
+ bool Found = SearchChime(DB, QSD, QAb, AP, AH, HF, MinFractId, Hit);\r
+ if (Found)\r
+ ++HitCount;\r
+ else\r
+ {\r
+ // De novo: a query not reported as a hit joins the database.\r
+ if (g_UchimeDeNovo)\r
+ DB.AddSeq(QSD.Label, QSD.Seq, QSD.L);\r
+ }\r
+\r
+ WriteChimeHit(g_fUChime, Hit);\r
+\r
+ // i+1 queries have been processed at this point; use the same count for\r
+ // the displayed denominator and for the percentage.\r
+ ProgressStep(i, QuerySeqCount, "%u/%u chimeras found (%.1f%%)", HitCount, i+1, Pct(HitCount, i+1));\r
+ }\r
+\r
+ Log("\n");\r
+ Log("%s: %u/%u chimeras found (%.1f%%)\n",\r
+ opt_input.c_str(), HitCount, QuerySeqCount, Pct(HitCount, QuerySeqCount));\r
+\r
+ CloseStdioFile(g_fUChime);\r
+ CloseStdioFile(g_fUChimeAlns);\r
+\r
+ ProgressExit();\r
+ return 0;\r
+ }\r
--- /dev/null
+// Compatibility shim: in this build the name Ultra is a textual alias for
+// SeqDB and GetSeedLabel for GetLabel, so code written against the Ultra
+// names compiles against seqdb.h unchanged.
+#ifndef ultra_h
+#define ultra_h
+
+#include "seqdb.h"
+#define Ultra SeqDB
+#define GetSeedLabel GetLabel
+
+#endif // ultra_h
--- /dev/null
+#if UCHIMES\r
+\r
+#include "myutils.h"\r
+#include "seqdb.h"\r
+#include "seq.h"\r
+#include "alpha.h"\r
+\r
+void SortDescending(const vector<float> &Values, vector<unsigned> &Order);\r
+\r
+static byte *g_QueryHasWord;\r
+static unsigned g_WordCount;\r
+\r
+// Encode the opt_w characters starting at Seq as a base-4 number, first\r
+// character most significant, using g_CharToLetterNucleo to map each\r
+// character to its digit.\r
+// NOTE(review): assumes every mapped value is in 0..3; what the table holds\r
+// for characters outside that alphabet is not visible here - confirm.\r
+unsigned GetWord(const byte *Seq)\r
+    {\r
+    unsigned Word = 0;\r
+    for (unsigned Pos = 0; Pos < opt_w; ++Pos)\r
+        {\r
+        unsigned Letter = g_CharToLetterNucleo[Seq[Pos]];\r
+        Word = 4*Word + Letter;\r
+        }\r
+    return Word;\r
+    }\r
+\r
+// Rebuild the word-presence table g_QueryHasWord for a new query.\r
+// The table has one byte per possible word and is allocated on first use,\r
+// sized from opt_w at that time. After the call, entry w is 1 when word w\r
+// occurs somewhere in Query and 0 otherwise. A query of length <= opt_w\r
+// leaves the table all zero.\r
+static void SetQuery(const SeqData &Query)\r
+    {\r
+    if (g_QueryHasWord == 0)\r
+        {\r
+        unsigned TableSize = 4;\r
+        for (unsigned k = 1; k < opt_w; ++k)\r
+            TableSize *= 4;\r
+        g_WordCount = TableSize;\r
+\r
+        g_QueryHasWord = myalloc(byte, g_WordCount);\r
+        }\r
+\r
+    memset(g_QueryHasWord, 0, g_WordCount);\r
+\r
+    if (Query.L > opt_w)\r
+        {\r
+        const unsigned WordsInQuery = Query.L - opt_w + 1;\r
+        const byte *Cursor = Query.Seq;\r
+        for (unsigned Pos = 0; Pos < WordsInQuery; ++Pos, ++Cursor)\r
+            g_QueryHasWord[GetWord(Cursor)] = 1;\r
+        }\r
+    }\r
+\r
+// Count the word positions in Target whose word is flagged in\r
+// g_QueryHasWord (i.e. also occurs in the query last passed to SetQuery).\r
+// Every position is counted, so a word repeated in Target contributes once\r
+// per occurrence. Targets of length <= opt_w score 0.\r
+static unsigned GetUniqueWordsInCommon(const SeqData &Target)\r
+    {\r
+    if (Target.L <= opt_w)\r
+        return 0;\r
+\r
+    const unsigned WordsInTarget = Target.L - opt_w + 1;\r
+    const byte *Cursor = Target.Seq;\r
+    unsigned Shared = 0;\r
+    for (unsigned Pos = 0; Pos < WordsInTarget; ++Pos, ++Cursor)\r
+        {\r
+        if (g_QueryHasWord[GetWord(Cursor)])\r
+            ++Shared;\r
+        }\r
+    return Shared;\r
+    }\r
+\r
+// Rank the sequences of DB by the number of words they share with Query.\r
+//   WordCounts  out: for each DB sequence (by index), its shared-word count\r
+//   Order       out: DB sequence indexes arranged by decreasing count\r
+// Both output vectors are cleared first.\r
+void USort(const SeqData &Query, const SeqDB &DB, vector<float> &WordCounts, \r
+  vector<unsigned> &Order)\r
+    {\r
+    WordCounts.clear();\r
+    Order.clear();\r
+\r
+    SetQuery(Query);\r
+\r
+    const unsigned TargetCount = DB.GetSeqCount();\r
+    unsigned TargetIndex = 0;\r
+    while (TargetIndex < TargetCount)\r
+        {\r
+        SeqData Target;\r
+        DB.GetSeqData(TargetIndex, Target);\r
+        const unsigned Shared = GetUniqueWordsInCommon(Target);\r
+        WordCounts.push_back((float) Shared);\r
+        ++TargetIndex;\r
+        }\r
+    SortDescending(WordCounts, Order);\r
+    }\r
+\r
+#endif // UCHIMES\r
--- /dev/null
+#include "dp.h"
+#include "out.h"
+#include "evalue.h"
+
+#define CMP_SIMPLE 0
+\r
+#if SAVE_FAST
+static Mx<float> g_MxDPM;
+static Mx<float> g_MxDPD;
+static Mx<float> g_MxDPI;
+
+static Mx<char> g_MxTBM;
+static Mx<char> g_MxTBD;
+static Mx<char> g_MxTBI;
+
+static float **g_DPM;
+static float **g_DPD;
+static float **g_DPI;
+
+static char **g_TBM;
+static char **g_TBD;
+static char **g_TBI;
+
+#if CMP_SIMPLE
+static Mx<float> *g_DPMSimpleMx;
+static Mx<float> *g_DPDSimpleMx;
+static Mx<float> *g_DPISimpleMx;
+static float **g_DPMSimple;
+static float **g_DPDSimple;
+static float **g_DPISimple;
+
+// Debug cross-checks, compiled only when CMP_SIMPLE is non-zero: Die() when
+// the score x differs (per feq) from cell [i][j] of the matching matrix
+// obtained through GetSimpleDPMxs. One macro per DP state: cmpm checks the
+// M matrix, cmpd the D matrix and cmpi the I matrix; each message names the
+// matrix that was actually compared.
+#define cmpm(i, j, x) { if (!feq(x, g_DPMSimple[i][j])) \
+ { \
+ Die("%s:%d %.1f != DPMSimple[%u][%u] = %.1f", \
+ __FILE__, __LINE__, x, i, j, g_DPMSimple[i][j]); \
+ } \
+ }
+
+#define cmpd(i, j, x) { if (!feq(x, g_DPDSimple[i][j])) \
+ { \
+ Die("%s:%d %.1f != DPDSimple[%u][%u] = %.1f", \
+ __FILE__, __LINE__, x, i, j, g_DPDSimple[i][j]); \
+ } \
+ }
+
+#define cmpi(i, j, x) { if (!feq(x, g_DPISimple[i][j])) \
+ { \
+ Die("%s:%d %.1f != DPISimple[%u][%u] = %.1f", \
+ __FILE__, __LINE__, x, i, j, g_DPISimple[i][j]); \
+ } \
+ }
+
+#else
+
+#define cmpm(i, j, x) /* empty */
+#define cmpd(i, j, x) /* empty */
+#define cmpi(i, j, x) /* empty */
+
+#endif
+
+// Prepare the SAVE_FAST debug matrices for an LA x LB problem: the three
+// score matrices and three trace-back matrices are each sized
+// (LA+1) x (LB+1), and the raw row pointers used by the SAVE_* helpers are
+// refreshed. With CMP_SIMPLE enabled, the reference matrices are fetched
+// from GetSimpleDPMxs first.
+static void AllocSave(unsigned LA, unsigned LB)
+    {
+#if CMP_SIMPLE
+    GetSimpleDPMxs(&g_DPMSimpleMx, &g_DPDSimpleMx, &g_DPISimpleMx);
+    g_DPMSimple = g_DPMSimpleMx->GetData();
+    g_DPDSimple = g_DPDSimpleMx->GetData();
+    g_DPISimple = g_DPISimpleMx->GetData();
+#endif
+    const unsigned RowCount = LA+1;
+    const unsigned ColCount = LB+1;
+
+    // Score matrices
+    g_MxDPM.Alloc("FastM", RowCount, ColCount);
+    g_MxDPD.Alloc("FastD", RowCount, ColCount);
+    g_MxDPI.Alloc("FastI", RowCount, ColCount);
+
+    // Trace-back matrices
+    g_MxTBM.Alloc("FastTBM", RowCount, ColCount);
+    g_MxTBD.Alloc("FastTBD", RowCount, ColCount);
+    g_MxTBI.Alloc("FastTBI", RowCount, ColCount);
+
+    // Cache the row-pointer views
+    g_DPM = g_MxDPM.GetData();
+    g_DPD = g_MxDPD.GetData();
+    g_DPI = g_MxDPI.GetData();
+
+    g_TBM = g_MxTBM.GetData();
+    g_TBD = g_MxTBD.GetData();
+    g_TBI = g_MxTBI.GetData();
+    }
+
+// Store x as cell [i][j] of the saved M score matrix. With CMP_SIMPLE
+// enabled, interior cells (i > 0 and j > 0) are also asserted equal to the
+// reference matrix.
+static void SAVE_DPM(unsigned i, unsigned j, float x)
+    {
+    float *Row = g_DPM[i];
+    Row[j] = x;
+#if CMP_SIMPLE
+    if (i == 0 || j == 0)
+        return;
+    asserta(feq(x, g_DPMSimple[i][j]));
+#endif
+    }
+
+// Store x as cell [i][j] of the saved D score matrix. With CMP_SIMPLE
+// enabled, interior cells (i > 0 and j > 0) are also asserted equal to the
+// reference matrix.
+static void SAVE_DPD(unsigned i, unsigned j, float x)
+    {
+    float *Row = g_DPD[i];
+    Row[j] = x;
+#if CMP_SIMPLE
+    if (i == 0 || j == 0)
+        return;
+    asserta(feq(x, g_DPDSimple[i][j]));
+#endif
+    }
+
+// Store x as cell [i][j] of the saved I score matrix. With CMP_SIMPLE
+// enabled, interior cells (i > 0 and j > 0) are also asserted equal to the
+// reference matrix.
+static void SAVE_DPI(unsigned i, unsigned j, float x)
+    {
+    float *Row = g_DPI[i];
+    Row[j] = x;
+#if CMP_SIMPLE
+    if (i == 0 || j == 0)
+        return;
+    asserta(feq(x, g_DPISimple[i][j]));
+#endif
+    }
+
+// Store trace-back character x as cell [i][j] of the saved M trace-back matrix.
+static void SAVE_TBM(unsigned i, unsigned j, char x)
+    {
+    char *Row = g_TBM[i];
+    Row[j] = x;
+    }
+
+// Store trace-back character x as cell [i][j] of the saved D trace-back matrix.
+static void SAVE_TBD(unsigned i, unsigned j, char x)
+    {
+    char *Row = g_TBD[i];
+    Row[j] = x;
+    }
+
+// Store trace-back character x as cell [i][j] of the saved I trace-back matrix.
+static void SAVE_TBI(unsigned i, unsigned j, char x)
+    {
+    char *Row = g_TBI[i];
+    Row[j] = x;
+    }
+
+// Hand out the addresses of the saved M, D and I score matrices (available
+// only in SAVE_FAST builds). The matrices stay owned by this file.
+void GetFastMxs(Mx<float> **M, Mx<float> **D, Mx<float> **I)
+    {
+    Mx<float> *const Saved[3] = { &g_MxDPM, &g_MxDPD, &g_MxDPI };
+    *M = Saved[0];
+    *D = Saved[1];
+    *I = Saved[2];
+    }
+
+#else // SAVE_FAST
+
+#define SAVE_DPM(i, j, x) /* empty */
+#define SAVE_DPD(i, j, x) /* empty */
+#define SAVE_DPI(i, j, x) /* empty */
+
+#define SAVE_TBM(i, j, x) /* empty */
+#define SAVE_TBD(i, j, x) /* empty */
+#define SAVE_TBI(i, j, x) /* empty */
+
+#define AllocSave(LA, LB) /* empty */
+
+#define cmpm(i, j, x) /* empty */
+#define cmpd(i, j, x) /* empty */
+#define cmpi(i, j, x) /* empty */
+
+#endif // SAVE_FAST
+
+// Align all of A (length LA) against all of B (length LB) with affine gap
+// penalties, using three DP states: M (match), D (delete) and I (insert).
+//
+// Only two score rows (Mrow, Drow) and the running scalars M0 / I0 are
+// kept; the decision taken at each cell is recorded as bit flags in g_TBBit
+// and converted into a path by TraceBackBit.
+//
+//   A, LA, B, LB  the two sequences and their lengths
+//   AP            substitution matrix and gap penalties. The L* members are
+//                 used on the first row/column, the R* members on the last
+//                 row/column, and the plain Open*/Ext* members elsewhere.
+//   PD            receives the path produced by TraceBackBit
+//
+// Returns the largest of the final M, D and I scores.
+// Calls Die() when the LA x LB problem exceeds 100 million cells.
+//
+// NOTE(review): g_TBBit, g_DPRow1 and g_DPRow2 are presumably sized by
+// AllocBit, with g_DPRow1[-1] addressable - confirm against dp.h.
+float ViterbiFast(const byte *A, unsigned LA, const byte *B, unsigned LB,
+ const AlnParams &AP, PathData &PD)
+ {
+// Compare in floating point: LA*LB is computed in unsigned arithmetic and
+// wraps for large inputs (with 32-bit unsigned, 65536 x 65536 wraps to 0),
+// which would let an oversized problem slip past this guard.
+ if (double(LA)*double(LB) > 100.0*1000.0*1000.0)
+ Die("ViterbiFast, too long LA=%u, LB=%u", LA, LB);
+
+ AllocBit(LA, LB);
+ AllocSave(LA, LB);
+
+ StartTimer(ViterbiFast);
+
+ const float * const *Mx = AP.SubstMx;
+// Gap penalties for the first row start out as the left-terminal values.
+ float OpenA = AP.LOpenA;
+ float ExtA = AP.LExtA;
+
+ byte **TB = g_TBBit;
+ float *Mrow = g_DPRow1;
+ float *Drow = g_DPRow2;
+
+// Use Mrow[-1], so...
+ Mrow[-1] = MINUS_INFINITY;
+ for (unsigned j = 0; j <= LB; ++j)
+ {
+ Mrow[j] = MINUS_INFINITY;
+ SAVE_DPM(0, j, MINUS_INFINITY);
+ SAVE_TBM(0, j, '?');
+
+ Drow[j] = MINUS_INFINITY;
+ SAVE_DPD(0, j, MINUS_INFINITY);
+ SAVE_TBD(0, j, '?');
+ }
+
+// Main loop
+ float M0 = float (0);
+ SAVE_DPM(0, 0, 0);
+ for (unsigned i = 0; i < LA; ++i)
+ {
+ byte a = A[i];
+ const float *MxRow = Mx[a];
+ // First column of every row uses the left-terminal penalties for B.
+ float OpenB = AP.LOpenB;
+ float ExtB = AP.LExtB;
+ float I0 = MINUS_INFINITY;
+
+ SAVE_TBM(i, 0, '?');
+
+ SAVE_DPI(i, 0, MINUS_INFINITY);
+ SAVE_DPI(i, 1, MINUS_INFINITY);
+
+ SAVE_TBI(i, 0, '?');
+ SAVE_TBI(i, 1, '?');
+
+ byte *TBrow = TB[i];
+ for (unsigned j = 0; j < LB; ++j)
+ {
+ byte b = B[j];
+ byte TraceBits = 0;
+ // The MATCH step overwrites M0; DELETE and INSERT need the old value.
+ float SavedM0 = M0;
+
+ // MATCH
+ {
+ // M0 = DPM[i][j]
+ // I0 = DPI[i][j]
+ // Drow[j] = DPD[i][j]
+ cmpm(i, j, M0);
+ cmpd(i, j, Drow[j]);
+ cmpi(i, j, I0);
+
+ float xM = M0;
+ SAVE_TBM(i+1, j+1, 'M');
+ if (Drow[j] > xM)
+ {
+ xM = Drow[j];
+ TraceBits = TRACEBITS_DM;
+ SAVE_TBM(i+1, j+1, 'D');
+ }
+ if (I0 > xM)
+ {
+ xM = I0;
+ TraceBits = TRACEBITS_IM;
+ SAVE_TBM(i+1, j+1, 'I');
+ }
+ M0 = Mrow[j];
+ cmpm(i, j+1, M0);
+
+ Mrow[j] = xM + MxRow[b];
+ // Mrow[j] = DPM[i+1][j+1])
+ SAVE_DPM(i+1, j+1, Mrow[j]);
+ }
+
+ // DELETE
+ {
+ // SavedM0 = DPM[i][j]
+ // Drow[j] = DPD[i][j]
+ cmpm(i, j, SavedM0);
+ cmpd(i, j, Drow[j]);
+
+ float md = SavedM0 + OpenB;
+ Drow[j] += ExtB;
+ SAVE_TBD(i+1, j, 'D');
+ if (md >= Drow[j])
+ {
+ Drow[j] = md;
+ TraceBits |= TRACEBITS_MD;
+ SAVE_TBD(i+1, j, 'M');
+ }
+ // Drow[j] = DPD[i+1][j]
+ SAVE_DPD(i+1, j, Drow[j]);
+ }
+
+ // INSERT
+ {
+ // SavedM0 = DPM[i][j]
+ // I0 = DPI[i][j]
+ cmpm(i, j, SavedM0);
+ cmpi(i, j, I0);
+
+ float mi = SavedM0 + OpenA;
+ I0 += ExtA;
+ SAVE_TBI(i, j+1, 'I');
+ if (mi >= I0)
+ {
+ I0 = mi;
+ TraceBits |= TRACEBITS_MI;
+ SAVE_TBI(i, j+1, 'M');
+ }
+ // I0 = DPI[i][j+1]
+ SAVE_DPI(i, j+1, I0);
+ }
+
+ // Past the first column: switch to the internal penalties for B.
+ OpenB = AP.OpenB;
+ ExtB = AP.ExtB;
+
+ TBrow[j] = TraceBits;
+ }
+
+ // Special case for end of Drow[]
+ {
+ // M0 = DPM[i][LB]
+ // Drow[LB] = DPD[i][LB]
+
+ TBrow[LB] = 0;
+ float md = M0 + AP.ROpenB;
+ Drow[LB] += AP.RExtB;
+ SAVE_TBD(i+1, LB, 'D');
+ if (md >= Drow[LB])
+ {
+ Drow[LB] = md;
+ TBrow[LB] = TRACEBITS_MD;
+ SAVE_TBD(i+1, LB, 'M');
+ }
+ // Drow[LB] = DPD[i+1][LB]
+ SAVE_DPD(i+1, LB, Drow[LB]);
+ }
+
+ SAVE_DPM(i+1, 0, MINUS_INFINITY);
+ M0 = MINUS_INFINITY;
+
+ // Past the first row: switch to the internal penalties for A.
+ OpenA = AP.OpenA;
+ ExtA = AP.ExtA;
+ }
+
+ SAVE_TBM(LA, 0, '?');
+
+// Special case for last row of DPI
+ byte *TBrow = TB[LA];
+ float I1 = MINUS_INFINITY;
+
+ SAVE_DPI(LA, 0, MINUS_INFINITY);
+ SAVE_TBI(LA, 0, '?');
+
+ SAVE_DPI(LA, 1, MINUS_INFINITY);
+ SAVE_TBI(LA, 1, '?');
+
+ for (unsigned j = 1; j < LB; ++j)
+ {
+ // Mrow[j-1] = DPM[LA][j]
+ // I1 = DPI[LA][j]
+
+ TBrow[j] = 0;
+ float mi = Mrow[int(j)-1] + AP.ROpenA;
+ I1 += AP.RExtA;
+ SAVE_TBI(LA, j+1, 'I');
+ if (mi > I1)
+ {
+ I1 = mi;
+ TBrow[j] = TRACEBITS_MI;
+ SAVE_TBI(LA, j+1, 'M');
+ }
+ SAVE_DPI(LA, j+1, I1);
+ }
+
+ float FinalM = Mrow[LB-1];
+ float FinalD = Drow[LB];
+ float FinalI = I1;
+// FinalM = DPM[LA][LB]
+// FinalD = DPD[LA][LB]
+// FinalI = DPI[LA][LB]
+
+// Pick the best end state; ties keep the earlier choice (M, then D, then I).
+ float Score = FinalM;
+ byte State = 'M';
+ if (FinalD > Score)
+ {
+ Score = FinalD;
+ State = 'D';
+ }
+ if (FinalI > Score)
+ {
+ Score = FinalI;
+ State = 'I';
+ }
+
+ EndTimer(ViterbiFast);
+ TraceBackBit(LA, LB, State, PD);
+
+#if SAVE_FAST
+ g_MxDPM.LogMe();
+ g_MxDPD.LogMe();
+ g_MxDPI.LogMe();
+
+ g_MxTBM.LogMe();
+ g_MxTBD.LogMe();
+ g_MxTBI.LogMe();
+#endif
+
+ return Score;
+ }
--- /dev/null
+#ifndef windex_h\r
+#define windex_h\r
+\r
+class SFasta;\r
+struct SeqDB;\r
+\r
+typedef uint32 word_t;\r
+typedef uint16 wordcount_t;\r
+typedef uint32 arrsize_t;\r
+typedef uint16 seqcountperword_t;\r
+typedef uint32 seqindex_t;\r
+typedef uint16 commonwordcount_t;\r
+\r
+const uint32 WindexFileHdr_Magic1 = 0x312DE41;\r
+const uint32 WindexFileHdr_Magic2 = 0x312DE42;\r
+const uint32 WindexFileHdr_Magic3 = 0x312DE43;\r
+const uint32 WindexFileHdr_Magic4 = 0x312DE44;\r
+\r
+// Fixed-size header record made of four 32-bit fields, with a magic value\r
+// at each end.\r
+// NOTE(review): presumably the on-disk header used by Windex::ToFile and\r
+// Windex::FromFile, with Magic1/Magic2 holding the WindexFileHdr_Magic*\r
+// constants declared above - confirm against the implementation.\r
+struct WindexFileHdr\r
+ {\r
+ uint32 Magic1;\r
+ uint32 IsNucleo;\r
+ uint32 WordLength;\r
+ uint32 Magic2;\r
+ };\r
+\r
+// Word index over a set of sequences. Only the declaration is visible in\r
+// this header; the notes below are read off the member names and\r
+// signatures and should be checked against the implementation.\r
+// All data members are public.\r
+class Windex\r
+ {\r
+public:\r
+ bool m_Nucleo;\r
+ bool m_RedAlpha;\r
+ unsigned m_WordLength;\r
+ unsigned m_AlphaSize;\r
+ unsigned m_WordCount;\r
+ unsigned m_Hi;\r
+ unsigned m_CapacityInc;\r
+// NOTE(review): the arrays below look like per-word tables (presumably\r
+// m_WordCount entries each) - confirm sizes and ownership.\r
+ arrsize_t *m_Capacities;\r
+ arrsize_t *m_Sizes;\r
+ float *m_WordScores;\r
+ seqindex_t **m_SeedIndexes;\r
+ byte *m_UniqueCounts;\r
+// Lookup table indexed by character value (256 entries).\r
+ unsigned m_CharToLetter[256];\r
+\r
+public:\r
+ Windex();\r
+// Persistence and construction from sequence sources.\r
+ void ToFile(const string &FileName) const;\r
+ void FromFile(const string &FileName);\r
+ void FromSFasta(SFasta &SF);\r
+ void FromSeqDB(const SeqDB &DB);\r
+ void Clear(bool ctor = false);\r
+ void AddWords(unsigned SeqIndex, const word_t *Words, unsigned N);\r
+// Initialisation variants.\r
+ void Init(bool Nucleo, unsigned WordLength);\r
+ void Init2(bool Nucleo, unsigned TableSize);\r
+ void InitRed(unsigned WordLength);\r
+ void InitWordScores(const float *const *SubstMx);\r
+ void Reset();\r
+// Logging / diagnostics.\r
+ void LogMe() const;\r
+ unsigned LogMemSize() const;\r
+ void LogWordStats(unsigned TopWords = 10) const;\r
+// Conversions between sequences and words.\r
+ const char *WordToStr(word_t Word) const;\r
+ word_t SeqToWord(const byte *Seq) const;\r
+ unsigned SeqToWords(const byte *Seq, unsigned L, word_t *Words) const;\r
+ unsigned SeqToWordsStep(unsigned Step, const byte *Seq, unsigned L, word_t *Words) const;\r
+ unsigned WordsToCounts(const word_t *Words, unsigned N,\r
+ word_t *UniqueWords, seqcountperword_t *Counts) const;\r
+ unsigned GetUniqueWords(const word_t *Words, unsigned N,\r
+ word_t *UniqueWords) const;\r
+ void LogSizeHisto() const;\r
+ };\r
+\r
+#endif // windex_h\r
--- /dev/null
+#include "myutils.h"\r
+#include "chime.h"\r
+\r
+// Write the column-title line of the tabbed output to f. The title for\r
+// column 0 is empty (the line begins with a tab), so titles line up with\r
+// fields 1..16 of the records written by WriteChimeHit.\r
+// Does nothing when f is null.\r
+void WriteChimeFileHdr(FILE *f)\r
+    {\r
+    if (!f)\r
+        return;\r
+\r
+    static const char Titles[] =\r
+      "\tQuery"   // 1\r
+      "\tA"       // 2\r
+      "\tB"       // 3\r
+      "\tIdQM"    // 4\r
+      "\tIdQA"    // 5\r
+      "\tIdQB"    // 6\r
+      "\tIdAB"    // 7\r
+      "\tIdQT"    // 8\r
+      "\tLY"      // 9\r
+      "\tLN"      // 10\r
+      "\tLA"      // 11\r
+      "\tRY"      // 12\r
+      "\tRN"      // 13\r
+      "\tRA"      // 14\r
+      "\tDiv"     // 15\r
+      "\tY"       // 16\r
+      "\n";\r
+\r
+    fputs(Titles, f);\r
+    }\r
+\r
+// Append one tab-separated record describing Hit to f.\r
+//\r
+// Field 0 is the score, field 1 the query label, fields 2-15 the parent\r
+// labels, identities, vote counts and divergence, field 16 the verdict.\r
+// When Hit.Div <= 0 a placeholder record is written instead: score\r
+// "0.0000", the query label, "*" in fields 2-15 and "N" as the verdict.\r
+// Does nothing when f is null.\r
+void WriteChimeHit(FILE *f, const ChimeHit2 &Hit)\r
+    {\r
+    if (!f)\r
+        return;\r
+\r
+    const char *QueryLabel = Hit.QLabel.c_str();\r
+\r
+    if (Hit.Div <= 0.0)\r
+        {\r
+        fputs("0.0000", f);                         // 0\r
+        fprintf(f, "\t%s", QueryLabel);             // 1\r
+        for (unsigned Field = 2; Field <= 15; ++Field)\r
+            fputs("\t*", f);                        // 2..15\r
+        fputs("\tN", f);                            // 16\r
+        fputs("\n", f);\r
+        return;\r
+        }\r
+\r
+    fprintf(f, "%.4f", Hit.Score);                  // 0\r
+    fprintf(f, "\t%s", QueryLabel);                 // 1\r
+    fprintf(f, "\t%s", Hit.ALabel.c_str());         // 2\r
+    fprintf(f, "\t%s", Hit.BLabel.c_str());         // 3\r
+\r
+    // Percent identities, fields 4-8\r
+    fprintf(f, "\t%.1f" "\t%.1f" "\t%.1f" "\t%.1f" "\t%.1f",\r
+      Hit.PctIdQM, Hit.PctIdQA, Hit.PctIdQB, Hit.PctIdAB, Hit.PctIdQT);\r
+\r
+    // Left-segment counts (fields 9-11), then right-segment counts (12-14)\r
+    fprintf(f, "\t%u" "\t%u" "\t%u", Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);\r
+    fprintf(f, "\t%u" "\t%u" "\t%u", Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);\r
+\r
+    fprintf(f, "\t%.2f", Hit.Div);                  // 15\r
+    fprintf(f, "\t%c", yon(Hit.Accept()));          // 16\r
+    fputc('\n', f);\r
+    }\r
+\r
+// Number of positions among the first L of Seq that are not gap characters\r
+// (as decided by isgap).\r
+unsigned GetUngappedLength(const byte *Seq, unsigned L)\r
+    {\r
+    unsigned Letters = 0;\r
+    const byte *const End = Seq + L;\r
+    for (const byte *p = Seq; p != End; ++p)\r
+        {\r
+        if (isgap(*p))\r
+            continue;\r
+        ++Letters;\r
+        }\r
+    return Letters;\r
+    }\r
+\r
+// Write a human-readable report of a chimera hit to f: the three-way\r
+// alignment of the query (Q) with its parents A and B, in chunks of 80\r
+// columns, followed by identity and vote summaries.\r
+//\r
+// Nothing is written when f is null or Hit.Div <= 0.\r
+//\r
+// Per chunk the rows are:\r
+//   A, Q, B  the aligned sequences with 1-based start and end positions;\r
+//            parent letters that differ from the query are lower-cased\r
+//   Diffs    before ColXLo: 'A' = Q agrees with A only, 'b' = with B only;\r
+//            after ColXHi: 'B' = Q agrees with B only, 'a' = with A only;\r
+//            'N' = parents agree with each other but not with Q,\r
+//            '?' = all three differ; blank for gap columns, for columns\r
+//            inside [ColXLo, ColXHi], and where all three agree\r
+//   Votes    '+' = Q agrees only with the parent the model assigns to that\r
+//            side, '!' = only with the other parent, '0' = with neither;\r
+//            columns that are gapped or adjacent to a gapped column are blank\r
+//   Model    'A' before ColXLo, 'x' from ColXLo to ColXHi, 'B' after\r
+void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit)\r
+ {\r
+ if (f == 0)\r
+ return;\r
+\r
+ if (Hit.Div <= 0.0)\r
+ return;\r
+\r
+ const string &Q3 = Hit.Q3;\r
+ const string &A3 = Hit.A3;\r
+ const string &B3 = Hit.B3;\r
+\r
+ const byte *Q3Seq = (const byte *) Q3.c_str();\r
+ const byte *A3Seq = (const byte *) A3.c_str();\r
+ const byte *B3Seq = (const byte *) B3.c_str();\r
+\r
+// Aligned\r
+ unsigned ColCount = SIZE(Q3);\r
+ asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);\r
+\r
+ unsigned LQ = GetUngappedLength(Q3Seq, ColCount);\r
+ unsigned LA = GetUngappedLength(A3Seq, ColCount);\r
+ unsigned LB = GetUngappedLength(B3Seq, ColCount);\r
+\r
+ fprintf(f, "\n");\r
+ fprintf(f, "------------------------------------------------------------------------\n");\r
+ fprintf(f, "Query (%5u nt) %s\n", LQ, Hit.QLabel.c_str());\r
+ fprintf(f, "ParentA (%5u nt) %s\n", LA, Hit.ALabel.c_str());\r
+ fprintf(f, "ParentB (%5u nt) %s\n", LB, Hit.BLabel.c_str());\r
+\r
+// Strip terminal gaps in query\r
+ unsigned FromCol = UINT_MAX;\r
+ unsigned ToCol = UINT_MAX;\r
+ for (unsigned Col = 0; Col < ColCount; ++Col)\r
+ {\r
+ if (!isgap(Q3Seq[Col]))\r
+ {\r
+ if (FromCol == UINT_MAX)\r
+ FromCol = Col;\r
+ ToCol = Col;\r
+ }\r
+ }\r
+\r
+// The query row has no ungapped column: there is nothing to display, and\r
+// the column arithmetic below would index far beyond the row buffers.\r
+ if (FromCol == UINT_MAX)\r
+ return;\r
+\r
+// Parent positions skipped by the stripped leading columns.\r
+ unsigned QPos = 0;\r
+ unsigned APos = 0;\r
+ unsigned BPos = 0;\r
+ for (unsigned Col = 0; Col < FromCol; ++Col)\r
+ {\r
+ if (!isgap(A3Seq[Col]))\r
+ ++APos;\r
+ if (!isgap(B3Seq[Col]))\r
+ ++BPos;\r
+ }\r
+\r
+// Emit the alignment in chunks of 80 columns.\r
+ unsigned Range = ToCol - FromCol + 1;\r
+ unsigned RowCount = (Range + 79)/80;\r
+ unsigned RowFromCol = FromCol;\r
+ for (unsigned RowIndex = 0; RowIndex < RowCount; ++RowIndex)\r
+ {\r
+ fprintf(f, "\n");\r
+ unsigned RowToCol = RowFromCol + 79;\r
+ if (RowToCol > ToCol)\r
+ RowToCol = ToCol;\r
+\r
+ // A row\r
+ fprintf(f, "A %5u ", APos + 1);\r
+ for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+ {\r
+ char q = Q3Seq[Col];\r
+ char a = A3Seq[Col];\r
+ if (a != q)\r
+ a = tolower(a);\r
+ fprintf(f, "%c", a);\r
+ if (!isgap(a))\r
+ ++APos;\r
+ }\r
+ fprintf(f, " %u\n", APos);\r
+\r
+ // Q row\r
+ fprintf(f, "Q %5u ", QPos + 1);\r
+ for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+ {\r
+ char q = Q3Seq[Col];\r
+ fprintf(f, "%c", q);\r
+ if (!isgap(q))\r
+ ++QPos;\r
+ }\r
+ fprintf(f, " %u\n", QPos);\r
+\r
+ // B row\r
+ fprintf(f, "B %5u ", BPos + 1);\r
+ for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+ {\r
+ char q = Q3Seq[Col];\r
+ char b = B3Seq[Col];\r
+ if (b != q)\r
+ b = tolower(b);\r
+ fprintf(f, "%c", b);\r
+ if (!isgap(b))\r
+ ++BPos;\r
+ }\r
+ fprintf(f, " %u\n", BPos);\r
+\r
+ // Diffs\r
+ fprintf(f, "Diffs ");\r
+ for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+ {\r
+ char q = Q3Seq[Col];\r
+ char a = A3Seq[Col];\r
+ char b = B3Seq[Col];\r
+\r
+ char c = ' ';\r
+ if (isgap(q) || isgap(a) || isgap(b))\r
+ c = ' ';\r
+ else if (Col < Hit.ColXLo)\r
+ {\r
+ if (q == a && q == b)\r
+ c = ' ';\r
+ else if (q == a && q != b)\r
+ c = 'A';\r
+ else if (q == b && q != a)\r
+ c = 'b';\r
+ else if (a == b && q != a)\r
+ c = 'N';\r
+ else\r
+ c = '?';\r
+ }\r
+ else if (Col > Hit.ColXHi)\r
+ {\r
+ if (q == a && q == b)\r
+ c = ' ';\r
+ else if (q == b && q != a)\r
+ c = 'B';\r
+ else if (q == a && q != b)\r
+ c = 'a';\r
+ else if (a == b && q != a)\r
+ c = 'N';\r
+ else\r
+ c = '?';\r
+ }\r
+\r
+ fprintf(f, "%c", c);\r
+ }\r
+ fprintf(f, "\n");\r
+\r
+ // SNPs\r
+ fprintf(f, "Votes ");\r
+ for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+ {\r
+ char q = Q3Seq[Col];\r
+ char a = A3Seq[Col];\r
+ char b = B3Seq[Col];\r
+\r
+ // Columns next to a gap in any row do not vote.\r
+ bool PrevGap = Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1]));\r
+ bool NextGap = Col+1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1]));\r
+\r
+ char c = ' ';\r
+ if (isgap(q) || isgap(a) || isgap(b) || PrevGap || NextGap)\r
+ c = ' ';\r
+ else if (Col < Hit.ColXLo)\r
+ {\r
+ if (q == a && q == b)\r
+ c = ' ';\r
+ else if (q == a && q != b)\r
+ c = '+';\r
+ else if (q == b && q != a)\r
+ c = '!';\r
+ else\r
+ c = '0';\r
+ }\r
+ else if (Col > Hit.ColXHi)\r
+ {\r
+ if (q == a && q == b)\r
+ c = ' ';\r
+ else if (q == b && q != a)\r
+ c = '+';\r
+ else if (q == a && q != b)\r
+ c = '!';\r
+ else\r
+ c = '0';\r
+ }\r
+\r
+ fprintf(f, "%c", c);\r
+ }\r
+ fprintf(f, "\n");\r
+\r
+ // LR row\r
+ fprintf(f, "Model ");\r
+ for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col)\r
+ {\r
+ if (Col < Hit.ColXLo)\r
+ fprintf(f, "A");\r
+ else if (Col >= Hit.ColXLo && Col <= Hit.ColXHi)\r
+ fprintf(f, "x");\r
+ else\r
+ fprintf(f, "B");\r
+ }\r
+\r
+ fprintf(f, "\n");\r
+\r
+ RowFromCol += 80;\r
+ }\r
+ fprintf(f, "\n");\r
+\r
+// Divergence: relative gain of the model identity over the closer parent.\r
+ double PctIdBestP = max(Hit.PctIdQA, Hit.PctIdQB);\r
+ double Div = (Hit.PctIdQM - PctIdBestP)*100.0/PctIdBestP;\r
+\r
+ unsigned LTot = Hit.CS_LY + Hit.CS_LN + Hit.CS_LA;\r
+ unsigned RTot = Hit.CS_RY + Hit.CS_RN + Hit.CS_RA;\r
+\r
+ double PctL = Pct(Hit.CS_LY, LTot);\r
+ double PctR = Pct(Hit.CS_RY, RTot);\r
+\r
+ fprintf(f,\r
+ "Ids. QA %.1f%%, QB %.1f%%, AB %.1f%%, QModel %.1f%%, Div. %+.1f%%\n",\r
+ Hit.PctIdQA,\r
+ Hit.PctIdQB,\r
+ Hit.PctIdAB,\r
+ Hit.PctIdQM,\r
+ Div);\r
+\r
+ fprintf(f,\r
+ "Diffs Left %u: N %u, A %u, Y %u (%.1f%%); Right %u: N %u, A %u, Y %u (%.1f%%), Score %.4f\n",\r
+ LTot, Hit.CS_LN, Hit.CS_LA, Hit.CS_LY, PctL,\r
+ RTot, Hit.CS_RN, Hit.CS_RA, Hit.CS_RY, PctR,\r
+ Hit.Score);\r
+ }\r
*/
#include "unifracunweightedcommand.h"
+#include "treereader.h"
+#include "subsample.h"
+#include "consensus.h"
//**********************************************************************************************************************
vector<string> UnifracUnweightedCommand::setParameters(){
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
- CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
+ CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
+ CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pconsensus);
+ CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
helpString += "The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n";
+ helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group. The subsample parameter may only be used with a group file.\n";
+ helpString += "The consensus parameter allows you to indicate you would like trees built from distance matrices created with the results of the subsampling, as well as a consensus tree built from these trees. Default=F.\n";
helpString += "Example unifrac.unweighted(groups=A-B-C, iters=500).\n";
helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
helpString += "The unifrac.unweighted command output two files: .unweighted and .uwsummary their descriptions are in the manual.\n";
outputTypes["uwsummary"] = tempOutNames;
outputTypes["phylip"] = tempOutNames;
outputTypes["column"] = tempOutNames;
+ outputTypes["tree"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
outputTypes["uwsummary"] = tempOutNames;
outputTypes["phylip"] = tempOutNames;
outputTypes["column"] = tempOutNames;
+ outputTypes["tree"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
}
}
- m->runParse = true;
- m->clearGroups();
- m->clearAllGroups();
- m->Treenames.clear();
- m->names.clear();
-
- //check for required parameters
+ //check for required parameters
treefile = validParameter.validFile(parameters, "tree", true);
if (treefile == "not open") { abort = true; }
else if (treefile == "not found") { //if there is a current design file, use it
else if (namefile == "not found") { namefile = ""; }
else { m->setNameFile(namefile); }
- outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); }
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
m->setProcessors(temp);
m->mothurConvert(temp, processors);
+ temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; }
+ if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+ else {
+ if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later
+ else { subsample = false; }
+ }
+
+ if (!subsample) { subsampleIters = 0; }
+ else { subsampleIters = iters; }
+
+ temp = validParameter.validFile(parameters, "consensus", false); if (temp == "not found") { temp = "F"; }
+ consensus = m->isTrue(temp);
+
+ if (subsample && random) { m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true; }
+ if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; }
+ if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
+ if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
+
if (!random) { iters = 0; } //turn off random calcs
//if user selects distance = true and no groups it won't calc the pairwise
m->setTreeFile(treefile);
- if (groupfile != "") {
- //read in group map info.
- tmap = new TreeMap(groupfile);
- tmap->readMap();
- }else{ //fake out by putting everyone in one group
- Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
- tmap = new TreeMap();
-
- for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
- }
-
- if (namefile != "") { readNamesFile(); }
-
- read = new ReadNewickTree(treefile);
- int readOk = read->read(tmap);
-
- if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-
- read->AssembleTrees();
- T = read->getTrees();
- delete read;
-
- //make sure all files match
- //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
- int numNamesInTree;
- if (namefile != "") {
- if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
- else { numNamesInTree = m->Treenames.size(); }
- }else { numNamesInTree = m->Treenames.size(); }
-
-
- //output any names that are in group file but not in tree
- if (numNamesInTree < tmap->getNumSeqs()) {
- for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
- //is that name in the tree?
- int count = 0;
- for (int j = 0; j < m->Treenames.size(); j++) {
- if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
- count++;
- }
-
- if (m->control_pressed) {
- delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
- return 0;
- }
-
- //then you did not find it so report it
- if (count == m->Treenames.size()) {
- //if it is in your namefile then don't remove
- map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-
- if (it == nameMap.end()) {
- m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
- tmap->removeSeq(tmap->namesOfSeqs[i]);
- i--; //need this because removeSeq removes name from namesOfSeqs
- }
- }
- }
- }
-
+ TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+ T = reader->getTrees();
+ tmap = T[0]->getTreeMap();
+ map<string, string> nameMap = reader->getNames();
+ delete reader;
+
sumFile = outputDir + m->getSimpleName(treefile) + ".uwsummary";
outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile);
m->openOutputFile(sumFile, outSum);
- util = new SharedUtil();
+ SharedUtil util;
Groups = m->getGroups();
vector<string> namesGroups = tmap->getNamesOfGroups();
- util->setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted"); //sets the groups the user wants to analyze
- util->getCombos(groupComb, Groups, numComp);
- m->setGroups(Groups);
- delete util;
-
- if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
+ util.setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted"); //sets the groups the user wants to analyze
- unweighted = new Unweighted(tmap, includeRoot);
+ Unweighted unweighted(includeRoot);
int start = time(NULL);
-
- userData.resize(numComp,0); //data[0] = unweightedscore
- randomData.resize(numComp,0); //data[0] = unweightedscore
- //create new tree with same num nodes and leaves as users
-
+
+ //set or check size
+ if (subsample) {
+ //user has not set size, set size = smallest samples size
+ if (subsampleSize == -1) {
+ vector<string> temp; temp.push_back(Groups[0]);
+ subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group
+ for (int i = 1; i < Groups.size(); i++) {
+ temp.clear(); temp.push_back(Groups[i]);
+ int thisSize = (tmap->getNamesSeqs(temp)).size();
+ if (thisSize < subsampleSize) { subsampleSize = thisSize; }
+ }
+ m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
+ }else { //eliminate any too small groups
+ vector<string> newGroups = Groups;
+ Groups.clear();
+ for (int i = 0; i < newGroups.size(); i++) {
+ vector<string> thisGroup; thisGroup.push_back(newGroups[i]);
+ vector<string> thisGroupsSeqs = tmap->getNamesSeqs(thisGroup);
+ int thisSize = thisGroupsSeqs.size();
+
+ if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); }
+ else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
+ }
+ m->setGroups(Groups);
+ }
+ }
+
+ util.getCombos(groupComb, Groups, numComp);
+ m->setGroups(Groups);
+
+ if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
+
if (numComp < processors) { processors = numComp; }
+
+ if (consensus && (numComp < 2)) { m->mothurOut("consensus can only be used with numComparisions greater than 1, setting consensus=f.\n"); consensus=false; }
outSum << "Tree#" << '\t' << "Groups" << '\t' << "UWScore" <<'\t';
m->mothurOut("Tree#\tGroups\tUWScore\t");
//get pscores for users trees
for (int i = 0; i < T.size(); i++) {
- if (m->control_pressed) {
- delete tmap; delete unweighted;
- for (int i = 0; i < T.size(); i++) { delete T[i]; }
- outSum.close();
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
- return 0;
- }
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
counter = 0;
rCumul.resize(numComp);
utreeScores.resize(numComp);
UWScoreSig.resize(numComp);
+
+ vector<double> userData; userData.resize(numComp,0); //weighted score info for user tree. data[0] = weightedscore AB, data[1] = weightedscore AC...
- userData = unweighted->getValues(T[i], processors, outputDir); //userData[0] = unweightedscore
+ userData = unweighted.getValues(T[i], processors, outputDir); //userData[0] = unweightedscore
- if (m->control_pressed) { delete tmap; delete unweighted;
- for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }return 0; }
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }return 0; }
//output scores for each combination
for(int k = 0; k < numComp; k++) {
//add users score to validscores
validScores[userData[k]] = userData[k];
+
+ if (!random) { UWScoreSig[k].push_back(0.0); }
}
-
- //get unweighted scores for random trees - if random is false iters = 0
- for (int j = 0; j < iters; j++) {
-
- //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
- randomData = unweighted->getValues(T[i], "", "", processors, outputDir);
-
- if (m->control_pressed) { delete tmap; delete unweighted;
- for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
- for(int k = 0; k < numComp; k++) {
- //add trees unweighted score to map of scores
- map<float,float>::iterator it = rscoreFreq[k].find(randomData[k]);
- if (it != rscoreFreq[k].end()) {//already have that score
- rscoreFreq[k][randomData[k]]++;
- }else{//first time we have seen this score
- rscoreFreq[k][randomData[k]] = 1;
- }
-
- //add randoms score to validscores
- validScores[randomData[k]] = randomData[k];
- }
-
- //report progress
-// m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
- }
-
- for(int a = 0; a < numComp; a++) {
- float rcumul = 1.0000;
-
- if (random) {
- //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
- for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
- //make rscoreFreq map and rCumul
- map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
- rCumul[a][it->first] = rcumul;
- //get percentage of random trees with that info
- if (it2 != rscoreFreq[a].end()) { rscoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
- else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score
- }
- UWScoreSig[a].push_back(rCumul[a][userData[a]]);
- }else { UWScoreSig[a].push_back(0.0); }
-
- }
-
- if (m->control_pressed) { delete tmap; delete unweighted;
- for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
- //print output files
+
+ if (random) { runRandomCalcs(T[i], userData); }
+
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ //subsample loop
+ vector< vector<double> > calcDistsTotals; //each iter, each groupCombos dists. this will be used to make .dist files
+ for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
+
+ if (m->control_pressed) { break; }
+
+ //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
+ TreeMap* newTmap = new TreeMap();
+ newTmap->getCopy(*tmap);
+
+ SubSample sample;
+ Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
+
+ //call new weighted function
+ vector<double> iterData; iterData.resize(numComp,0);
+ Unweighted thisUnweighted(includeRoot);
+ iterData = thisUnweighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore
+
+ //save data to make ave dist, std dist
+ calcDistsTotals.push_back(iterData);
+
+ delete newTmap;
+ delete subSampleTree;
+
+ if((thisIter+1) % 100 == 0){ m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine(); }
+ }
+
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ if (subsample) { getAverageSTDMatrices(calcDistsTotals, i); }
+ if (consensus) { getConsensusTrees(calcDistsTotals, i); }
+
+ //print output files
printUWSummaryFile(i);
if (random) { printUnweightedFile(); delete output; }
if (phylip) { createPhylipFile(i); }
outSum.close();
- m->clearGroups();
- delete tmap; delete unweighted;
+ delete tmap;
for (int i = 0; i < T.size(); i++) { delete T[i]; }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
exit(1);
}
}
+/**************************************************************************************************/
+// Summarizes the subsampled unweighted scores for one tree. For every group comparison it computes
+// the mean and the population standard deviation (divide by N) across the subsample iterations and
+// writes them to <outputDir><tree><treeNum+1>.unweighted.ave.dist and ...unweighted.std.dist.
+//   dists   - dists[iteration][comparison]: one row of scores per subsample iteration.
+//   treeNum - zero-based index of the tree; treeNum+1 appears in the output file names.
+// The layout of both files follows outputForm: "lt" (lower triangle), "square", anything else = column.
+// Both files are appended to outputNames and to outputTypes["phylip"].
+// Returns 0. When dists is empty there is nothing to summarize, so no files are created.
+int UnifracUnweightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
+    try {
+        //every loop and divisor below uses the number of rows actually present, so the sum, the mean
+        //and the deviation are always taken over the same iterations and dists is never over-indexed
+        int numIters = dists.size();
+        if (numIters == 0) { return 0; }
+
+        //sum, then average, each comparison's score over the iterations
+        vector<double> averages(numComp, 0.0);
+        for (int thisIter = 0; thisIter < numIters; thisIter++) {
+            for (int i = 0; i < dists[thisIter].size(); i++) {
+                averages[i] += dists[thisIter][i];
+            }
+        }
+        for (int i = 0; i < averages.size(); i++) { averages[i] /= (double) numIters; }
+
+        //standard deviation: mean of the squared differences from the average, then the square root
+        vector<double> stdDev(numComp, 0.0);
+        for (int thisIter = 0; thisIter < numIters; thisIter++) {
+            for (int j = 0; j < dists[thisIter].size(); j++) {
+                double diff = dists[thisIter][j] - averages[j];
+                stdDev[j] += (diff * diff);
+            }
+        }
+        for (int i = 0; i < stdDev.size(); i++) {
+            stdDev[i] /= (double) numIters;
+            stdDev[i] = sqrt(stdDev[i]);
+        }
+
+        //spread the per-comparison values into symmetric group x group matrices so they can be printed.
+        //comparisons are consumed in lower-triangle order (row r, columns 0..r-1).
+        int groupCount = m->getNumGroups();
+        vector< vector<double> > avedists(groupCount, vector<double>(groupCount, 0.0));
+        vector< vector<double> > stddists(groupCount, vector<double>(groupCount, 0.0));
+
+        int count = 0;
+        for (int r = 0; r < groupCount; r++) {
+            for (int l = 0; l < r; l++) {
+                avedists[r][l] = averages[count];
+                avedists[l][r] = averages[count];
+                stddists[r][l] = stdDev[count];
+                stddists[l][r] = stdDev[count];
+                count++;
+            }
+        }
+
+        string aveFileName = outputDir + m->getSimpleName(treefile) + toString(treeNum+1) + ".unweighted.ave.dist";
+        outputNames.push_back(aveFileName); outputTypes["phylip"].push_back(aveFileName);
+
+        ofstream out;
+        m->openOutputFile(aveFileName, out);
+
+        string stdFileName = outputDir + m->getSimpleName(treefile) + toString(treeNum+1) + ".unweighted.std.dist";
+        outputNames.push_back(stdFileName); outputTypes["phylip"].push_back(stdFileName);
+
+        ofstream outStd;
+        m->openOutputFile(stdFileName, outStd);
+
+        //the matrix layouts start with the number of groups
+        if ((outputForm == "lt") || (outputForm == "square")) {
+            out << groupCount << endl;
+            outStd << groupCount << endl;
+        }
+
+        for (int r = 0; r < groupCount; r++) {
+            //row label, padded with spaces to at least 10 characters
+            string name = (m->getGroups())[r];
+            while (name.length() < 10) { name += " "; }
+
+            if (outputForm == "lt") {
+                out << name << '\t';
+                outStd << name << '\t';
+
+                //values below the diagonal only
+                for (int l = 0; l < r; l++) { out << avedists[r][l] << '\t'; outStd << stddists[r][l] << '\t'; }
+                out << endl; outStd << endl;
+            }else if (outputForm == "square") {
+                out << name << '\t';
+                outStd << name << '\t';
+
+                //the full row
+                for (int l = 0; l < groupCount; l++) { out << avedists[r][l] << '\t'; outStd << stddists[r][l] << '\t'; }
+                out << endl; outStd << endl;
+            }else{
+                //column layout: one "groupA groupB value" line per pair
+                for (int l = 0; l < r; l++) {
+                    string otherName = (m->getGroups())[l];
+                    while (otherName.length() < 10) { otherName += " "; }
+
+                    //the tab before the value keeps names of 10 or more characters (which get no
+                    //padding) from running straight into the number
+                    out << name << '\t' << otherName << '\t' << avedists[r][l] << endl;
+                    outStd << name << '\t' << otherName << '\t' << stddists[r][l] << endl;
+                }
+            }
+        }
+        out.close();
+        outStd.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "UnifracUnweightedCommand", "getAverageSTDMatrices");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+// Builds one tree per subsample iteration from dists (via buildTrees), asks Consensus for the
+// consensus of those trees and prints it to <outputDir><tree root><treeNum+1>.unweighted.cons.tre.
+//   dists   - dists[iteration][comparison] scores from the subsample loop.
+//   treeNum - zero-based index of the tree; treeNum+1 appears in the file name.
+// The file is appended to outputNames and outputTypes["tree"]. Returns 0, also when interrupted.
+// NOTE(review): the trees returned by buildTrees are not deleted here - confirm who owns them.
+int UnifracUnweightedCommand::getConsensusTrees(vector< vector<double> >& dists, int treeNum) {
+    try {
+        //flag consulted by the tree constructor
+        m->runParse = false;
+
+        //tree map built from the current groups, for the new trees to use
+        TreeMap groupTreeMap;
+        groupTreeMap.makeSim(m->getGroups());
+
+        //replace the global tree names with the group names
+        m->Treenames.clear();
+        m->Treenames = m->getGroups();
+
+        //buildTrees also writes the .all.tre file holding every tree it creates
+        vector<Tree*> iterTrees = buildTrees(dists, treeNum, groupTreeMap);
+
+        if (m->control_pressed) { return 0; }
+
+        Consensus consensusBuilder;
+        Tree* consensusTree = consensusBuilder.getTree(iterTrees);
+
+        //register and open the consensus tree file
+        string consensusName = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".unweighted.cons.tre";
+        outputNames.push_back(consensusName);
+        outputTypes["tree"].push_back(consensusName);
+
+        ofstream consensusOut;
+        m->openOutputFile(consensusName, consensusOut);
+
+        //a NULL consensus leaves the file empty
+        if (consensusTree != NULL) {
+            consensusTree->print(consensusOut, "boot");
+            delete consensusTree;
+        }
+        consensusOut.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "UnifracUnweightedCommand", "getConsensusTrees");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+
+// Creates one Tree per row of dists and prints each of them to
+// <outputDir><tree root><treeNum+1>.unweighted.all.tre.
+//   dists   - dists[iteration][comparison] scores; each row becomes one tree.
+//   treeNum - zero-based index of the tree; treeNum+1 appears in the file name.
+//   mytmap  - tree map handed to every Tree that is constructed.
+// Returns the trees that were built. If control_pressed is set, the trees are deleted, their slots
+// are set to NULL and the output file is removed before returning.
+vector<Tree*> UnifracUnweightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, TreeMap& mytmap) {
+    try {
+        //register and open the file that collects every tree
+        string allTreesName = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".unweighted.all.tre";
+        outputNames.push_back(allTreesName);
+        outputTypes["tree"].push_back(allTreesName);
+
+        ofstream allTreesOut;
+        m->openOutputFile(allTreesName, allTreesOut);
+
+        vector<Tree*> built;
+
+        //row 0 holds the scores of the first subsampled tree, row 1 the second, ...
+        for (int iter = 0; iter < dists.size(); iter++) {
+
+            if (m->control_pressed) { break; }
+
+            //symmetric group x group matrix, zero on the diagonal
+            vector< vector<double> > sims(m->getNumGroups(), vector<double>(m->getNumGroups(), 0.0));
+
+            //walk the lower triangle, turning each score into -(score - 1.0)
+            int pairIndex = 0;
+            for (int row = 0; row < m->getNumGroups(); row++) {
+                for (int col = 0; col < row; col++) {
+                    double similarity = -(dists[iter][pairIndex]-1.0);
+                    sims[row][col] = similarity;
+                    sims[col][row] = similarity;
+                    pairIndex++;
+                }
+            }
+
+            //build the tree for this iteration; no name map is supplied
+            Tree* iterTree = new Tree(&mytmap, sims);
+            map<string, string> noNames;
+            iterTree->assembleTree(noNames);
+
+            built.push_back(iterTree);
+
+            iterTree->print(allTreesOut);
+        }
+
+        allTreesOut.close();
+
+        //interrupted: discard whatever was built and drop the partial file
+        if (m->control_pressed) {
+            for (int t = 0; t < built.size(); t++) {
+                delete built[t];
+                built[t] = NULL;
+            }
+            m->mothurRemove(allTreesName);
+        }
+
+        return built;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "UnifracUnweightedCommand", "buildTrees");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+
+// Scores thisTree iters times with the randomizing getValues overload, tallies how often each score
+// occurs per comparison (rscoreFreq), converts the tallies to fractions and running "this score or
+// higher" totals (rCumul), and appends each comparison's value for the user's score to UWScoreSig.
+//   thisTree    - the user tree being evaluated.
+//   usersScores - the user tree's score for each comparison.
+// Returns 0; returns early, before any fractions are computed, if control_pressed is set.
+int UnifracUnweightedCommand::runRandomCalcs(Tree* thisTree, vector<double> usersScores) {
+    try {
+        Unweighted randomCalc(includeRoot);
+
+        //scores of one randomized run, one entry per comparison
+        vector<double> randomData(numComp, 0);
+
+        for (int iter = 0; iter < iters; iter++) {
+
+            //this overload is used because label swapping must stay within each pairwise comparison
+            randomData = randomCalc.getValues(thisTree, "", "", processors, outputDir);
+
+            if (m->control_pressed) { return 0; }
+
+            for (int comp = 0; comp < numComp; comp++) {
+                float score = randomData[comp];
+
+                //a score seen for the first time is inserted as 0 and then counted once
+                rscoreFreq[comp][score]++;
+
+                //remember every score that occurred
+                validScores[score] = score;
+            }
+        }
+
+        for (int comp = 0; comp < numComp; comp++) {
+            map<float, float>& freqs = rscoreFreq[comp];
+            map<float, float>& cumul = rCumul[comp];
+            float remaining = 1.0000;
+
+            //fill the cumulative map; scores never produced for this comparison get a 0 frequency
+            //so that printing can treat every score alike
+            for (map<float,float>::iterator scoreIt = validScores.begin(); scoreIt != validScores.end(); scoreIt++) {
+                cumul[scoreIt->first] = remaining;
+
+                map<float,float>::iterator freqIt = freqs.find(scoreIt->first);
+                if (freqIt == freqs.end()) {
+                    freqs[scoreIt->first] = 0.0000;
+                }else {
+                    //turn the count into a fraction of the random runs and take it off the running total
+                    freqIt->second /= iters;
+                    remaining -= freqIt->second;
+                }
+            }
+
+            UWScoreSig[comp].push_back(cumul[usersScores[comp]]);
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "UnifracUnweightedCommand", "runRandomCalcs");
+        exit(1);
+    }
+}
/***********************************************************/
void UnifracUnweightedCommand::printUnweightedFile() {
try {
m->errorOut(e, "UnifracUnweightedCommand", "createPhylipFile");
exit(1);
}
-}/*****************************************************************/
-int UnifracUnweightedCommand::readNamesFile() {
- try {
- m->names.clear();
- numUniquesInName = 0;
-
- ifstream in;
- m->openInputFile(namefile, in);
-
- string first, second;
- map<string, string>::iterator itNames;
-
- while(!in.eof()) {
- in >> first >> second; m->gobble(in);
-
- numUniquesInName++;
-
- itNames = m->names.find(first);
- if (itNames == m->names.end()) {
- m->names[first] = second;
-
- //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
- vector<string> dupNames;
- m->splitAtComma(second, dupNames);
-
- for (int i = 0; i < dupNames.size(); i++) {
- nameMap[dupNames[i]] = dupNames[i];
- if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
- }
- }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
- }
- in.close();
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "UnifracUnweightedCommand", "readNamesFile");
- exit(1);
- }
}
/***********************************************************/
private:
- ReadTree* read;
- SharedUtil* util;
FileOutput* output;
vector<Tree*> T; //user trees
TreeMap* tmap;
- Unweighted* unweighted;
string sumFile, allGroups;
vector<string> groupComb; // AB. AC, BC...
- int iters, numGroups, numComp, counter, processors, numUniquesInName;
- EstOutput userData; //unweighted score info for user tree
- EstOutput randomData; //unweighted score info for random trees
+ int iters, numGroups, numComp, counter, processors, subsampleSize, subsampleIters;
vector< vector<float> > utreeScores; //scores for users trees for each comb.
vector< vector<float> > UWScoreSig; //tree score signifigance when compared to random trees - percentage of random trees with that score or higher.
map<float, float> validScores; //map contains scores from random
vector< map<float, float> > rscoreFreq; //map <unweighted score, number of random trees with that score.> -vector entry for each combination.
vector< map<float, float> > rCumul; //map <unweighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each combination.
- bool abort, phylip, random, includeRoot;
+ bool abort, phylip, random, includeRoot, consensus, subsample;
string groups, itersString, outputDir, outputForm, treefile, groupfile, namefile;
vector<string> Groups, outputNames; //holds groups to be used
ofstream outSum, out;
ifstream inFile;
- map<string, string> nameMap;
+ int runRandomCalcs(Tree*, vector<double>);
void printUWSummaryFile(int);
void printUnweightedFile();
void createPhylipFile(int);
- int readNamesFile();
-
+ vector<Tree*> buildTrees(vector< vector<double> >&, int, TreeMap&);
+ int getConsensusTrees(vector< vector<double> >&, int);
+ int getAverageSTDMatrices(vector< vector<double> >&, int);
};
*/
#include "unifracweightedcommand.h"
+#include "consensus.h"
+#include "subsample.h"
+#include "treereader.h"
//**********************************************************************************************************************
vector<string> UnifracWeightedCommand::setParameters(){
CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
+ CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
+ CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pconsensus);
+ CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
string UnifracWeightedCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The unifrac.weighted command parameters are tree, group, name, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n";
+ helpString += "The unifrac.weighted command parameters are tree, group, name, groups, iters, distance, processors, root, subsample, consensus and random. tree parameter is required unless you have valid current tree file.\n";
helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n";
helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
helpString += "The distance parameter allows you to create a distance file from the results. The default is false.\n";
helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning don't compare your trees with randomly generated trees.\n";
helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
- helpString += "The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n";
+ helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group. The subsample parameter may only be used with a group file.\n";
+ helpString += "The consensus parameter allows you to indicate you would like trees built from distance matrices created with the results, as well as a consensus tree built from these trees. Default=F.\n";
+ helpString += "The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n";
helpString += "Example unifrac.weighted(groups=A-B-C, iters=500).\n";
helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
helpString += "The unifrac.weighted command output two files: .weighted and .wsummary their descriptions are in the manual.\n";
outputTypes["wsummary"] = tempOutNames;
outputTypes["phylip"] = tempOutNames;
outputTypes["column"] = tempOutNames;
+ outputTypes["tree"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
outputTypes["wsummary"] = tempOutNames;
outputTypes["phylip"] = tempOutNames;
outputTypes["column"] = tempOutNames;
+ outputTypes["tree"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
}
}
- m->runParse = true;
- m->clearGroups();
- m->clearAllGroups();
- m->Treenames.clear();
- m->names.clear();
-
//check for required parameters
treefile = validParameter.validFile(parameters, "tree", true);
if (treefile == "not open") { treefile = ""; abort = true; }
else if (namefile == "not found") { namefile = ""; }
else { m->setNameFile(namefile); }
- outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); }
//check for optional parameter and set defaults
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
m->mothurConvert(temp, processors);
-
- if (!random) { iters = 0; } //turn off random calcs
-
+
+ temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; }
+ if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+ else {
+ if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later
+ else { subsample = false; }
+ }
+
+ if (!subsample) { subsampleIters = 0; }
+ else { subsampleIters = iters; }
+
+ temp = validParameter.validFile(parameters, "consensus", false); if (temp == "not found") { temp = "F"; }
+ consensus = m->isTrue(temp);
+
+ if (subsample && random) { m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true; }
+ if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; }
+ if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
+ if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
+
if (namefile == "") {
vector<string> files; files.push_back(treefile);
parser.getNameFile(files);
m->setTreeFile(treefile);
- if (groupfile != "") {
- //read in group map info.
- tmap = new TreeMap(groupfile);
- tmap->readMap();
- }else{ //fake out by putting everyone in one group
- Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
- tmap = new TreeMap();
-
- for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
- }
-
- if (namefile != "") { readNamesFile(); }
-
- read = new ReadNewickTree(treefile);
- int readOk = read->read(tmap);
-
- if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
-
- read->AssembleTrees();
- T = read->getTrees();
- delete read;
-
- //make sure all files match
- //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
- int numNamesInTree;
- if (namefile != "") {
- if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
- else { numNamesInTree = m->Treenames.size(); }
- }else { numNamesInTree = m->Treenames.size(); }
-
-
- //output any names that are in group file but not in tree
- if (numNamesInTree < tmap->getNumSeqs()) {
- for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
- //is that name in the tree?
- int count = 0;
- for (int j = 0; j < m->Treenames.size(); j++) {
- if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
- count++;
- }
-
- if (m->control_pressed) {
- delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
- m->clearGroups();
- return 0;
- }
+ TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+ T = reader->getTrees();
+ tmap = T[0]->getTreeMap();
+ map<string, string> nameMap = reader->getNames();
+ delete reader;
+
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
- //then you did not find it so report it
- if (count == m->Treenames.size()) {
- //if it is in your namefile then don't remove
- map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-
- if (it == nameMap.end()) {
- m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
- tmap->removeSeq(tmap->namesOfSeqs[i]);
- i--; //need this because removeSeq removes name from namesOfSeqs
- }
- }
- }
- }
-
sumFile = outputDir + m->getSimpleName(treefile) + ".wsummary";
m->openOutputFile(sumFile, outSum);
outputNames.push_back(sumFile); outputTypes["wsummary"].push_back(sumFile);
-
- util = new SharedUtil();
+
+ SharedUtil util;
string s; //to make work with setgroups
Groups = m->getGroups();
vector<string> nameGroups = tmap->getNamesOfGroups();
- util->setGroups(Groups, nameGroups, s, numGroups, "weighted"); //sets the groups the user wants to analyze
- util->getCombos(groupComb, Groups, numComp);
+ util.setGroups(Groups, nameGroups, s, numGroups, "weighted"); //sets the groups the user wants to analyze
m->setGroups(Groups);
- delete util;
- weighted = new Weighted(tmap, includeRoot);
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
+
+ Weighted weighted(includeRoot);
int start = time(NULL);
-
- //get weighted for users tree
- userData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC...
- randomData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC...
-
- if (numComp < processors) { processors = numComp; }
-
- //get weighted scores for users trees
- for (int i = 0; i < T.size(); i++) {
-
- if (m->control_pressed) { delete tmap; delete weighted;
- for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
- counter = 0;
- rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
- uScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
-
- if (random) {
- output = new ColumnFile(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted", itersString);
- outputNames.push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted");
- outputTypes["weighted"].push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted");
- }
-
- userData = weighted->getValues(T[i], processors, outputDir); //userData[0] = weightedscore
-
- if (m->control_pressed) { delete tmap; delete weighted;
- for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
- //save users score
- for (int s=0; s<numComp; s++) {
- //add users score to vector of user scores
- uScores[s].push_back(userData[s]);
-
- //save users tree score for summary file
- utreeScores.push_back(userData[s]);
- }
-
- if (random) {
-
- //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
- vector< vector<string> > namesOfGroupCombos;
- for (int a=0; a<numGroups; a++) {
- for (int l = 0; l < a; l++) {
- vector<string> groups; groups.push_back((m->getGroups())[a]); groups.push_back((m->getGroups())[l]);
- namesOfGroupCombos.push_back(groups);
- }
- }
-
- lines.clear();
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- if(processors != 1){
- int numPairs = namesOfGroupCombos.size();
- int numPairsPerProcessor = numPairs / processors;
-
- for (int i = 0; i < processors; i++) {
- int startPos = i * numPairsPerProcessor;
- if(i == processors - 1){
- numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
- }
- lines.push_back(linePair(startPos, numPairsPerProcessor));
- }
- }
- #endif
-
-
- //get scores for random trees
- for (int j = 0; j < iters; j++) {
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- if(processors == 1){
- driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
- }else{
- createProcesses(T[i], namesOfGroupCombos, rScores);
- }
- #else
- driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
- #endif
-
- if (m->control_pressed) { delete tmap; delete weighted;
- for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
- //report progress
-// m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
- }
- lines.clear();
-
- //find the signifigance of the score for summary file
- for (int f = 0; f < numComp; f++) {
- //sort random scores
- sort(rScores[f].begin(), rScores[f].end());
-
- //the index of the score higher than yours is returned
- //so if you have 1000 random trees the index returned is 100
- //then there are 900 trees with a score greater then you.
- //giving you a signifigance of 0.900
- int index = findIndex(userData[f], f); if (index == -1) { m->mothurOut("error in UnifracWeightedCommand"); m->mothurOutEndLine(); exit(1); } //error code
-
- //the signifigance is the number of trees with the users score or higher
- WScoreSig.push_back((iters-index)/(float)iters);
- }
-
- //out << "Tree# " << i << endl;
- calculateFreqsCumuls();
- printWeightedFile();
-
- delete output;
-
- }
-
- //clear data
- rScores.clear();
- uScores.clear();
- validScores.clear();
- }
-
-
- if (m->control_pressed) { delete tmap; delete weighted;
- for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
+
+ //set or check size
+ if (subsample) {
+ //user has not set size, set size = smallest samples size
+ if (subsampleSize == -1) {
+ vector<string> temp; temp.push_back(Groups[0]);
+ subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group
+ for (int i = 1; i < Groups.size(); i++) {
+ temp.clear(); temp.push_back(Groups[i]);
+ int thisSize = (tmap->getNamesSeqs(temp)).size();
+ if (thisSize < subsampleSize) { subsampleSize = thisSize; }
+ }
+ m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
+ }else { //eliminate any too small groups
+ vector<string> newGroups = Groups;
+ Groups.clear();
+ for (int i = 0; i < newGroups.size(); i++) {
+ vector<string> thisGroup; thisGroup.push_back(newGroups[i]);
+ vector<string> thisGroupsSeqs = tmap->getNamesSeqs(thisGroup);
+ int thisSize = thisGroupsSeqs.size();
+
+ if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); }
+ else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
+ }
+ m->setGroups(Groups);
+ }
+ }
+
+ //here in case some groups are removed by subsample
+ util.getCombos(groupComb, Groups, numComp);
+
+ if (numComp < processors) { processors = numComp; }
+
+ if (consensus && (numComp < 2)) { m->mothurOut("consensus can only be used with numComparisions greater than 1, setting consensus=f.\n"); consensus=false; }
+
+ //get weighted scores for users trees
+ for (int i = 0; i < T.size(); i++) {
+
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ counter = 0;
+ rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
+ uScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
+
+ vector<double> userData; userData.resize(numComp,0); //weighted score info for user tree. data[0] = weightedscore AB, data[1] = weightedscore AC...
+ vector<double> randomData; randomData.resize(numComp,0); //weighted score info for random trees. data[0] = weightedscore AB, data[1] = weightedscore AC...
+
+ if (random) {
+ output = new ColumnFile(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted", itersString);
+ outputNames.push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted");
+ outputTypes["weighted"].push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted");
+ }
+
+ userData = weighted.getValues(T[i], processors, outputDir); //userData[0] = weightedscore
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ //save users score
+ for (int s=0; s<numComp; s++) {
+ //add users score to vector of user scores
+ uScores[s].push_back(userData[s]);
+ //save users tree score for summary file
+ utreeScores.push_back(userData[s]);
+ }
+
+ if (random) { runRandomCalcs(T[i], userData); }
+
+ //clear data
+ rScores.clear();
+ uScores.clear();
+ validScores.clear();
+
+ //subsample loop
+ vector< vector<double> > calcDistsTotals; //each iter, each groupCombos dists. this will be used to make .dist files
+ for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
+
+ if (m->control_pressed) { break; }
+
+ //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
+ TreeMap* newTmap = new TreeMap();
+ newTmap->getCopy(*tmap);
+
+ SubSample sample;
+ Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
+
+ //call new weighted function
+ vector<double> iterData; iterData.resize(numComp,0);
+ Weighted thisWeighted(includeRoot);
+ iterData = thisWeighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore
+
+ //save data to make ave dist, std dist
+ calcDistsTotals.push_back(iterData);
+
+ delete newTmap;
+ delete subSampleTree;
+
+ if((thisIter+1) % 100 == 0){ m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine(); }
+ }
+
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ if (subsample) { getAverageSTDMatrices(calcDistsTotals, i); }
+ if (consensus) { getConsensusTrees(calcDistsTotals, i); }
+ }
+
+
+ if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ if (phylip) { createPhylipFile(); }
+
printWSummaryFile();
- if (phylip) { createPhylipFile(); }
-
//clear out users groups
m->clearGroups();
- delete tmap; delete weighted;
+ delete tmap;
for (int i = 0; i < T.size(); i++) { delete T[i]; }
-
- if (m->control_pressed) {
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
- return 0;
- }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.weighted."); m->mothurOutEndLine();
}
}
/**************************************************************************************************/
+//Writes the mean and standard deviation of the subsampled weighted scores for one tree.
+//  dists   - one row per subsample iteration; row[k] is the score of the k-th group comparison.
+//  treeNum - zero-based tree index; treeNum+1 is embedded in the output file names.
+//Creates <outputDir><tree>N.weighted.ave.dist and <outputDir><tree>N.weighted.std.dist, registers both in
+//outputNames and outputTypes["phylip"], and lays them out according to outputForm:
+//"lt" (lower triangle), "square" (full matrix), anything else (one pair of groups per line).
+//Returns 0. Exceptions are reported through m->errorOut and terminate the program.
+int UnifracWeightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
+	try {
+		//Every loop and divisor below uses the number of rows actually present in dists.
+		//The deviation pass previously ran to iters (the random tree count), which does not have to
+		//match the number of subsample iterations and could index past the end of dists.
+		int numIters = dists.size();
+		int groupCount = m->getNumGroups();
+		
+		//sum each comparison's score over all iterations
+		vector<double> averages(numComp, 0.0);
+		for (int thisIter = 0; thisIter < numIters; thisIter++) {
+			for (int i = 0; i < dists[thisIter].size(); i++) {
+				averages[i] += dists[thisIter][i];
+			}
+		}
+		
+		//turn the sums into means
+		for (int i = 0; i < averages.size(); i++) { averages[i] /= (float) numIters; }
+		
+		//standard deviation: sum of squared differences from the mean, divided by the iteration count, square rooted
+		vector<double> stdDev(numComp, 0.0);
+		for (int thisIter = 0; thisIter < numIters; thisIter++) {
+			for (int j = 0; j < dists[thisIter].size(); j++) {
+				double diff = dists[thisIter][j] - averages[j];
+				stdDev[j] += diff * diff;
+			}
+		}
+		for (int i = 0; i < stdDev.size(); i++) {
+			stdDev[i] /= (float) numIters;
+			stdDev[i] = sqrt(stdDev[i]);
+		}
+		
+		//square matrices (groups x groups) holding the means and the standard deviations
+		vector< vector<double> > avedists(groupCount, vector<double>(groupCount, 0.0));
+		vector< vector<double> > stddists(groupCount, vector<double>(groupCount, 0.0));
+		
+		//the flat comparison list is ordered (1,0), (2,0), (2,1), ... ; mirror each value across the diagonal
+		int count = 0;
+		for (int r = 0; r < groupCount; r++) {
+			for (int l = 0; l < r; l++) {
+				avedists[r][l] = averages[count];
+				avedists[l][r] = averages[count];
+				stddists[r][l] = stdDev[count];
+				stddists[l][r] = stdDev[count];
+				count++;
+			}
+		}
+		
+		string aveFileName = outputDir + m->getSimpleName(treefile) + toString(treeNum+1) + ".weighted.ave.dist";
+		outputNames.push_back(aveFileName); outputTypes["phylip"].push_back(aveFileName);
+		
+		ofstream out;
+		m->openOutputFile(aveFileName, out);
+		
+		string stdFileName = outputDir + m->getSimpleName(treefile) + toString(treeNum+1) + ".weighted.std.dist";
+		outputNames.push_back(stdFileName); outputTypes["phylip"].push_back(stdFileName);
+		
+		ofstream outStd;
+		m->openOutputFile(stdFileName, outStd);
+		
+		//matrix formats start with the number of groups
+		if ((outputForm == "lt") || (outputForm == "square")) {
+			out << groupCount << endl;
+			outStd << groupCount << endl;
+		}
+		
+		//output to file
+		for (int r = 0; r < groupCount; r++) {
+			//row label, padded with spaces to at least 10 characters
+			string name = (m->getGroups())[r];
+			while (name.length() < 10) { name += " "; }
+			
+			if (outputForm == "lt") {
+				out << name << '\t';
+				outStd << name << '\t';
+				
+				//values below the diagonal only
+				for (int l = 0; l < r; l++) { out << avedists[r][l] << '\t'; outStd << stddists[r][l] << '\t'; }
+				out << endl; outStd << endl;
+			}else if (outputForm == "square") {
+				out << name << '\t';
+				outStd << name << '\t';
+				
+				//the whole row
+				for (int l = 0; l < groupCount; l++) { out << avedists[r][l] << '\t'; outStd << stddists[r][l] << '\t'; }
+				out << endl; outStd << endl;
+			}else{
+				//one line per pair of groups
+				for (int l = 0; l < r; l++) {
+					string otherName = (m->getGroups())[l];
+					while (otherName.length() < 10) { otherName += " "; }
+					
+					//a tab separates the second name from the value so names of 10 or more characters do not run into the number
+					out << name << '\t' << otherName << '\t' << avedists[r][l] << endl;
+					outStd << name << '\t' << otherName << '\t' << stddists[r][l] << endl;
+				}
+			}
+		}
+		out.close();
+		outStd.close();
+		
+		return 0;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "UnifracWeightedCommand", "getAverageSTDMatrices");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+//Builds one tree per row of dists (via buildTrees), asks Consensus for their consensus tree and
+//prints it to <outputDir><root of treefile>N.weighted.cons.tre, where N is treeNum+1.
+//  dists   - passed straight through to buildTrees.
+//  treeNum - zero-based tree index used in the output file names.
+//Returns 0, including when m->control_pressed is set after buildTrees (nothing is written in that case).
+//Exceptions are reported through m->errorOut and terminate the program.
+int UnifracWeightedCommand::getConsensusTrees(vector< vector<double> >& dists, int treeNum) {
+ try {
+
+ //used in tree constructor
+ m->runParse = false;
+
+ //create treemap class from groupmap for tree class to use
+ TreeMap newTmap;
+ newTmap.makeSim(m->getGroups());
+
+ //clear old tree names if any
+ m->Treenames.clear();
+
+ //fills globaldatas tree names
+ m->Treenames = m->getGroups();
+
+ vector<Tree*> newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created
+
+ //buildTrees has already deleted its trees and removed its output file when control_pressed is set
+ if (m->control_pressed) { return 0; }
+
+ //NOTE(review): the Tree* entries in newTrees are never deleted in this function - confirm whether Consensus::getTree takes ownership of them
+ Consensus con;
+ Tree* conTree = con.getTree(newTrees);
+
+ //create a new filename
+ string conFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".weighted.cons.tre";
+ outputNames.push_back(conFile); outputTypes["tree"].push_back(conFile);
+ ofstream outTree;
+ m->openOutputFile(conFile, outTree);
+
+ //the file is created and registered even when no consensus tree came back; it is then left empty
+ if (conTree != NULL) { conTree->print(outTree, "boot"); delete conTree; }
+ outTree.close();
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "UnifracWeightedCommand", "getConsensusTrees");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+//Turns every row of dists into a tree and appends each tree to <outputDir><root of treefile>N.weighted.all.tre (N = treeNum+1).
+//  dists   - dists[i] holds the distances of the i-th subsampled tree, one per group comparison.
+//  treeNum - zero-based tree index used in the output file name.
+//  mytmap  - tree map whose address is handed to every Tree that is created.
+//Returns the created trees. If m->control_pressed is seen, the trees are deleted, their slots set to NULL
+//and the output file removed before the vector is returned.
+vector<Tree*> UnifracWeightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, TreeMap& mytmap) {
+	try {
+		
+		vector<Tree*> trees;
+		
+		//name, register and open the file that collects every tree
+		string allTreesFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".weighted.all.tre";
+		outputNames.push_back(allTreesFile);
+		outputTypes["tree"].push_back(allTreesFile);
+		
+		ofstream outAll;
+		m->openOutputFile(allTreesFile, outAll);
+		
+		for (int iter = 0; iter < dists.size(); iter++) {
+			
+			if (m->control_pressed) { break; }
+			
+			int groupCount = m->getNumGroups();
+			
+			//square similarity matrix, zero filled
+			vector< vector<double> > sims(groupCount, vector<double>(groupCount, 0.0));
+			
+			//walk the lower triangle in the same order as the flat distance list, mirroring each value
+			int next = 0;
+			for (int row = 0; row < groupCount; row++) {
+				for (int col = 0; col < row; col++) {
+					double similarity = -(dists[iter][next]-1.0);
+					sims[row][col] = similarity;
+					sims[col][row] = similarity;
+					next++;
+				}
+			}
+			
+			//build the tree from the similarity matrix
+			Tree* built = new Tree(&mytmap, sims);
+			map<string, string> noNames;
+			built->assembleTree(noNames);
+			
+			trees.push_back(built);
+			
+			//add it to the .all.tre file
+			built->print(outAll);
+		}
+		
+		outAll.close();
+		
+		//interrupted: discard everything produced so far
+		if (m->control_pressed) {
+			for (int k = 0; k < trees.size(); k++) {
+				delete trees[k];
+				trees[k] = NULL;
+			}
+			m->mothurRemove(allTreesFile);
+		}
+		
+		return trees;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "UnifracWeightedCommand", "buildTrees");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
+//Scores iters randomized trees against thisTree's groups and records how significant the user's scores are.
+//  thisTree    - the user tree handed to driver / createProcesses.
+//  usersScores - the user tree's score for each group comparison (indexed like rScores).
+//For every comparison the random scores in rScores are sorted, findIndex locates the user's score and the
+//resulting significance is appended to WScoreSig; calculateFreqsCumuls and printWeightedFile are then called
+//and the output object is released.
+//Returns 0. If m->control_pressed is seen the function returns early without producing any results.
+//Exceptions are reported through m->errorOut and terminate the program.
+int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector<double> usersScores) {
+	try {
+		
+		//calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
+		vector< vector<string> > namesOfGroupCombos;
+		for (int a=0; a<numGroups; a++) {
+			for (int l = 0; l < a; l++) {
+				vector<string> groups; groups.push_back((m->getGroups())[a]); groups.push_back((m->getGroups())[l]);
+				namesOfGroupCombos.push_back(groups);
+			}
+		}
+		
+		lines.clear();
+		
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+		if(processors != 1){
+			//split the comparisons evenly between the processes; the last process also takes the remainder
+			int numPairs = namesOfGroupCombos.size();
+			int numPairsPerProcessor = numPairs / processors;
+			
+			for (int i = 0; i < processors; i++) {
+				int startPos = i * numPairsPerProcessor;
+				if(i == processors - 1){
+					numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
+				}
+				lines.push_back(linePair(startPos, numPairsPerProcessor));
+			}
+		}
+#endif
+		
+		
+		//get scores for random trees
+		for (int j = 0; j < iters; j++) {
+			
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+			if(processors == 1){
+				driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
+			}else{
+				createProcesses(thisTree, namesOfGroupCombos, rScores);
+			}
+#else
+			driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
+#endif
+			
+			//The calling loop checks control_pressed itself after this function returns and then releases tmap,
+			//the trees, output and the output files. Releasing them here as well would free them twice.
+			if (m->control_pressed) { return 0; }
+			
+			//report progress
+			//			m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
+		}
+		lines.clear();
+		
+		//find the significance of the score for summary file
+		for (int f = 0; f < numComp; f++) {
+			//sort random scores
+			sort(rScores[f].begin(), rScores[f].end());
+			
+			//the index of the score higher than yours is returned
+			//so if you have 1000 random trees the index returned is 100
+			//then there are 900 trees with a score greater than you.
+			//giving you a significance of 0.900
+			int index = findIndex(usersScores[f], f);    if (index == -1) { m->mothurOut("error in UnifracWeightedCommand"); m->mothurOutEndLine(); exit(1); } //error code
+			
+			//the significance is the number of trees with the users score or higher
+			WScoreSig.push_back((iters-index)/(float)iters);
+		}
+		
+		//out << "Tree# " << i << endl;
+		calculateFreqsCumuls();
+		printWeightedFile();
+		
+		//clear the pointer after releasing it so a later delete of output by the caller's cleanup is harmless
+		delete output;
+		output = NULL;
+		
+		return 0;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "UnifracWeightedCommand", "runRandomCalcs");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, vector< vector<double> >& scores) {
try {
int UnifracWeightedCommand::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, vector< vector<double> >& scores) {
try {
Tree* randT = new Tree(tmap);
-
+
+ Weighted weighted(includeRoot);
+
for (int h = start; h < (start+num); h++) {
if (m->control_pressed) { return 0; }
if (m->control_pressed) { delete randT; return 0; }
//get wscore of random tree
- EstOutput randomData = weighted->getValues(randT, groupA, groupB);
+ EstOutput randomData = weighted.getValues(randT, groupA, groupB);
if (m->control_pressed) { delete randT; return 0; }
exit(1);
}
}
-/*****************************************************************/
-int UnifracWeightedCommand::readNamesFile() {
- try {
- m->names.clear();
- numUniquesInName = 0;
-
- ifstream in;
- m->openInputFile(namefile, in);
-
- string first, second;
- map<string, string>::iterator itNames;
-
- while(!in.eof()) {
- in >> first >> second; m->gobble(in);
-
- numUniquesInName++;
-
- itNames = m->names.find(first);
- if (itNames == m->names.end()) {
- m->names[first] = second;
-
- //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
- vector<string> dupNames;
- m->splitAtComma(second, dupNames);
-
- for (int i = 0; i < dupNames.size(); i++) {
- nameMap[dupNames[i]] = dupNames[i];
- if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
- }
- }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
- }
- in.close();
-
- return 0;
- }
- catch(exception& e) {
- m->errorOut(e, "UnifracWeightedCommand", "readNamesFile");
- exit(1);
- }
-}
/***********************************************************/
linePair(int i, int j) : start(i), num(j) {}
};
vector<linePair> lines;
-
- ReadTree* read;
- SharedUtil* util;
+ TreeMap* tmap;
FileOutput* output;
vector<Tree*> T; //user trees
vector<double> utreeScores; //user tree unweighted scores
vector<double> WScoreSig; //tree weighted score signifigance when compared to random trees - percentage of random trees with that score or lower.
vector<string> groupComb; // AB. AC, BC...
- TreeMap* tmap;
- Weighted* weighted;
string sumFile, outputDir;
int iters, numGroups, numComp, counter;
- EstOutput userData; //weighted score info for user tree
- EstOutput randomData; //weighted score info for random trees
vector< vector<double> > rScores; //vector<weighted scores for random trees.> each group comb has an entry
vector< vector<double> > uScores; //vector<weighted scores for user trees.> each group comb has an entry
vector< map<float, float> > rScoreFreq; //map <weighted score, number of random trees with that score.> -vector entry for each combination.
vector< map<float, float> > rCumul; //map <weighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each c
map<float, float> validScores; //map contains scores from random
- bool abort, phylip, random, includeRoot;
+ bool abort, phylip, random, includeRoot, subsample, consensus;
string groups, itersString, outputForm, treefile, groupfile, namefile;
vector<string> Groups, outputNames; //holds groups to be used
- int processors, numUniquesInName;
+ int processors, subsampleSize, subsampleIters;
ofstream outSum;
map<string, string> nameMap;
void calculateFreqsCumuls();
int createProcesses(Tree*, vector< vector<string> >, vector< vector<double> >&);
int driver(Tree*, vector< vector<string> >, int, int, vector< vector<double> >&);
- int readNamesFile();
+ int runRandomCalcs(Tree*, vector<double>);
+ vector<Tree*> buildTrees(vector< vector<double> >&, int, TreeMap&);
+ int getConsensusTrees(vector< vector<double> >&, int);
+ int getAverageSTDMatrices(vector< vector<double> >&, int);
};
try {
processors = p;
outputDir = o;
-
+
+ TreeMap* tmap = t->getTreeMap();
+
//if the users enters no groups then give them the score of all groups
int numGroups = m->getNumGroups();
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
- data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+ data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
}else{
int numPairs = namesOfGroupCombos.size();
lines.push_back(linePair(startPos, numPairsPerProcessor));
}
- data = createProcesses(t, namesOfGroupCombos);
+ data = createProcesses(t, namesOfGroupCombos, tmap);
lines.clear();
}
#else
- data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+ data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
#endif
return data;
}
/**************************************************************************************************/
-EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
+EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
process++;
}else if (pid == 0){
EstOutput myresults;
- myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num);
+ myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
if (m->control_pressed) { exit(0); }
}
}
- results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num);
+ results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
//force parent to wait until all the processes are done
for (int i=0;i<(processors-1);i++) {
}
}
/**************************************************************************************************/
-EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num) {
+EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) {
try {
processors = p;
outputDir = o;
+ TreeMap* tmap = t->getTreeMap();
+
//if the users enters no groups then give them the score of all groups
int numGroups = m->getNumGroups();
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
- data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true);
+ data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap);
}else{
int numPairs = namesOfGroupCombos.size();
lines.push_back(linePair(startPos, numPairsPerProcessor));
}
- data = createProcesses(t, namesOfGroupCombos, true);
+ data = createProcesses(t, namesOfGroupCombos, true, tmap);
lines.clear();
}
#else
- data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true);
+ data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap);
#endif
return data;
}
/**************************************************************************************************/
-EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, bool usingGroups) {
+EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, bool usingGroups, TreeMap* tmap) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
process++;
}else if (pid == 0){
EstOutput myresults;
- myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups);
+ myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups, tmap);
if (m->control_pressed) { exit(0); }
}
}
- results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups);
+ results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups, tmap);
//force parent to wait until all the processes are done
for (int i=0;i<(processors-1);i++) {
}
}
/**************************************************************************************************/
-EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, bool usingGroups) {
+EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, bool usingGroups, TreeMap* tmap) {
try {
EstOutput results; results.resize(num);
class Unweighted : public TreeCalculator {
public:
- Unweighted(TreeMap* t, bool r) : tmap(t), includeRoot(r) {};
+ Unweighted(bool r) : includeRoot(r) {};
~Unweighted() {};
EstOutput getValues(Tree*, int, string);
EstOutput getValues(Tree*, string, string, int, string);
vector<linePair> lines;
EstOutput data;
- TreeMap* tmap;
int processors;
string outputDir;
map< vector<string>, set<int> > rootForGrouping; //maps a grouping combo to the roots for that combo
bool includeRoot;
- EstOutput driver(Tree*, vector< vector<string> >, int, int);
- EstOutput createProcesses(Tree*, vector< vector<string> >);
- EstOutput driver(Tree*, vector< vector<string> >, int, int, bool);
- EstOutput createProcesses(Tree*, vector< vector<string> >, bool);
+ EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*);
+ EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
+ EstOutput driver(Tree*, vector< vector<string> >, int, int, bool, TreeMap*);
+ EstOutput createProcesses(Tree*, vector< vector<string> >, bool, TreeMap*);
int getRoot(Tree*, int, vector<string>);
};
vector<double> D;
processors = p;
outputDir = o;
+
+ TreeMap* tmap = t->getTreeMap();
numGroups = m->getNumGroups();
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
- data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+ data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
}else{
int numPairs = namesOfGroupCombos.size();
lines.push_back(linePair(startPos, numPairsPerProcessor));
}
- data = createProcesses(t, namesOfGroupCombos);
+ data = createProcesses(t, namesOfGroupCombos, tmap);
lines.clear();
}
#else
- data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
+ data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
#endif
return data;
}
/**************************************************************************************************/
-EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
+EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
}else if (pid == 0){
EstOutput Myresults;
- Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num);
+ Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
- m->mothurOut("Merging results."); m->mothurOutEndLine();
+ //m->mothurOut("Merging results."); m->mothurOutEndLine();
//pass numSeqs to parent
ofstream out;
}
}
- results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num);
+ results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
//force parent to wait until all the processes are done
for (int i=0;i<(processors-1);i++) {
m->mothurRemove(s);
}
- m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine();
+ //m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine();
return results;
#endif
}
}
/**************************************************************************************************/
-EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num) {
+EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) {
try {
EstOutput results;
vector<double> D;
try {
data.clear(); //clear out old values
+
+ TreeMap* tmap = t->getTreeMap();
if (m->control_pressed) { return data; }
class Weighted : public TreeCalculator {
public:
- Weighted(TreeMap* t, bool r) : tmap(t), includeRoot(r) {};
+ Weighted( bool r) : includeRoot(r) {};
~Weighted() {};
EstOutput getValues(Tree*, string, string);
vector<linePair> lines;
EstOutput data;
- TreeMap* tmap;
map<string, int>::iterator it;
map<string, double> WScore; //a score for each group combination i.e. AB, AC, BC.
int processors;
map< vector<string>, set<int> > rootForGrouping; //maps a grouping combo to the root for that combo
bool includeRoot;
- EstOutput driver(Tree*, vector< vector<string> >, int, int);
- EstOutput createProcesses(Tree*, vector< vector<string> >);
+ EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*);
+ EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
double getLengthToRoot(Tree*, int, string, string);
};