From 57b3c96832667c1b70d4d526331f52e3d49e8237 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 27 Mar 2012 12:35:08 -0400 Subject: [PATCH] added pcr.seqs command. fixed bug in rarefacftion.single that caused parsing error if group names contained '_' in shared file. metastats - modified to more closely follow results from metastats website. trim.flows and trim.seqs allow for characters other than ATGC in reverse primers. removed name file warning when mother calls command internally. --- Mothur.xcodeproj/project.pbxproj | 18 +- chimeraperseuscommand.cpp | 5 +- chimeraslayercommand.cpp | 5 +- chimerauchimecommand.cpp | 5 +- classifyseqscommand.cpp | 12 +- commandfactory.cpp | 5 + deconvolutecommand.cpp | 5 +- filterseqscommand.cpp | 7 +- fisher2.c | 2158 ------------------------------ fisher2.h | 29 - makefile | 10 +- metastats.h | 45 - metastats2.c | 559 -------- metastatscommand.cpp | 16 +- mothurmetastats.cpp | 675 ++++------ mothurmetastats.h | 7 +- mothurout.h | 3 +- optionparser.cpp | 2 +- pcrseqscommand.h | 356 +++++ phylotypecommand.cpp | 5 +- prcseqscommand.cpp | 1027 ++++++++++++++ preclustercommand.cpp | 5 +- rarefactcommand.cpp | 43 +- screenseqscommand.cpp | 3 +- screenseqscommand.h | 7 +- sharedcommand.cpp | 41 +- shhhercommand.cpp | 38 +- shhhseqscommand.cpp | 5 +- subsamplecommand.cpp | 6 +- trialswap2.h | 4 +- trimflowscommand.cpp | 50 +- trimflowscommand.h | 3 +- trimseqscommand.cpp | 47 +- trimseqscommand.h | 1 + 34 files changed, 1896 insertions(+), 3311 deletions(-) delete mode 100644 fisher2.c delete mode 100644 fisher2.h delete mode 100644 metastats.h delete mode 100644 metastats2.c create mode 100644 pcrseqscommand.h create mode 100644 prcseqscommand.cpp diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 4d52554..2550873 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -44,6 +44,7 @@ A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; }; A754149714840CF7005850D1 /* summaryqualcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A754149614840CF7005850D1 /* summaryqualcommand.cpp */; }; A75790591301749D00A30DAB /* homovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75790581301749D00A30DAB /* homovacommand.cpp */; }; + A76CDD821510F143004C8458 /* prcseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A76CDD811510F143004C8458 /* prcseqscommand.cpp */; }; A7730EFF13967241007433A3 /* countseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7730EFE13967241007433A3 /* countseqscommand.cpp */; }; A774101414695AF60098E6AC /* shhhseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774101314695AF60098E6AC /* shhhseqscommand.cpp */; }; A774104814696F320098E6AC /* myseqdist.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774104614696F320098E6AC /* myseqdist.cpp */; }; @@ -129,7 +130,6 @@ A7E9B8C412D37EC400DA6239 /* fastamap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6DE12D37EC400DA6239 /* fastamap.cpp */; }; A7E9B8C512D37EC400DA6239 /* fileoutput.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E012D37EC400DA6239 /* fileoutput.cpp */; }; A7E9B8C612D37EC400DA6239 /* filterseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E312D37EC400DA6239 /* filterseqscommand.cpp */; }; - A7E9B8C712D37EC400DA6239 /* fisher2.c in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E512D37EC400DA6239 /* fisher2.c */; }; A7E9B8C812D37EC400DA6239 /* flowdata.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E712D37EC400DA6239 /* flowdata.cpp */; }; A7E9B8C912D37EC400DA6239 /* formatcolumn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E912D37EC400DA6239 /* formatcolumn.cpp */; }; A7E9B8CA12D37EC400DA6239 /* formatphylip.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6EC12D37EC400DA6239 /* formatphylip.cpp */; }; @@ -183,7 +183,6 @@ A7E9B8FB12D37EC400DA6239 /* memeuclidean.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B74F12D37EC400DA6239 /* memeuclidean.cpp */; }; A7E9B8FC12D37EC400DA6239 /* mempearson.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75112D37EC400DA6239 /* mempearson.cpp */; }; A7E9B8FD12D37EC400DA6239 /* mergefilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75312D37EC400DA6239 /* mergefilecommand.cpp */; }; - A7E9B8FE12D37EC400DA6239 /* metastats2.c in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75612D37EC400DA6239 /* metastats2.c */; }; A7E9B8FF12D37EC400DA6239 /* metastatscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75712D37EC400DA6239 /* metastatscommand.cpp */; }; A7E9B90012D37EC400DA6239 /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75912D37EC400DA6239 /* mgclustercommand.cpp */; }; A7E9B90112D37EC400DA6239 /* mothur.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75B12D37EC400DA6239 /* mothur.cpp */; }; @@ -458,6 +457,8 @@ A754149614840CF7005850D1 /* summaryqualcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = summaryqualcommand.cpp; sourceTree = ""; }; A75790571301749D00A30DAB /* homovacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = homovacommand.h; sourceTree = ""; }; A75790581301749D00A30DAB /* homovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = homovacommand.cpp; sourceTree = ""; }; + A76CDD7F1510F09A004C8458 /* pcrseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcrseqscommand.h; sourceTree = ""; }; + A76CDD811510F143004C8458 /* prcseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = prcseqscommand.cpp; sourceTree = ""; }; A7730EFD13967241007433A3 /* countseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countseqscommand.h; sourceTree = ""; }; A7730EFE13967241007433A3 /* countseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = countseqscommand.cpp; sourceTree = ""; }; A774101214695AF60098E6AC /* shhhseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = shhhseqscommand.h; sourceTree = ""; }; @@ -641,8 +642,6 @@ A7E9B6E212D37EC400DA6239 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = ""; }; A7E9B6E312D37EC400DA6239 /* filterseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = filterseqscommand.cpp; sourceTree = ""; }; A7E9B6E412D37EC400DA6239 /* filterseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filterseqscommand.h; sourceTree = ""; }; - A7E9B6E512D37EC400DA6239 /* fisher2.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fisher2.c; sourceTree = ""; }; - A7E9B6E612D37EC400DA6239 /* fisher2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fisher2.h; sourceTree = ""; }; A7E9B6E712D37EC400DA6239 /* flowdata.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = flowdata.cpp; sourceTree = ""; }; A7E9B6E812D37EC400DA6239 /* flowdata.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = flowdata.h; sourceTree = ""; }; A7E9B6E912D37EC400DA6239 /* formatcolumn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatcolumn.cpp; sourceTree = ""; }; @@ -751,8 +750,6 @@ A7E9B75212D37EC400DA6239 /* mempearson.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mempearson.h; sourceTree = ""; }; A7E9B75312D37EC400DA6239 /* mergefilecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergefilecommand.cpp; sourceTree = ""; }; A7E9B75412D37EC400DA6239 /* mergefilecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mergefilecommand.h; sourceTree = ""; }; - A7E9B75512D37EC400DA6239 /* metastats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = metastats.h; sourceTree = ""; }; - A7E9B75612D37EC400DA6239 /* metastats2.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = metastats2.c; sourceTree = ""; }; A7E9B75712D37EC400DA6239 /* metastatscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = metastatscommand.cpp; sourceTree = ""; }; A7E9B75812D37EC400DA6239 /* metastatscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = metastatscommand.h; sourceTree = ""; }; A7E9B75912D37EC400DA6239 /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = ""; }; @@ -1451,6 +1448,8 @@ A7FC486612D795D60055BC5C /* pcacommand.cpp */, A7E9B78812D37EC400DA6239 /* pcoacommand.h */, A7E9B78712D37EC400DA6239 /* pcoacommand.cpp */, + A76CDD7F1510F09A004C8458 /* pcrseqscommand.h */, + A76CDD811510F143004C8458 /* prcseqscommand.cpp */, A7E9B78C12D37EC400DA6239 /* phylodiversitycommand.h */, A7E9B78B12D37EC400DA6239 /* phylodiversitycommand.cpp */, A7E9B79212D37EC400DA6239 /* phylotypecommand.h */, @@ -1883,10 +1882,6 @@ isa = PBXGroup; children = ( A7D161E7149F7F50000523E8 /* fortran */, - A7E9B6E512D37EC400DA6239 /* fisher2.c */, - A7E9B6E612D37EC400DA6239 /* fisher2.h */, - A7E9B75512D37EC400DA6239 /* metastats.h */, - A7E9B75612D37EC400DA6239 /* metastats2.c */, A79234D513C74BF6002B08E2 /* mothurfisher.h */, A79234D613C74BF6002B08E2 /* mothurfisher.cpp */, A73DDC3613C4BF64006AAE38 /* mothurmetastats.h */, @@ -2028,7 +2023,6 @@ A7E9B8C412D37EC400DA6239 /* fastamap.cpp in Sources */, A7E9B8C512D37EC400DA6239 /* fileoutput.cpp in Sources */, A7E9B8C612D37EC400DA6239 /* filterseqscommand.cpp in Sources */, - A7E9B8C712D37EC400DA6239 /* fisher2.c in Sources */, A7E9B8C812D37EC400DA6239 /* flowdata.cpp in Sources */, A7E9B8C912D37EC400DA6239 /* formatcolumn.cpp in Sources */, A7E9B8CA12D37EC400DA6239 /* formatphylip.cpp in Sources */, @@ -2082,7 +2076,6 @@ A7E9B8FB12D37EC400DA6239 /* memeuclidean.cpp in Sources */, A7E9B8FC12D37EC400DA6239 /* mempearson.cpp in Sources */, A7E9B8FD12D37EC400DA6239 /* mergefilecommand.cpp in Sources */, - A7E9B8FE12D37EC400DA6239 /* metastats2.c in Sources */, A7E9B8FF12D37EC400DA6239 /* metastatscommand.cpp in Sources */, A7E9B90012D37EC400DA6239 /* mgclustercommand.cpp in Sources */, A7E9B90112D37EC400DA6239 /* mothur.cpp in Sources */, @@ -2301,6 +2294,7 @@ A7EEB0F514F29BFE00344B83 /* classifytreecommand.cpp in Sources */, A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */, A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */, + A76CDD821510F143004C8458 /* prcseqscommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index 138a6b9..e7294a8 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -466,14 +466,15 @@ string ChimeraPerseusCommand::getNamesFile(string& inputFile){ string inputString = "fasta=" + inputFile; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - + m->mothurCalling = true; + Command* uniqueCommand = new DeconvoluteCommand(inputString); uniqueCommand->execute(); map > filenames = uniqueCommand->getOutputFiles(); delete uniqueCommand; - + m->mothurCalling = false; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); nameFile = filenames["name"][0]; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 8647e75..2c435ca 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -1158,14 +1158,15 @@ string ChimeraSlayerCommand::getNamesFile(string& inputFile){ string inputString = "fasta=" + inputFile; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - + m->mothurCalling = true; + Command* uniqueCommand = new DeconvoluteCommand(inputString); uniqueCommand->execute(); map > filenames = uniqueCommand->getOutputFiles(); delete uniqueCommand; - + m->mothurCalling = false; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); nameFile = filenames["name"][0]; diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index 98976ce..f238094 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -927,14 +927,15 @@ string ChimeraUchimeCommand::getNamesFile(string& inputFile){ string inputString = "fasta=" + inputFile; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - + m->mothurCalling = true; + Command* uniqueCommand = new DeconvoluteCommand(inputString); uniqueCommand->execute(); map > filenames = uniqueCommand->getOutputFiles(); delete uniqueCommand; - + m->mothurCalling = false; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); nameFile = filenames["name"][0]; diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 664db6b..0504e66 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -457,10 +457,14 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { search = "kmer"; } - if (namefileNames.size() == 0){ - vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); - parser.getNameFile(files); - } + if (!abort) { + if (namefileNames.size() == 0){ + if (fastaFileNames.size() != 0) { + vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); + parser.getNameFile(files); + } + } + } } diff --git a/commandfactory.cpp b/commandfactory.cpp index bdcfbba..579abe8 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -130,6 +130,7 @@ #include "sortseqscommand.h" #include "classifytreecommand.h" #include "cooccurrencecommand.h" +#include "pcrseqscommand.h" /*******************************************************/ @@ -281,6 +282,7 @@ CommandFactory::CommandFactory(){ commands["sort.seqs"] = "sort.seqs"; commands["classify.tree"] = "classify.tree"; commands["cooccurrence"] = "cooccurrence"; + commands["pcr.seqs"] = "pcr.seqs"; commands["quit"] = "MPIEnabled"; } @@ -445,6 +447,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "sort.seqs") { command = new SortSeqsCommand(optionString); } else if(commandName == "classify.tree") { command = new ClassifyTreeCommand(optionString); } else if(commandName == "cooccurrence") { command = new CooccurrenceCommand(optionString); } + else if(commandName == "pcr.seqs") { command = new PcrSeqsCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -593,6 +596,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "sort.seqs") { pipecommand = new SortSeqsCommand(optionString); } else if(commandName == "classify.tree") { pipecommand = new ClassifyTreeCommand(optionString); } else if(commandName == "cooccurrence") { pipecommand = new CooccurrenceCommand(optionString); } + else if(commandName == "pcr.seqs") { pipecommand = new PcrSeqsCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -729,6 +733,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "sort.seqs") { shellcommand = new SortSeqsCommand(); } else if(commandName == "classify.tree") { shellcommand = new ClassifyTreeCommand(); } else if(commandName == "cooccurrence") { shellcommand = new CooccurrenceCommand(); } + else if(commandName == "pcr.seqs") { shellcommand = new PcrSeqsCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index 3541e00..3d0c0d5 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -154,7 +154,10 @@ int DeconvoluteCommand::execute() { map nameMap; map::iterator itNames; - if (oldNameMapFName != "") { m->readNames(oldNameMapFName, nameMap); } + if (oldNameMapFName != "") { + m->readNames(oldNameMapFName, nameMap); + if (oldNameMapFName == outNameFile){ outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique.names"; } + } if (m->control_pressed) { return 0; } diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index 93e18e8..a7d42b3 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -125,7 +125,12 @@ FilterSeqsCommand::FilterSeqsCommand(string option) { fasta = validParameter.validFile(parameters, "fasta", false); if (fasta == "not found") { fasta = m->getFastaFile(); - if (fasta != "") { fastafileNames.push_back(fasta); m->mothurOut("Using " + fasta + " as input file for the fasta parameter."); m->mothurOutEndLine(); } + if (fasta != "") { + fastafileNames.push_back(fasta); + m->mothurOut("Using " + fasta + " as input file for the fasta parameter."); m->mothurOutEndLine(); + string simpleName = m->getSimpleName(fasta); + filterFileName += simpleName.substr(0, simpleName.find_first_of('.')); + } else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } } else { diff --git a/fisher2.c b/fisher2.c deleted file mode 100644 index c861834..0000000 --- a/fisher2.c +++ /dev/null @@ -1,2158 +0,0 @@ -#include "fisher2.h" - - -static void f2xact(int *nrow, int *ncol, double *table, int *ldtabl, - double *expect, double *percnt, double *emin, double - *prt, double *pre, double *fact, int *ico, int - *iro, int *kyy, int *idif, int *irn, int *key, - int *ldkey, int *ipoin, double *stp, int *ldstp, - int *ifrq, double *dlp, double *dsp, double *tm, - int *key2, int *iwk, double *rwk); -static void f3xact(int *nrow, int *irow, int *ncol, int *icol, - double *dlp, int *mm, double *fact, int *ico, int - *iro, int *it, int *lb, int *nr, int *nt, int - *nu, int *itc, int *ist, double *stv, double *alen, - const double *tol); -static void f4xact(int *nrow, int *irow, int *ncol, int *icol, - double *dsp, double *fact, int *icstk, int *ncstk, - int *lstk, int *mstk, int *nstk, int *nrstk, int - *irstk, double *ystk, const double *tol); -static void f5xact(double *pastp, const double *tol, int *kval, int *key, - int *ldkey, int *ipoin, double *stp, int *ldstp, - int *ifrq, int *npoin, int *nr, int *nl, int - *ifreq, int *itop, int *ipsh); -static void f6xact(int *nrow, int *irow, int *iflag, int *kyy, - int *key, int *ldkey, int *last, int *ipn); -static void f7xact(int *nrow, int *imax, int *idif, int *k, int *ks, - int *iflag); -static void f8xact(int *irow, int *is, int *i1, int *izero, int *myNew); -static double f9xact(int *n, int *mm, int *ir, double *fact); -static void f10act(int *nrow, int *irow, int *ncol, int *icol, - double *val, int *xmin, double *fact, int *nd, - int *ne, int *m); -static void f11act(int *irow, int *i1, int *i2, int *myNew); -static void prterr(int icode, char *mes); -static int iwork(int iwkmax, int *iwkpt, int number, int itype); -// void fexact(int *nrow, int *ncol, double *table, int *ldtabl, -// double *expect, double *percnt, double *emin, double *prt, -// double *pre, /* myNew in C : */ int *workspace); - static void isort(int *n, int *ix); - static double gammds(double *y, double *p, int *ifault); - static double alogam(double *x, int *ifault); - - -/* The only public function : */ -void fexact(int *nrow, int *ncol, double *table, int *ldtabl, - double *expect, double *percnt, double *emin, double *prt, - double *pre, /* myNew in C : */ int *workspace) { - -/* - ALGORITHM 643, COLLECTED ALGORITHMS FROM ACM. - THIS WORK PUBLISHED IN TRANSACTIONS ON MATHEMATICAL SOFTWARE, - VOL. 19, NO. 4, DECEMBER, 1993, PP. 484-488. - ----------------------------------------------------------------------- - Name: FEXACT - Purpose: Computes Fisher's exact test probabilities and a hybrid - approximation to Fisher exact test probabilities for a - contingency table using the network algorithm. - Usage: CALL FEXACT (NROW, NCOL, TABLE, LDTABL, EXPECT, PERCNT, - EMIN, PRT, PRE) - Arguments: - NROW - The number of rows in the table. (Input) - NCOL - The number of columns in the table. (Input) - TABLE - NROW by NCOL matrix containing the contingency - table. (Input) - LDTABL - Leading dimension of TABLE exactly as specified - in the dimension statement in the calling - program. (Input) - EXPECT - Expected value used in the hybrid algorithm for - deciding when to use asymptotic theory - probabilities. (Input) - If EXPECT <= 0.0 then asymptotic theory probabilities - are not used and Fisher exact test probabilities are - computed. Otherwise, if PERCNT or more of the cells in - the remaining table have estimated expected values of - EXPECT or more, with no remaining cell having expected - value less than EMIN, then asymptotic chi-squared - probabilities are used. See the algorithm section of the - manual document for details. - Use EXPECT = 5.0 to obtain the 'Cochran' condition. - PERCNT - Percentage of remaining cells that must have - estimated expected values greater than EXPECT - before asymptotic probabilities can be used. (Input) - See argument EXPECT for details. - Use PERCNT = 80.0 to obtain the 'Cochran' condition. - EMIN - Minimum cell estimated expected value allowed for - asymptotic chi-squared probabilities to be used. (Input) - See argument EXPECT for details. - Use EMIN = 1.0 to obtain the 'Cochran' condition. - PRT - Probability of the observed table for fixed - marginal totals. (Output) - PRE - Table p-value. (Output) - PRE is the probability of a more extreme table, - where `extreme' is in a probabilistic sense. - If EXPECT < 0 then the Fisher exact probability - is returned. Otherwise, an approximation to the - Fisher exact probability is computed based upon - asymptotic chi-squared probabilities for ``large'' - table expected values. The user defines ``large'' - through the arguments EXPECT, PERCNT, and EMIN. - - Remarks: - 1. For many problems one megabyte or more of workspace can be - required. If the environment supports it, the user should begin - by increasing the workspace used to 200,000 units. - 2. In FEXACT, LDSTP = 30*LDKEY. The proportion of table space used - by STP may be changed by changing the line MULT = 30 below to - another value. - 3. FEXACT may be converted to single precision by setting IREAL = 3, - and converting all DOUBLE PRECISION specifications (except the - specifications for RWRK, IWRK, and DWRK) to REAL. This will - require changing the names and specifications of the intrinsic - functions ALOG, AMAX1, AMIN1, EXP, and REAL. In addition, the - machine specific constants will need to be changed, and the name - DWRK will need to be changed to RWRK in the call to F2XACT. - 4. Machine specific constants are specified and documented in F2XACT. - A missing value code is specified in both FEXACT and F2XACT. - 5. Although not a restriction, is is not generally practical to call - this routine with large tables which are not sparse and in - which the 'hybrid' algorithm has little effect. For example, - although it is feasible to compute exact probabilities for the - table - 1 8 5 4 4 2 2 - 5 3 3 4 3 1 0 - 10 1 4 0 0 0 0, - computing exact probabilities for a similar table which has been - enlarged by the addition of an extra row (or column) may not be - feasible. - ----------------------------------------------------------------------- - */ - - /* CONSTANT Parameters : */ - - /* To increase the length of the table of paste path lengths relative - to the length of the hash table, increase MULT. - */ - const int mult = 30; - /* AMISS is a missing value indicator which is returned when the - probability is not defined. - */ - const double amiss = -12345.; - /* - Set IREAL = 4 for DOUBLE PRECISION - Set IREAL = 3 for SINGLE PRECISION - */ -#define i_real 4 -#define i_int 2 - - /* System generated locals */ - int ikh; - /* Local variables */ - int nco, nro, ntot, numb, iiwk, irwk; - int i, j, k, kk, ldkey, ldstp, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10; - int i3a, i3b, i3c, i9a, iwkmax, iwkpt; - - /* Workspace Allocation (freed at end) */ - double *equiv; - iwkmax = 2 * (int) (*workspace / 2); -// equiv = (double *) R_alloc(iwkmax / 2, sizeof(double)); - equiv = (double *) calloc(iwkmax / 2, sizeof(double)); - - /* The check could never happen with Calloc! - equiv = Calloc(iwkmax / 2, double); - if (!equiv) { - prterr(0, "Can not allocate specified workspace"); - } */ - -#define dwrk (equiv) -#define iwrk ((int *)equiv) -#define rwrk ((float *)equiv) - - /* Parameter adjustments */ - table -= *ldtabl + 1; - - /* Function Body */ - iwkpt = 0; - - if (*nrow > *ldtabl) - prterr(1, "NROW must be less than or equal to LDTABL."); - - ntot = 0; - for (i = 1; i <= *nrow; ++i) { - for (j = 1; j <= *ncol; ++j) { - if (table[i + j * *ldtabl] < 0.) - prterr(2, "All elements of TABLE must be positive."); - ntot = (int) (ntot + table[i + j * *ldtabl]); - } - } - if (ntot == 0) { - prterr(3, "All elements of TABLE are zero.\n" - "PRT and PRE are set to missing values."); - *prt = amiss; - *pre = amiss; - goto L_End; - } - - nco = max(*nrow, *ncol); - nro = *nrow + *ncol - nco;/* = min(*nrow, *ncol) */ - k = *nrow + *ncol + 1; - kk = k * nco; - - ikh = ntot + 1; - i1 = iwork(iwkmax, &iwkpt, ikh, i_real); - i2 = iwork(iwkmax, &iwkpt, nco, i_int); - i3 = iwork(iwkmax, &iwkpt, nco, i_int); - i3a = iwork(iwkmax, &iwkpt, nco, i_int); - i3b = iwork(iwkmax, &iwkpt, nro, i_int); - i3c = iwork(iwkmax, &iwkpt, nro, i_int); - ikh = max(k * 5 + (kk << 1), nco * 7 + 800); - iiwk= iwork(iwkmax, &iwkpt, ikh, i_int); - ikh = max(nco + 401, k); - irwk= iwork(iwkmax, &iwkpt, ikh, i_real); - - /* NOTE: - What follows below splits the remaining amount iwkmax - iwkpt of - (int) workspace into hash tables as follows. - type size index - INT 2 * ldkey i4 i5 i11 - REAL 2 * ldkey i8 i9 i10 - REAL 2 * ldstp i6 - INT 6 * ldstp i7 - Hence, we need ldkey times - 3 * 2 + 3 * 2 * s + 2 * mult * s + 6 * mult - chunks of integer memory, where s = sizeof(REAL) / sizeof(INT). - If doubles are used and are twice as long as ints, this gives - 18 + 10 * mult - so that the value of ldkey can be obtained by dividing available - (int) workspace by this number. - - In fact, because iwork() can actually s * n + s - 1 int chunks - when allocating a REAL, we use ldkey = available / numb - 1. - - FIXME: - Can we always assume that sizeof(double) / sizeof(int) is 2? - */ - - if (i_real == 4) { /* Double precision reals */ - numb = 18 + 10 * mult; - } else { /* Single precision reals */ - numb = (mult << 3) + 12; - } - ldkey = (iwkmax - iwkpt) / numb - 1; - ldstp = mult * ldkey; - ikh = ldkey << 1; i4 = iwork(iwkmax, &iwkpt, ikh, i_int); - ikh = ldkey << 1; i5 = iwork(iwkmax, &iwkpt, ikh, i_int); - ikh = ldstp << 1; i6 = iwork(iwkmax, &iwkpt, ikh, i_real); - ikh = ldstp * 6; i7 = iwork(iwkmax, &iwkpt, ikh, i_int); - ikh = ldkey << 1; i8 = iwork(iwkmax, &iwkpt, ikh, i_real); - ikh = ldkey << 1; i9 = iwork(iwkmax, &iwkpt, ikh, i_real); - ikh = ldkey << 1; i9a = iwork(iwkmax, &iwkpt, ikh, i_real); - ikh = ldkey << 1; i10 = iwork(iwkmax, &iwkpt, ikh, i_int); - - /* To convert to double precision, change RWRK to DWRK in the next CALL. - */ - f2xact(nrow, - ncol, - &table[*ldtabl + 1], - ldtabl, - expect, - percnt, - emin, - prt, - pre, - dwrk + i1, - iwrk + i2, - iwrk + i3, - iwrk + i3a, - iwrk + i3b, - iwrk + i3c, - iwrk + i4, - &ldkey, - iwrk + i5, - dwrk + i6, - &ldstp, - iwrk + i7, - dwrk + i8, - dwrk + i9, - dwrk + i9a, - iwrk + i10, - iwrk + iiwk, - dwrk + irwk); - -L_End: - /* Free(equiv); */ - free(equiv); - return; -} - -#undef rwrk -#undef iwrk -#undef dwrk - - -/* - ----------------------------------------------------------------------- - Name: F2XACT - Purpose: Computes Fisher's exact test for a contingency table, - routine with workspace variables specified. - Usage: F2XACT (NROW, NCOL, TABLE, LDTABL, EXPECT, PERCNT, - EMIN, PRT, PRE, FACT, ICO, IRO, KYY, IDIF, - IRN, KEY, LDKEY, IPOIN, STP, LDSTP, IFRQ, - DLP, DSP, TM, KEY2, IWK, RWK) - ----------------------------------------------------------------------- - */ -void -f2xact(int *nrow, int *ncol, double *table, int *ldtabl, - double *expect, double *percnt, double *emin, double *prt, - double *pre, double *fact, int *ico, int *iro, int *kyy, - int *idif, int *irn, int *key, int *ldkey, int *ipoin, - double *stp, int *ldstp, int *ifrq, double *dlp, double *dsp, - double *tm, int *key2, int *iwk, double *rwk) -{ - /* IMAX is the largest representable int on the machine. */ - const int imax = SINT_MAX; -// const int imax = 2147483647; //xx: I DONÂ¥T like this, and -// thanks to the hint from Jason Turner I don't do it anymore. (R.D-U). - - /* AMISS is a missing value indicator which is returned when the - probability is not defined. */ - const double amiss = -12345.; - - /* TOL is chosen as the square root of the smallest relative spacing. */ -#ifndef Macintosh - const static double tol = 3.45254e-7; -#else - static double tol = 3.45254e-7; -#endif - /* EMX is a large positive value used in comparing expected values. */ - const static double emx = 1e30; - - /* Local variables {{any really need to be static ???}} */ - static int kval, kmax, jkey, last, ipsh, itmp, itop, jstp, ntot, - jstp2, jstp3, jstp4, i, ii, j, k, n, iflag, ncell, ifreq, chisq, - ikkey, ikstp, ikstp2, k1, kb, kd, ks, - i31, i32, i33, i34, i35, i36, i37, i38, i39, - i41, i42, i43, i44, i45, i46, i47, i48, i310, i311, - nco, nrb, ipn, ipo, itp, nro, nro2; - static double dspt, dd, df,ddf, drn,dro, emn, obs, obs2, obs3, - pastp, pv, tmp; - double d1; -#ifndef USING_R - double d2; - static int ifault; -#endif - int nr_gt_nc=0; - - /* Parameter adjustments */ - table -= *ldtabl + 1; - --ico; - --iro; - --kyy; - --idif; - --irn; - --key; - --ipoin; - --stp; - --ifrq; - --dlp; - --dsp; - --tm; - --key2; - --iwk; - --rwk; - - - /* Check table dimensions */ - if (*nrow > *ldtabl) - prterr(1, "NROW must be less than or equal to LDTABL."); - if (*ncol <= 1) - prterr(4, "NCOL must be at least 2"); - - /* Initialize KEY array */ - for (i = 1; i <= *ldkey << 1; ++i) { - key[i] = -9999; - key2[i] = -9999; - } - /* Initialize parameters */ - *pre = 0.; - itop = 0; - if (*expect > 0.) - emn = *emin; - else - emn = emx; - if (*nrow > *ncol){ - nr_gt_nc = 1; -} -else{ - nr_gt_nc = 0; -} - /* nco := max(nrow, ncol) : */ - if(nr_gt_nc) - nco = *nrow; - else - nco = *ncol; - /* Initialize pointers for workspace */ - /* f3xact */ - i31 = 1; - i32 = i31 + nco; - i33 = i32 + nco; - i34 = i33 + nco; - i35 = i34 + nco; - i36 = i35 + nco; - i37 = i36 + nco; - i38 = i37 + nco; - i39 = i38 + 400; - i310 = 1; - i311 = 401; - /* f4xact */ - k = *nrow + *ncol + 1; - i41 = 1; - i42 = i41 + k; - i43 = i42 + k; - i44 = i43 + k; - i45 = i44 + k; - i46 = i45 + k; - i47 = i46 + k * nco; - i48 = 1; - - /* Compute row marginals and total */ - ntot = 0; - for (i = 1; i <= *nrow; ++i) { - iro[i] = 0; - for (j = 1; j <= *ncol; ++j) { - if (table[i + j * *ldtabl] < -1e-4) - prterr(2, "All elements of TABLE must be positive."); - iro[i] += (int) table[i + j * *ldtabl]; - } - ntot += iro[i]; - } - - if (ntot == 0) { - prterr(3, "All elements of TABLE are zero.\n" - "PRT and PRE are set to missing values."); - *pre = *prt = amiss; - return; - } - - /* Column marginals */ - for (i = 1; i <= *ncol; ++i) { - ico[i] = 0; - for (j = 1; j <= *nrow; ++j) - ico[i] += (int) table[j + i * *ldtabl]; - } - - /* sort marginals */ - isort(nrow, &iro[1]); - isort(ncol, &ico[1]); - - /* Determine row and column marginals. - Define max(nrow,ncol) =: nco >= nro := min(nrow,ncol) - nco is defined above - - Swap marginals if necessary to ico[1:nco] & iro[1:nro] - */ - if (nr_gt_nc) { - nro = *ncol; - /* Swap marginals */ - for (i = 1; i <= nco; ++i) { - itmp = iro[i]; - if (i <= nro) - iro[i] = ico[i]; - ico[i] = itmp; - } - } else - nro = *nrow; - - - /* Get multiplers for stack */ - kyy[1] = 1; - for (i = 2; i <= nro; ++i) { - /* Hash table multipliers */ - if (iro[i - 1] + 1 <= imax / kyy[i - 1]) { - kyy[i] = kyy[i - 1] * (iro[i - 1] + 1); - j /= kyy[i - 1]; - } else - goto L_ERR_5; - } - /* Maximum product */ - if (iro[nro - 1] + 1 <= imax / kyy[nro - 1]) { - kmax = (iro[nro] + 1) * kyy[nro - 1]; - } else { - L_ERR_5: - prterr(5, "The hash table key cannot be computed because " - "the largest key\n" - "is larger than the largest representable int.\n" - "The algorithm cannot proceed.\n" - "Reduce the workspace size, or use `exact = FALSE'."); - return; - } - - /* Compute log factorials */ - fact[0] = 0.; - fact[1] = 0.; - if(ntot >= 2) fact[2] = log(2.); -/* MM: old code assuming log() to be SLOW */ - for (i = 3; i <= ntot; i += 2) { - fact[i] = fact[i - 1] + log((double) i); - j = i + 1; - if (j <= ntot) - fact[j] = fact[i] + fact[2] + fact[j / 2] - fact[j / 2 - 1]; - } - /* Compute obs := observed path length */ - obs = tol; - ntot = 0; - for (j = 1; j <= nco; ++j) { - dd = 0.; - for (i = 1; i <= nro; ++i) { - if (nr_gt_nc) { - dd += fact[(int) table[j + i * *ldtabl]]; - ntot += (int) table[j + i * *ldtabl]; - } else { - dd += fact[(int) table[i + j * *ldtabl]]; - ntot += (int) table[i + j * *ldtabl]; - } - } - obs += fact[ico[j]] - dd; - } - /* Denominator of observed table: DRO */ - dro = f9xact(&nro, &ntot, &iro[1], fact); - *prt = exp(obs - dro); - /* Initialize pointers */ - k = nco; - last = *ldkey + 1; - jkey = *ldkey + 1; - jstp = *ldstp + 1; - jstp2 = *ldstp * 3 + 1; - jstp3 = (*ldstp << 2) + 1; - jstp4 = *ldstp * 5 + 1; - ikkey = 0; - ikstp = 0; - ikstp2 = *ldstp << 1; - ipo = 1; - ipoin[1] = 1; - stp[1] = 0.; - ifrq[1] = 1; - ifrq[ikstp2 + 1] = -1; - -Outer_Loop: - kb = nco - k + 1; - ks = 0; - n = ico[kb]; - kd = nro + 1; - kmax = nro; - /* IDIF is the difference in going to the daughter */ - for (i = 1; i <= nro; ++i) - idif[i] = 0; - - /* Generate the first daughter */ - do { - --kd; - ntot = min(n, iro[kd]); - idif[kd] = ntot; - if (idif[kmax] == 0) - --kmax; - n -= ntot; - } - while (n > 0 && kd != 1); - - if (n != 0) { - goto L310; - } - - k1 = k - 1; - n = ico[kb]; - ntot = 0; - for (i = kb + 1; i <= nco; ++i) - ntot += ico[i]; - - -L150: - /* Arc to daughter length=ICO(KB) */ - for (i = 1; i <= nro; ++i) - irn[i] = iro[i] - idif[i]; - - /* Sort irn */ - if (k1 > 1) { - if (nro == 2) { - if (irn[1] > irn[2]) { - ii = irn[1]; - irn[1] = irn[2]; - irn[2] = ii; - } - } else if (nro == 3) { - ii = irn[1]; - if (ii > irn[3]) { - if (ii > irn[2]) { - if (irn[2] > irn[3]) { - irn[1] = irn[3]; - irn[3] = ii; - } else { - irn[1] = irn[2]; - irn[2] = irn[3]; - irn[3] = ii; - } - } else { - irn[1] = irn[3]; - irn[3] = irn[2]; - irn[2] = ii; - } - } else if (ii > irn[2]) { - irn[1] = irn[2]; - irn[2] = ii; - } else if (irn[2] > irn[3]) { - ii = irn[2]; - irn[2] = irn[3]; - irn[3] = ii; - } - } else { - for (j = 2; j <= nro; ++j) { - i = j - 1; - ii = irn[j]; - - while (ii < irn[i]) { - irn[i + 1] = irn[i]; - --i; - if (i == 0) - break; - } - irn[i + 1] = ii; - } - } - /* Adjust start for zero */ - for (i = 1; i <= nro; ++i) { - if (irn[i] != 0) - break; - } - - nrb = i; - nro2 = nro - i + 1; - } else { - nrb = 1; - nro2 = nro; - } - /* Some table values */ - ddf = f9xact(&nro, &n, &idif[1], fact); - drn = f9xact(&nro2, &ntot, &irn[nrb], fact) - dro + ddf; - /* Get hash value */ - if (k1 > 1) { - kval = irn[1] + irn[2] * kyy[2]; - for (i = 3; i <= nro; ++i) { - kval += irn[i] * kyy[i]; - } - /* Get hash table entry */ - i = kval % (*ldkey << 1) + 1; - /* Search for unused location */ - for (itp = i; itp <= *ldkey << 1; ++itp) { - ii = key2[itp]; - if (ii == kval) { - goto L240; - } else if (ii < 0) { - key2[itp] = kval; - dlp[itp] = 1.; - dsp[itp] = 1.; - goto L240; - } - } - - for (itp = 1; itp <= i - 1; ++itp) { - ii = key2[itp]; - if (ii == kval) { - goto L240; - } else if (ii < 0) { - key2[itp] = kval; - dlp[itp] = 1.; - goto L240; - } - } - - /* KH - prterr(6, "LDKEY is too small.\n" - "It is not possible to give the value of LDKEY required,\n" - "but you could try doubling LDKEY (and possibly LDSTP)."); - */ - prterr(6, "LDKEY is too small for this problem.\n" - "Try increasing the size of the workspace."); - } - -L240: - ipsh = (1); - /* Recover pastp */ - ipn = ipoin[ipo + ikkey]; - pastp = stp[ipn + ikstp]; - ifreq = ifrq[ipn + ikstp]; - /* Compute shortest and longest path */ - if (k1 > 1) { - obs2 = obs - fact[ico[kb + 1]] - fact[ico[kb + 2]] - ddf; - for (i = 3; i <= k1; ++i) { - obs2 -= fact[ico[kb + i]]; - } - if (dlp[itp] > 0.) { - dspt = obs - obs2 - ddf; - /* Compute longest path */ - dlp[itp] = 0.; - f3xact(&nro2, &irn[nrb], &k1, &ico[kb + 1], &dlp[itp], - &ntot, fact, &iwk[i31], &iwk[i32], &iwk[i33], - &iwk[i34], &iwk[i35], &iwk[i36], &iwk[i37], - &iwk[i38], &iwk[i39], &rwk[i310], &rwk[i311], &tol); - dlp[itp] = min(0., dlp[itp]); - /* Compute shortest path */ - dsp[itp] = dspt; - f4xact(&nro2, &irn[nrb], &k1, &ico[kb + 1], &dsp[itp], fact, - &iwk[i47], &iwk[i41], &iwk[i42], &iwk[i43], - &iwk[i44], &iwk[i45], &iwk[i46], &rwk[i48], &tol); - dsp[itp] = min(0., dsp[itp] - dspt); - /* Use chi-squared approximation? */ - if ((irn[nrb] * ico[kb + 1]) > ntot * emn) { - ncell = 0.; - for (i = 0; i < nro2; ++i) - for (j = 1; j <= k1; ++j) - if (irn[nrb + i] * ico[kb + j] >= ntot * *expect) - ncell++; - - if (ncell * 100 >= k1 * nro2 * *percnt) { - tmp = 0.; - for (i = 0; i < nro2; ++i) - tmp += (fact[irn[nrb + i]] - - fact[irn[nrb + i] - 1]); - tmp *= k1 - 1; - for (j = 1; j <= k1; ++j) - tmp += (nro2 - 1) * (fact[ico[kb + j]] - - fact[ico[kb + j] - 1]); - df = (double) ((nro2 - 1) * (k1 - 1)); - tmp += df * 1.83787706640934548356065947281; - tmp -= (nro2 * k1 - 1) * (fact[ntot] - fact[ntot - 1]); - tm[itp] = (obs - dro) * -2. - tmp; - } else { - /* tm(itp) set to a flag value */ - tm[itp] = -9876.; - } - } else { - tm[itp] = -9876.; - } - } - obs3 = obs2 - dlp[itp]; - obs2 -= dsp[itp]; - if (tm[itp] == -9876.) { - chisq = (0); - } else { - chisq = (1); - tmp = tm[itp]; - } - } else { - obs2 = obs - drn - dro; - obs3 = obs2; - } - -L300: - /* Process node with new PASTP */ - if (pastp <= obs3) { - /* Update pre */ - *pre += (double) ifreq * exp(pastp + drn); - } else if (pastp < obs2) { - if (chisq) { - df = (double) ((nro2 - 1) * (k1 - 1)); -#ifdef USING_R - pv = pgamma(fmax2(0., tmp + (pastp + drn) * 2.) / 2., - df / 2., /*scale = */ 1., - /*lower_tail = */FALSE, /*log_p = */ FALSE); -#else - d1 = max(0., tmp + (pastp + drn) * 2.) / 2.; - d2 = df / 2.; - pv = 1. - gammds(&d1, &d2, &ifault); -#endif - *pre += (double) ifreq * exp(pastp + drn) * pv; - } else { - /* Put daughter on queue */ - d1 = pastp + ddf; - f5xact(&d1, &tol, &kval, &key[jkey], ldkey, &ipoin[jkey], - &stp[jstp], ldstp, &ifrq[jstp], &ifrq[jstp2], - &ifrq[jstp3], &ifrq[jstp4], &ifreq, &itop, &ipsh); - ipsh = (0); - } - } - /* Get next PASTP on chain */ - ipn = ifrq[ipn + ikstp2]; - if (ipn > 0) { - pastp = stp[ipn + ikstp]; - ifreq = ifrq[ipn + ikstp]; - goto L300; - } - /* Generate a new daughter node */ - f7xact(&kmax, &iro[1], &idif[1], &kd, &ks, &iflag); - if (iflag != 1) { - goto L150; - } - -L310: - /* Go get a new mother from stage K */ - do { - iflag = 1; - f6xact(&nro, &iro[1], &iflag, &kyy[1], &key[ikkey + 1], ldkey, - &last, &ipo); - /* Update pointers */ - if (iflag != 3) - goto Outer_Loop; - /* else iflag == 3 : no additional nodes to process */ - --k; - itop = 0; - ikkey = jkey - 1; - ikstp = jstp - 1; - ikstp2 = jstp2 - 1; - jkey = *ldkey - jkey + 2; - jstp = *ldstp - jstp + 2; - jstp2 = (*ldstp << 1) + jstp; - for (i = 1; i <= *ldkey << 1; ++i) - key2[i] = -9999; - - } while (k >= 2); -} - -/* - ----------------------------------------------------------------------- - Name: F3XACT - Purpose: Computes the shortest path length for a given table. - Usage: F3XACT (NROW, IROW, NCOL, ICOL, DLP, MM, FACT, ICO, IRO, - IT, LB, NR, NT, NU, ITC, IST, STV, ALEN, TOL) - Arguments: - NROW - The number of rows in the table. (Input) - IROW - Vector of length NROW containing the row sums - for the table. (Input) - NCOL - The number of columns in the table. (Input) - ICOL - Vector of length K containing the column sums - for the table. (Input) - DLP - The longest path for the table. (Output) - MM - The total count in the table. (Output) - FACT - Vector containing the logarithms of factorials. (Input) - ICO - Work vector of length MAX(NROW,NCOL). - IRO - Work vector of length MAX(NROW,NCOL). - IT - Work vector of length MAX(NROW,NCOL). - LB - Work vector of length MAX(NROW,NCOL). - NR - Work vector of length MAX(NROW,NCOL). - NT - Work vector of length MAX(NROW,NCOL). - NU - Work vector of length MAX(NROW,NCOL). - ITC - Work vector of length 400. - IST - Work vector of length 400. - STV - Work vector of length 400. - ALEN - Work vector of length MAX(NROW,NCOL). - TOL - Tolerance. (Input) - ----------------------------------------------------------------------- - */ - -void -f3xact(int *nrow, int *irow, int *ncol, int *icol, double *dlp, - int *mm, double *fact, int *ico, int *iro, int *it, - int *lb, int *nr, int *nt, int *nu, int *itc, int *ist, - double *stv, double *alen, const double *tol) -{ - /* Initialized data */ - static int ldst = 200; - static int nst = 0; - static int nitc = 0; - - /* Local variables */ - static int xmin; - static int i, k; - static double v; - static int n11, n12, ii, nn, ks, ic1, ic2, nc1, nn1; - static int nr1, nco; - static double val; - static int nct, ipn, irl, key, lev, itp, nro; - static double vmn; - static int nrt, kyy, nc1s; - - /* Parameter adjustments */ - --stv; - --ist; - --itc; - --nu; - --nt; - --nr; - --lb; - --it; - --iro; - --ico; - --icol; - --irow; - - /* Function Body */ - for (i = 0; i <= *ncol; ++i) { - alen[i] = 0.; - } - for (i = 1; i <= 400; ++i) { - ist[i] = -1; - } - /* nrow is 1 */ - if (*nrow <= 1) { - if (*nrow > 0) { - *dlp -= fact[icol[1]]; - for (i = 2; i <= *ncol; ++i) { - *dlp -= fact[icol[i]]; - } - } - return; - } - /* ncol is 1 */ - if (*ncol <= 1) { - if (*ncol > 0) { - *dlp = *dlp - fact[irow[1]] - fact[irow[2]]; - for (i = 3; i <= *nrow; ++i) { - *dlp -= fact[irow[i]]; - } - } - return; - } - /* 2 by 2 table */ - if (*nrow * *ncol == 4) { - n11 = (irow[1] + 1) * (icol[1] + 1) / (*mm + 2); - n12 = irow[1] - n11; - *dlp = *dlp - fact[n11] - fact[n12] - fact[icol[1] - n11] - - fact[icol[2] - n12]; - return; - } - /* Test for optimal table */ - val = 0.; - xmin = (0); - if (irow[*nrow] <= irow[1] + *ncol) { - f10act(nrow, &irow[1], ncol, &icol[1], &val, &xmin, fact, - &lb[1], &nu[1], &nr[1]); - } - if (! xmin) { - if (icol[*ncol] <= icol[1] + *nrow) { - f10act(ncol, &icol[1], nrow, &irow[1], &val, &xmin, fact, - &lb[1], &nu[1], &nr[1]); - } - } - - if (xmin) { - *dlp -= val; - return; - } - /* Setup for dynamic programming */ - nn = *mm; - /* Minimize ncol */ - if (*nrow >= *ncol) { - nro = *nrow; - nco = *ncol; - for (i = 1; i <= *nrow; ++i) { - iro[i] = irow[i]; - } - ico[1] = icol[1]; - nt[1] = nn - ico[1]; - for (i = 2; i <= *ncol; ++i) { - ico[i] = icol[i]; - nt[i] = nt[i - 1] - ico[i]; - } - } else { - nro = *ncol; - nco = *nrow; - ico[1] = irow[1]; - nt[1] = nn - ico[1]; - for (i = 2; i <= *nrow; ++i) { - ico[i] = irow[i]; - nt[i] = nt[i - 1] - ico[i]; - } - for (i = 1; i <= *ncol; ++i) - iro[i] = icol[i]; - } - /* Initialize pointers */ - vmn = 1e10; - nc1s = nco - 1; - irl = 1; - ks = 0; - k = ldst; - kyy = ico[nco] + 1; - -LnewNode: /* Setup to generate new node */ - - lev = 1; - nr1 = nro - 1; - nrt = iro[irl]; - nct = ico[1]; - lb[1] = (int) ((double) ((nrt + 1) * (nct + 1)) / - (double) (nn + nr1 * nc1s + 1) - *tol) - 1; - nu[1] = (int) ((double) ((nrt + nc1s) * (nct + nr1)) / - (double) (nn + nr1 + nc1s)) - lb[1] + 1; - nr[1] = nrt - lb[1]; - -LoopNode: /* Generate a node */ - --nu[lev]; - if (nu[lev] == 0) { - if (lev == 1) - goto L200; - - --lev; - goto LoopNode; - } - ++lb[lev]; - --nr[lev]; -L120: - alen[lev] = alen[lev - 1] + fact[lb[lev]]; - if (lev < nc1s) { - nn1 = nt[lev]; - nrt = nr[lev]; - ++lev; - nc1 = nco - lev; - nct = ico[lev]; - lb[lev] = (int) ((double) ((nrt + 1) * (nct + 1)) / - (double) (nn1 + nr1 * nc1 + 1) - *tol); - nu[lev] = (int) ((double) ((nrt + nc1) * (nct + nr1)) / - (double) (nn1 + nr1 + nc1) - lb[lev] + 1); - nr[lev] = nrt - lb[lev]; - goto L120; - } - alen[nco] = alen[lev] + fact[nr[lev]]; - lb[nco] = nr[lev]; - - v = val + alen[nco]; - if (nro == 2) { - /* Only 1 row left */ - v = v + fact[ico[1] - lb[1]] + fact[ico[2] - lb[2]]; - for (i = 3; i <= nco; ++i) { - v += fact[ico[i] - lb[i]]; - } - if (v < vmn) { - vmn = v; - } - } else if (nro == 3 && nco == 2) { - /* 3 rows and 2 columns */ - nn1 = nn - iro[irl] + 2; - ic1 = ico[1] - lb[1]; - ic2 = ico[2] - lb[2]; - n11 = (iro[irl + 1] + 1) * (ic1 + 1) / nn1; - n12 = iro[irl + 1] - n11; - v = v + fact[n11] + fact[n12] + fact[ic1 - n11] - + fact[ic2 - n12]; - if (v < vmn) { - vmn = v; - } - } else { - /* Column marginals are new node */ - for (i = 1; i <= nco; ++i) { - it[i] = ico[i] - lb[i]; - } - /* Sort column marginals */ - if (nco == 2) { - if (it[1] > it[2]) { - ii = it[1]; - it[1] = it[2]; - it[2] = ii; - } - } else if (nco == 3) { - ii = it[1]; - if (ii > it[3]) { - if (ii > it[2]) { - if (it[2] > it[3]) { - it[1] = it[3]; - it[3] = ii; - } else { - it[1] = it[2]; - it[2] = it[3]; - it[3] = ii; - } - } else { - it[1] = it[3]; - it[3] = it[2]; - it[2] = ii; - } - } else if (ii > it[2]) { - it[1] = it[2]; - it[2] = ii; - } else if (it[2] > it[3]) { - ii = it[2]; - it[2] = it[3]; - it[3] = ii; - } - } else { - isort(&nco, &it[1]); - } - /* Compute hash value */ - key = it[1] * kyy + it[2]; - for (i = 3; i <= nco; ++i) { - key = it[i] + key * kyy; - } - if(key < 0) - //PROBLEM "Bug in FEXACT: gave negative key" RECOVER(NULL_ENTRY); - printf("Bug in FEXACT: gave negative key \n"); //xx:another one of my ugly kluges (R.D-U) - - /* Table index */ - ipn = key % ldst + 1; - - /* Find empty position */ - for (itp = ipn, ii = ks + ipn; itp <= ldst; ++itp, ++ii) { - if (ist[ii] < 0) { - goto L180; - } else if (ist[ii] == key) { - goto L190; - } - } - - for (itp = 1, ii = ks + 1; itp <= ipn - 1; ++itp, ++ii) { - if (ist[ii] < 0) { - goto L180; - } else if (ist[ii] == key) { - goto L190; - } - } - - prterr(30, "Stack length exceeded in f3xact.\n" - "This problem should not occur."); - -L180: /* Push onto stack */ - ist[ii] = key; - stv[ii] = v; - ++nst; - ii = nst + ks; - itc[ii] = itp; - goto LoopNode; - -L190: /* Marginals already on stack */ - stv[ii] = min(v, stv[ii]); - } - goto LoopNode; - - -L200: /* Pop item from stack */ - if (nitc > 0) { - /* Stack index */ - itp = itc[nitc + k] + k; - --nitc; - val = stv[itp]; - key = ist[itp]; - ist[itp] = -1; - /* Compute marginals */ - for (i = nco; i >= 2; --i) { - ico[i] = key % kyy; - key /= kyy; - } - ico[1] = key; - /* Set up nt array */ - nt[1] = nn - ico[1]; - for (i = 2; i <= nco; ++i) - nt[i] = nt[i - 1] - ico[i]; - - /* Test for optimality (L90) */ - xmin = (0); - if (iro[nro] <= iro[irl] + nco) { - f10act(&nro, &iro[irl], &nco, &ico[1], &val, &xmin, fact, - &lb[1], &nu[1], &nr[1]); - } - if (!xmin && ico[nco] <= ico[1] + nro) - f10act(&nco, &ico[1], &nro, &iro[irl], &val, &xmin, fact, - &lb[1], &nu[1], &nr[1]); - if (xmin) { - if (vmn > val) - vmn = val; - goto L200; - } - else goto LnewNode; - - } else if (nro > 2 && nst > 0) { - /* Go to next level */ - nitc = nst; - nst = 0; - k = ks; - ks = ldst - ks; - nn -= iro[irl]; - ++irl; - --nro; - goto L200; - } - - *dlp -= vmn; -} - -/* - ----------------------------------------------------------------------- - Name: F4XACT - Purpose: Computes the longest path length for a given table. - Usage: CALL F4XACT (NROW, IROW, NCOL, ICOL, DSP, FACT, ICSTK, - NCSTK, LSTK, MSTK, NSTK, NRSTK, IRSTK, YSTK, - TOL) - Arguments: - NROW - The number of rows in the table. (Input) - IROW - Vector of length NROW containing the row sums for the - table. (Input) - NCOL - The number of columns in the table. (Input) - ICOL - Vector of length K containing the column sums for the - table. (Input) - DSP - The shortest path for the table. (Output) - FACT - Vector containing the logarithms of factorials. (Input) - ICSTK - NCOL by NROW+NCOL+1 work array. - NCSTK - Work vector of length NROW+NCOL+1. - LSTK - Work vector of length NROW+NCOL+1. - MSTK - Work vector of length NROW+NCOL+1. - NSTK - Work vector of length NROW+NCOL+1. - NRSTK - Work vector of length NROW+NCOL+1. - IRSTK - NROW by MAX(NROW,NCOL) work array. - YSTK - Work vector of length NROW+NCOL+1. - TOL - Tolerance. (Input) - ----------------------------------------------------------------------- - */ - -void -f4xact(int *nrow, int *irow, int *ncol, int *icol, double *dsp, - double *fact, int *icstk, int *ncstk, int *lstk, int *mstk, - int *nstk, int *nrstk, int *irstk, double *ystk, const double *tol) -{ - /* System generated locals */ - int ikh; - - /* Local variables */ - int i, j, k, l, m, n, mn, ic1, ir1, ict, irt, istk, nco, nro; - double y, amx; - - /* Parameter adjustments */ - irstk -= *nrow + 1; - --irow; - icstk -= *ncol + 1; - --icol; - --ncstk; - --lstk; - --mstk; - --nstk; - --nrstk; - --ystk; - - /* Function Body */ - /* Take care of the easy cases first */ - if (*nrow == 1) { - for (i = 1; i <= *ncol; ++i) { - *dsp -= fact[icol[i]]; - } - return; - } - if (*ncol == 1) { - for (i = 1; i <= *nrow; ++i) { - *dsp -= fact[irow[i]]; - } - return; - } - if (*nrow * *ncol == 4) { - if (irow[2] <= icol[2]) { - *dsp = *dsp - fact[irow[2]] - fact[icol[1]] - - fact[icol[2] - irow[2]]; - } else { - *dsp = *dsp - fact[icol[2]] - fact[irow[1]] - - fact[irow[2] - icol[2]]; - } - return; - } - /* initialization before loop */ - for (i = 1; i <= *nrow; ++i) { - irstk[i + *nrow] = irow[*nrow - i + 1]; - } - for (j = 1; j <= *ncol; ++j) { - icstk[j + *ncol] = icol[*ncol - j + 1]; - } - - nro = *nrow; - nco = *ncol; - nrstk[1] = nro; - ncstk[1] = nco; - ystk[1] = 0.; - y = 0.; - istk = 1; - l = 1; - amx = 0.; - - /* First LOOP */ - do { - ir1 = irstk[istk * *nrow + 1]; - ic1 = icstk[istk * *ncol + 1]; - if (ir1 > ic1) { - if (nro >= nco) { - m = nco - 1; n = 2; - } else { - m = nro; n = 1; - } - } else if (ir1 < ic1) { - if (nro <= nco) { - m = nro - 1; n = 1; - } else { - m = nco; n = 2; - } - } else { - if (nro <= nco) { - m = nro - 1; n = 1; - } else { - m = nco - 1; n = 2; - } - } - - L60: - if (n == 1) { - i = l; j = 1; - } else { - i = 1; j = l; - } - - irt = irstk[i + istk * *nrow]; - ict = icstk[j + istk * *ncol]; - mn = irt; - if (mn > ict) { - mn = ict; - } - y += fact[mn]; - if (irt == ict) { - --nro; - --nco; - f11act(&irstk[istk * *nrow + 1], &i, &nro, - &irstk[(istk + 1) * *nrow + 1]); - f11act(&icstk[istk * *ncol + 1], &j, &nco, - &icstk[(istk + 1) * *ncol + 1]); - } else if (irt > ict) { - --nco; - f11act(&icstk[istk * *ncol + 1], &j, &nco, - &icstk[(istk + 1) * *ncol + 1]); - ikh = irt - ict; - f8xact(&irstk[istk * *nrow + 1], &ikh, &i, - &nro, &irstk[(istk + 1) * *nrow + 1]); - } else { - --nro; - f11act(&irstk[istk * *nrow + 1], &i, &nro, - &irstk[(istk + 1) * *nrow + 1]); - ikh = ict - irt; - f8xact(&icstk[istk * *ncol + 1], &ikh, &j, - &nco, &icstk[(istk + 1) * *ncol + 1]); - } - - if (nro == 1) { - for (k = 1; k <= nco; ++k) { - y += fact[icstk[k + (istk + 1) * *ncol]]; - } - break; - } - if (nco == 1) { - for (k = 1; k <= nro; ++k) { - y += fact[irstk[k + (istk + 1) * *nrow]]; - } - break; - } - - lstk[istk] = l; - mstk[istk] = m; - nstk[istk] = n; - ++istk; - nrstk[istk] = nro; - ncstk[istk] = nco; - ystk[istk] = y; - l = 1; - } while(1);/* end do */ - -/* L90:*/ - if (y > amx) { - amx = y; - if (*dsp - amx <= *tol) { - *dsp = 0.; - return; - } - } - -L100: - --istk; - if (istk == 0) { - *dsp -= amx; - if (*dsp - amx <= *tol) { - *dsp = 0.; - } - return; - } - l = lstk[istk] + 1; - -/* L110: */ - for(;; ++l) { - if (l > mstk[istk]) goto L100; - - n = nstk[istk]; - nro = nrstk[istk]; - nco = ncstk[istk]; - y = ystk[istk]; - if (n == 1) { - if (irstk[l + istk * *nrow] < - irstk[l - 1 + istk * *nrow]) goto L60; - } - else if (n == 2) { - if (icstk[l + istk * *ncol] < - icstk[l - 1 + istk * *ncol]) goto L60; - } - } -} - -/* - ----------------------------------------------------------------------- - Name: F5XACT - Purpose: Put node on stack in network algorithm. - Usage: CALL F5XACT (PASTP, TOL, KVAL, KEY, LDKEY, IPOIN, STP, - LDSTP, IFRQ, NPOIN, NR, NL, IFREQ, ITOP, - IPSH) - Arguments: - PASTP - The past path length. (Input) - TOL - Tolerance for equivalence of past path lengths. (Input) - KVAL - Key value. (Input) - KEY - Vector of length LDKEY containing the key values. (in/out) - LDKEY - Length of vector KEY. (Input) - IPOIN - Vector of length LDKEY pointing to the - linked list of past path lengths. (in/out) - STP - Vector of length LSDTP containing the - linked lists of past path lengths. (in/out) - LDSTP - Length of vector STP. (Input) - IFRQ - Vector of length LDSTP containing the past path - frequencies. (in/out) - NPOIN - Vector of length LDSTP containing the pointers to - the next past path length. (in/out) - NR - Vector of length LDSTP containing the right object - pointers in the tree of past path lengths. (in/out) - NL - Vector of length LDSTP containing the left object - pointers in the tree of past path lengths. (in/out) - IFREQ - Frequency of the current path length. (Input) - ITOP - Pointer to the top of STP. (Input) - IPSH - Option parameter. (Input) - If IPSH is true, the past path length is found in the - table KEY. Otherwise the location of the past path - length is assumed known and to have been found in - a previous call. ==>>>>> USING "static" variables - ----------------------------------------------------------------------- - */ - -void -f5xact(double *pastp, const double *tol, int *kval, int *key, int *ldkey, - int *ipoin, double *stp, int *ldstp, int *ifrq, int *npoin, - int *nr, int *nl, int *ifreq, int *itop, int *ipsh) -{ - /* Local variables */ - static int itmp, ird, ipn, itp; - double test1, test2; - - /* Parameter adjustments */ - --nl; - --nr; - --npoin; - --ifrq; - --stp; - --ipoin; - --key; - - /* Function Body */ - if (*ipsh) { - /* Convert KVAL to int in range 1, ..., LDKEY. */ - ird = *kval % *ldkey + 1; - /* Search for an unused location */ - for (itp = ird; itp <= *ldkey; ++itp) { - if (key[itp] == *kval) { - goto L40; - } - if (key[itp] < 0) { - goto L30; - } - } - for (itp = 1; itp <= ird - 1; ++itp) { - if (key[itp] == *kval) { - goto L40; - } - if (key[itp] < 0) { - goto L30; - } - } - /* Return if KEY array is full */ - /* KH - prterr(6, "LDKEY is too small for this problem.\n" - "It is not possible to estimate the value of LDKEY " - "required,\n" - "but twice the current value may be sufficient."); - */ - prterr(6, "LDKEY is too small for this problem.\n" - "Try increasing the size of the workspace."); - - /* Update KEY */ -L30: - key[itp] = *kval; - ++(*itop); - ipoin[itp] = *itop; - /* Return if STP array full */ - if (*itop > *ldstp) { - /* KH - prterr(7, "LDSTP is too small for this problem.\n" - "It is not possible to estimate the value of LDSTP " - "required,\n" - "but twice the current value may be sufficient."); - */ - prterr(7, "LDSTP is too small for this problem.\n" - "Try increasing the size of the workspace."); - } - /* Update STP, etc. */ - npoin[*itop] = -1; - nr[*itop] = -1; - nl[*itop] = -1; - stp[*itop] = *pastp; - ifrq[*itop] = *ifreq; - return; - } - - /* Find location, if any, of pastp */ -L40: - ipn = ipoin[itp]; - test1 = *pastp - *tol; - test2 = *pastp + *tol; - -L50: - if (stp[ipn] < test1) { - ipn = nl[ipn]; - if (ipn > 0) { - goto L50; - } - } else if (stp[ipn] > test2) { - ipn = nr[ipn]; - if (ipn > 0) { - goto L50; - } - } else { - ifrq[ipn] += *ifreq; - return; - } - /* Return if STP array full */ - ++(*itop); - if (*itop > *ldstp) { - /* - prterr(7, "LDSTP is too small for this problem.\n" - "It is not possible to estimate the value of LDSTP " - "required,\n" - "but twice the current value may be sufficient."); - */ - prterr(7, "LDSTP is too small for this problem.\n" - "Try increasing the size of the workspace."); - return; - } - /* Find location to add value */ - ipn = ipoin[itp]; - itmp = ipn; - -L60: - if (stp[ipn] < test1) { - itmp = ipn; - ipn = nl[ipn]; - if (ipn > 0) { - goto L60; - } else { - nl[itmp] = *itop; - } - } else if (stp[ipn] > test2) { - itmp = ipn; - ipn = nr[ipn]; - if (ipn > 0) { - goto L60; - } else { - nr[itmp] = *itop; - } - } - /* Update STP, etc. */ - npoin[*itop] = npoin[itmp]; - npoin[itmp] = *itop; - stp[*itop] = *pastp; - ifrq[*itop] = *ifreq; - nl[*itop] = -1; - nr[*itop] = -1; -} - -/* - ----------------------------------------------------------------------- - Name: F6XACT - Purpose: Pop a node off the stack. - Usage: CALL F6XACT (NROW, IROW, IFLAG, KYY, KEY, LDKEY, LAST, IPN) - Arguments: - NROW - The number of rows in the table. (Input) - IROW - Vector of length nrow containing the row sums on - output. (Output) - IFLAG - Set to 3 if there are no additional nodes to process. - (Output) - KYY - Constant mutlipliers used in forming the hash - table key. (Input) - KEY - Vector of length LDKEY containing the hash table - keys. (In/out) - LDKEY - Length of vector KEY. (Input) - LAST - Index of the last key popped off the stack. (In/out) - IPN - Pointer to the linked list of past path lengths. (Output) - ----------------------------------------------------------------------- - */ -void -f6xact(int *nrow, int *irow, int *iflag, int *kyy, int *key, int - *ldkey, int *last, int *ipn) -{ - int kval, j; - - /* Parameter adjustments */ - --key; - --kyy; - --irow; - - /* Function Body */ -L10: - ++(*last); - if (*last <= *ldkey) { - if (key[*last] < 0) { - goto L10; - } - /* Get KVAL from the stack */ - kval = key[*last]; - key[*last] = -9999; - for (j = *nrow; j >= 2; --j) { - irow[j] = kval / kyy[j]; - kval -= irow[j] * kyy[j]; - } - irow[1] = kval; - *ipn = *last; - } else { - *last = 0; - *iflag = 3; - } - return; -} - -/* - ----------------------------------------------------------------------- - Name: F7XACT - Purpose: Generate the new nodes for given marinal totals. - Usage: CALL F7XACT (NROW, IMAX, IDIF, K, KS, IFLAG) - Arguments: - NROW - The number of rows in the table. (Input) - IMAX - The row marginal totals. (Input) - IDIF - The column counts for the new column. (in/out) - K - Indicator for the row to decrement. (in/out) - KS - Indicator for the row to increment. (in/out) - IFLAG - Status indicator. (Output) - If IFLAG is zero, a new table was generated. For - IFLAG = 1, no additional tables could be generated. - ----------------------------------------------------------------------- - */ - -void -f7xact(int *nrow, int *imax, int *idif, int *k, int *ks, - int *iflag) - -{ - int i, m, k1, mm; - - /* Parameter adjustments */ - --idif; - --imax; - - /* Function Body */ - *iflag = 0; - /* Find node which can be incremented, ks */ - if (*ks == 0) - do { - ++(*ks); - } while (idif[*ks] == imax[*ks]); - - /* Find node to decrement (>ks) */ - if (idif[*k] > 0 && *k > *ks) { - --idif[*k]; - do { - --(*k); - } while(imax[*k] == 0); - - m = *k; - - /* Find node to increment (>=ks) */ - while (idif[m] >= imax[m]) { - --m; - } - ++idif[m]; - /* Change ks */ - if (m == *ks) { - if (idif[m] == imax[m]) { - *ks = *k; - } - } - } - else { - Loop: - /* Check for finish */ - for (k1 = *k + 1; k1 <= *nrow; ++k1) { - if (idif[k1] > 0) { - goto L70; - } - } - *iflag = 1; - return; - - L70: - /* Reallocate counts */ - mm = 1; - for (i = 1; i <= *k; ++i) { - mm += idif[i]; - idif[i] = 0; - } - *k = k1; - - do { - --(*k); - m = min(mm, imax[*k]); - idif[*k] = m; - mm -= m; - } while (mm > 0 && *k != 1); - - /* Check that all counts reallocated */ - if (mm > 0) { - if (k1 != *nrow) { - *k = k1; - goto Loop; - } - *iflag = 1; - return; - } - /* Get ks */ - --idif[k1]; - *ks = 0; - do { - ++(*ks); - if (*ks > *k) { - return; - } - } while (idif[*ks] >= imax[*ks]); - } -} - -/* - ----------------------------------------------------------------------- - Name: F8XACT - Purpose: Routine for reducing a vector when there is a zero - element. - Usage: CALL F8XACT (IROW, IS, I1, IZERO, NEW) - Arguments: - IROW - Vector containing the row counts. (Input) - IS - Indicator. (Input) - I1 - Indicator. (Input) - IZERO - Position of the zero. (Input) - NEW - Vector of new row counts. (Output) - ----------------------------------------------------------------------- - */ - -void -f8xact(int *irow, int *is, int *i1, int *izero, int *myNew) -{ - int i; - - /* Parameter adjustments */ - --myNew; - --irow; - - /* Function Body */ - for (i = 1; i < *i1; ++i) - myNew[i] = irow[i]; - - for (i = *i1; i <= *izero - 1; ++i) { - if (*is >= irow[i + 1]) - break; - myNew[i] = irow[i + 1]; - } - - myNew[i] = *is; - - for(;;) { - ++i; - if (i > *izero) - return; - myNew[i] = irow[i]; - } -} - -/* - ----------------------------------------------------------------------- - Name: F9XACT - Purpose: Computes the log of a multinomial coefficient. - Usage: F9XACT(N, MM, IR, FACT) - Arguments: - N - Length of IR. (Input) - MM - Number for factorial in numerator. (Input) - IR - Vector of length N containing the numbers for - the denominator of the factorial. (Input) - FACT - Table of log factorials. (Input) - F9XACT - The log of the multinomal coefficient. (Output) - ----------------------------------------------------------------------- - */ - -double -f9xact(int *n, int *mm, int *ir, double *fact) -{ - double d; - int k; - - d = fact[*mm]; - for (k = 0; k < *n; k++) - d -= fact[ir[k]]; - return d; -} - -/* - ----------------------------------------------------------------------- - Name: F10ACT - Purpose: Computes the shortest path length for special tables. - Usage: F10ACT (NROW, IROW, NCOL, ICOL, VAL, XMIN, FACT, ND, NE, M) - Arguments: - NROW - The number of rows in the table. (Input) - IROW - Vector of length NROW containing the row totals. (Input) - NCOL - The number of columns in the table. (Input) - ICO - Vector of length NCOL containing the column totals.(Input) - VAL - The shortest path. (Output) - XMIN - Set to true if shortest path obtained. (Output) - FACT - Vector containing the logarithms of factorials. (Input) - ND - Workspace vector of length NROW. (Input) - NE - Workspace vector of length NCOL. (Input) - M - Workspace vector of length NCOL. (Input) - - Chapter: STAT/LIBRARY Categorical and Discrete Data Analysis - ----------------------------------------------------------------------- - */ - -void -f10act(int *nrow, int *irow, int *ncol, int *icol, double *val, - int *xmin, double *fact, int *nd, int *ne, int *m) -{ - /* Local variables */ - int i, is, ix, nrw1; - - /* Parameter adjustments */ - --m; - --ne; - --nd; - --icol; - --irow; - - /* Function Body */ - for (i = 1; i <= *nrow - 1; ++i) - nd[i] = 0; - - is = icol[1] / *nrow; - ix = icol[1] - *nrow * is; - ne[1] = is; - m[1] = ix; - if (ix != 0) - ++nd[ix]; - - for (i = 2; i <= *ncol; ++i) { - ix = icol[i] / *nrow; - ne[i] = ix; - is += ix; - ix = icol[i] - *nrow * ix; - m[i] = ix; - if (ix != 0) - ++nd[ix]; - } - - for (i = *nrow - 2; i >= 1; --i) - nd[i] += nd[i + 1]; - - ix = 0; - nrw1 = *nrow + 1; - for (i = *nrow; i >= 2; --i) { - ix = ix + is + nd[nrw1 - i] - irow[i]; - if (ix < 0) - return; - } - - for (i = 1; i <= *ncol; ++i) { - ix = ne[i]; - is = m[i]; - *val = *val + is * fact[ix + 1] + (*nrow - is) * fact[ix]; - } - *xmin = (1); - - return; -} - -/* - ----------------------------------------------------------------------- - Name: F11ACT - Purpose: Routine for revising row totals. - Usage: CALL F11ACT (IROW, I1, I2, NEW) - Arguments: - IROW - Vector containing the row totals. (Input) - I1 - Indicator. (Input) - I2 - Indicator. (Input) - NEW - Vector containing the row totals. (Output) - ----------------------------------------------------------------------- - */ -void -f11act(int *irow, int *i1, int *i2, int *myNew) -{ - int i; - - /* Parameter adjustments */ - --myNew; - --irow; - - for (i = 1; i <= (*i1 - 1); ++i) myNew[i] = irow[i]; - for (i = *i1; i <= *i2; ++i) myNew[i] = irow[i + 1]; - - return; -} - -/* - ----------------------------------------------------------------------- - Name: prterr - Purpose: Print an error message and stop. - Usage: prterr(icode, mes) - Arguments: - icode - Integer code for the error message. (Input) - mes - Character string containing the error message. (Input) - ----------------------------------------------------------------------- - */ -void -prterr(int icode, char *mes) -{ -// PROBLEM "FEXACT error %d.\n%s", icode, mes RECOVER(NULL_ENTRY); -// printf("FEXACT error %d.\n%s", icode, mes RECOVER(NULL_ENTRY)); - printf("FEXACT error %d.\n", icode); //xx:another one of my ugly kluges - return; -} - -/* - ----------------------------------------------------------------------- - Name: iwork - Purpose: Routine for allocating workspace. - Usage: iwork (iwkmax, iwkpt, number, itype) - Arguments: - iwkmax - Maximum (int) amount of workspace. (Input) - iwkpt - Amount of (int) workspace currently allocated. (in/out) - number - Number of elements of workspace desired. (Input) - itype - Workspace type. (Input) - ITYPE TYPE - 2 integer - 3 float - 4 double - iwork(): Index in rwrk, dwrk, or iwrk of the beginning of - the first free element in the workspace array. (Output) - ----------------------------------------------------------------------- - */ -int -iwork(int iwkmax, int *iwkpt, int number, int itype) -{ - int i; - - i = *iwkpt; - if (itype == 2 || itype == 3) - *iwkpt += number; - else { /* double */ - if (i % 2 != 0) - ++i; - *iwkpt += (number << 1); - i /= 2; - } - if (*iwkpt > iwkmax) - prterr(40, "Out of workspace."); - - return i; -} - -#ifndef USING_R - -void isort(int *n, int *ix) -{ -/* - ----------------------------------------------------------------------- - Name: ISORT - Purpose: Shell sort for an int vector. - Usage: CALL ISORT (N, IX) - Arguments: - N - Lenth of vector IX. (Input) - IX - Vector to be sorted. (in/out) - ----------------------------------------------------------------------- - */ - static int ikey, i, j, m, il[10], kl, it, iu[10], ku; - - /* Parameter adjustments */ - --ix; - - /* Function Body */ - m = 1; - i = 1; - j = *n; - -L10: - if (i >= j) { - goto L40; - } - kl = i; - ku = j; - ikey = i; - ++j; - /* Find element in first half */ -L20: - ++i; - if (i < j) { - if (ix[ikey] > ix[i]) { - goto L20; - } - } - /* Find element in second half */ -L30: - --j; - if (ix[j] > ix[ikey]) { - goto L30; - } - /* Interchange */ - if (i < j) { - it = ix[i]; - ix[i] = ix[j]; - ix[j] = it; - goto L20; - } - it = ix[ikey]; - ix[ikey] = ix[j]; - ix[j] = it; - /* Save upper and lower subscripts of the array yet to be sorted */ - if (m < 11) { - if (j - kl < ku - j) { - il[m - 1] = j + 1; - iu[m - 1] = ku; - i = kl; - --j; - } else { - il[m - 1] = kl; - iu[m - 1] = j - 1; - i = j + 1; - j = ku; - } - ++m; - goto L10; - } else { - prterr(20, "This should never occur."); - } - /* Use another segment */ -L40: - --m; - if (m == 0) { - return; - } - i = il[m - 1]; - j = iu[m - 1]; - goto L10; -} - -double gammds(double *y, double *p, int *ifault) -{ -/* - ----------------------------------------------------------------------- - Name: GAMMDS - Purpose: Cumulative distribution for the gamma distribution. - Usage: PGAMMA (Q, ALPHA,IFAULT) - Arguments: - Q - Value at which the distribution is desired. (Input) - ALPHA - Parameter in the gamma distribution. (Input) - IFAULT - Error indicator. (Output) - IFAULT DEFINITION - 0 No error - 1 An argument is misspecified. - 2 A numerical error has occurred. - PGAMMA - The cdf for the gamma distribution with parameter alpha - evaluated at Q. (Output) - ----------------------------------------------------------------------- - - Algorithm AS 147 APPL. Statist. (1980) VOL. 29, P. 113 - - Computes the incomplete gamma integral for positive parameters Y, P - using and infinite series. - */ - - static double a, c, f, g; - static int ifail; - - /* Checks for the admissibility of arguments and value of F */ - *ifault = 1; - g = 0.; - if (*y <= 0. || *p <= 0.) { - return g; - } - *ifault = 2; - - /* - ALOGAM is natural log of gamma function no need to test ifail as - an error is impossible - */ - - a = *p + 1.; - f = exp(*p * log(*y) - alogam(&a, &ifail) - *y); - if (f == 0.) { - return g; - } - *ifault = 0; - - /* Series begins */ - c = 1.; - g = 1.; - a = *p; -L10: - a += 1.; - c = c * *y / a; - g += c; - if (c / g > 1e-6) { - goto L10; - } - g *= f; - return g; -} - -/* - ----------------------------------------------------------------------- - Name: ALOGAM - Purpose: Value of the log-gamma function. - Usage: ALOGAM (X, IFAULT) - Arguments: - X - Value at which the log-gamma function is to be evaluated. - (Input) - IFAULT - Error indicator. (Output) - IFAULT DEFINITION - 0 No error - 1 X < 0 - ALGAMA - The value of the log-gamma function at XX. (Output) - ----------------------------------------------------------------------- - - Algorithm ACM 291, Comm. ACM. (1966) Vol. 9, P. 684 - - Evaluates natural logarithm of gamma(x) for X greater than zero. - */ - -double alogam(double *x, int *ifault) -{ - /* Initialized data */ - //printf("alogam x = %f\t%d\n",*x,*ifault); - static double a1 = .918938533204673; - static double a2 = 5.95238095238e-4; - static double a3 = 7.93650793651e-4; - static double a4 = .002777777777778; - static double a5 = .083333333333333; - - /* Local variables */ - static double f, y, z; - - *ifault = 1; - if (*x < 0.) { - return(0.); - } - *ifault = 0; - y = *x; - f = 0.; - if (y >= 7.) { - goto L30; - } - f = y; -L10: - y += 1.; - if (y >= 7.) { - goto L20; - } - f *= y; - goto L10; -L20: - f = -log(f); -L30: - z = 1. / (y * y); - - //printf("returning %f\n",(f + (y - .5) * log(y) - y + a1 + (((-a2 * z + a3) * z - a4) * z + a5) / y)); - return(f + (y - .5) * log(y) - y + a1 + - (((-a2 * z + a3) * z - a4) * z + a5) / y); -} - - -#endif /* not USING_R */ - diff --git a/fisher2.h b/fisher2.h deleted file mode 100644 index 905a1a0..0000000 --- a/fisher2.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef GUARD_fisher2 -#define GUARD_fisher2 - -#include -#include -#include -#include - - -#ifdef __cplusplus -extern "C" { -#endif - -#define SINT_MAX INT_MAX - -#define max(a, b) ((a) < (b) ? (b) : (a)) -#define min(a, b) ((a) > (b) ? (b) : (a)) - - -void fexact(int *, int *, double *, int *, - double *, double *, double *, double *, - double *, int *); - -#ifdef __cplusplus -} -#endif - -#endif - diff --git a/makefile b/makefile index 100df10..d8e6dcd 100644 --- a/makefile +++ b/makefile @@ -11,13 +11,14 @@ USEMPI ?= no 64BIT_VERSION ?= yes -USEREADLINE ?= no +USEREADLINE ?= yes CYGWIN_BUILD ?= no USECOMPRESSION ?= no MOTHUR_FILES="\"Enter_your_default_path_here\"" -RELEASE_DATE = "\"3/13/2012\"" -VERSION = "\"1.24.0\"" +RELEASE_DATE = "\"3/16/2012\"" +VERSION = "\"1.24.1\"" FORTAN_COMPILER = gfortran +FORTRAN_FLAGS = # Optimize to level 3: CXXFLAGS += -O3 @@ -39,6 +40,7 @@ ifeq ($(strip $(64BIT_VERSION)),yes) #CXXFLAGS += -mtune=native -march=native -m64 CXXFLAGS += -DBIT_VERSION + FORTRAN_FLAGS = -m64 endif @@ -102,7 +104,7 @@ uchime: cd uchime_src && ./mk && mv uchime .. && cd .. fortranSource: - ${FORTAN_COMPILER} -c -m64 *.f + ${FORTAN_COMPILER} -c $(FORTRAN_FLAGS) *.f install : mothur # cp mothur ../Release/mothur diff --git a/metastats.h b/metastats.h deleted file mode 100644 index 5bab288..0000000 --- a/metastats.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef METASTATS2 -#define METASTATS2 - -/* - * metastats.h - * Mothur - * - * Created by westcott on 9/16/10. - * Copyright 2010 Schloss Lab. All rights reserved. - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include -#include -#include "fisher2.h" - -void testp(double *permuted_ttests,int *B,double *permuted,double - *Imatrix,int *nc,int *nr,int *g,double *Tinitial,double *ps); -void permute_matrix(double *Imatrix,int *nc,int *nr,double - *permuted,int *g,double *trial_ts,double *Tinitial,double - *counter); -void permute_array(int *array, int n); -void calc_twosample_ts(double *Pmatrix,int *g,int *nc,int *nr,double - *Ts,double *Tinitial,double *counter1); -void meanvar(double *pmatrix,int *g,int *nr,int *nc,double *storage); -void start(double *Imatrix,int *g,int *nr,int *nc,double *testing, - double storage[][9]); - -int metastat_main (char*, int, int, double, int, double**, int); - -#ifdef __cplusplus -} -#endif - -#endif - - - diff --git a/metastats2.c b/metastats2.c deleted file mode 100644 index 70edc3d..0000000 --- a/metastats2.c +++ /dev/null @@ -1,559 +0,0 @@ - -#include "metastats.h" - -//The following code has been modified using the original Metastats program from White, J.R., Nagarajan, N. & Pop, M. Statistical methods for detecting differentially abundant features in clinical metagenomic samples. PLoS Comput Biol 5, e1000352 (2009). - -int metastat_main (char* outputFileName, int numRows, int numCols, double threshold, int numPermutations, double** data, int secondGroupingStart){ - - int size,c=0,i=0,j=0,counter=0, bflag=0; - int B=numPermutations; - int row = numRows; - int col = numCols; - int g = secondGroupingStart; - double thresh=threshold; - double min=0; - - char output[1024]; - strcpy(output, outputFileName); - FILE *out; - - if (g>=col || g<=0){ - printf("Check your g value\n"); - } - - // Initialize the matrices - size = row*col; - double matrix[row][col]; - double pmatrix[size],permuted[size]; - double storage[row][9]; - - for (i=0;i2){ - for(i=1;i total[i]){ - min = total[i];} - } - } - if (min<=0){ - printf("Error, the sum of one of the columns <= 0."); - return 0; - } - - - // Ratio time... - for(i=0;i.999999999){ - *pre=1; - } - - //printf("feaxt = %f\t%f\t%f\t%f\t%f\t%f\n", *expect, *pre, f11, f12, f21, f22); - storage[i][8] = *pre; - pvalues[i]=*pre; - } - } - else{ - - testp(permuted_ttests, permutes, permuted,pmatrix, nc, nr, gvalue,tinitial,pvalues); - - // Checks to make sure the matrix isn't sparse. - double sparse[row], sparse2[row]; - for(i=0;i.999999999){ - *pre=1; - } - storage[i][8] = *pre; - pvalues[i]=*pre; - } - } - // End of else statement - bflag = 1; - } - - // Calculates the mean of counts (not normalized) - double temp[row][2]; - - for(j=0;j(fabs(Tinitial[i])+.0000000000001)){ //13th place - counter[i]++; - } - } -} - -void meanvar(double *pmatrix,int *g,int *nr,int *nc,double *store){ - double temp[*nr], temp2[*nr],var[*nr],var2[*nr],a,b; - - int i,m,k,l,n; - - a = (double) *g-1; - b = (double) (*nc-a); - - for (i = 0; i<*nr; i++){ - temp[i]=0; - temp2[i]=0; - var[i]=0; - var2[i]=0; - } - - k = *nr; // number of rows - l = *nc; - n = k*l; - - m=0; - m=*g-1; - k=*nr; - m*=k; // m = g * nr now - for (i=0;i& th m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); m->mothurOutEndLine(); outputNames.pop_back(); }else { + + ofstream outTemp; + string tempOut = outputDir + "data." + setA + "-" + setB + ".matrix"; + m->openOutputFile(tempOut, outTemp); + for (int i = 0; i < subset.size(); i++) { outTemp << '\t' << subset[i]->getGroup(); } + outTemp << endl; + + //fill data for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) { //data[j] = new double[subset.size()]; data2[j].resize(subset.size(), 0.0); + outTemp << "OTU" << (j+1); for (int i = 0; i < subset.size(); i++) { - //data[j][i] = (subset[i]->getAbundance(j)); data2[j][i] = (subset[i]->getAbundance(j)); + outTemp << '\t' << subset[i]->getAbundance(j); } + outTemp << endl; } - + outTemp.close(); m->mothurOut("Comparing " + setA + " and " + setB + "..."); m->mothurOutEndLine(); //metastat_main(output, thisLookUp[0]->getNumBins(), subset.size(), threshold, iters, data, setACount); @@ -500,7 +509,6 @@ int MetaStatsCommand::driver(int start, int num, vector& th MothurMetastats mothurMeta(threshold, iters); mothurMeta.runMetastats(outputFileName , data2, setACount); m->mothurOutEndLine(); - m->mothurOutEndLine(); } diff --git a/mothurmetastats.cpp b/mothurmetastats.cpp index ae3ab80..5678973 100644 --- a/mothurmetastats.cpp +++ b/mothurmetastats.cpp @@ -26,202 +26,182 @@ MothurMetastats::MothurMetastats(double t, int n) { /***********************************************************/ MothurMetastats::~MothurMetastats() {} /***********************************************************/ -//main metastats function -int MothurMetastats::runMetastats(string outputFileName, vector< vector >& data, int secondGroupingStart) { - try { - int bflag = 0; - row = data.size(); //numBins + //main metastats function +int MothurMetastats::runMetastats(string outputFileName, vector< vector >& data, int secGroupingStart) { + try { + row = data.size(); //numBins column = data[0].size(); //numGroups in subset - int size = row*column; - - //consistent with original, but this should never be true - if ((secondGroupingStart >= column) || (secondGroupingStart <= 0)) { m->mothurOut("[ERROR]: Check your g value."); m->mothurOutEndLine(); return 0; } - - //Initialize the matrices - vector pmatrix; pmatrix.resize(size, 0.0); - vector permuted; permuted.resize(size, 0.0); - vector< vector > storage; storage.resize(row); - for (int i = 0; i < storage.size(); i++) { storage[i].resize(9, 0.0); } - - //Produces the sum of each column - vector total; total.resize(column, 0.0); - vector ratio; ratio.resize(column, 0.0); - double total1 = 0.0; double total2 = 0.0; - - //total[i] = total abundance for group[i] + secondGroupingStart = secGroupingStart; //g + + vector< vector > Pmatrix; Pmatrix.resize(row); + for (int i = 0; i < row; i++) { Pmatrix[i].resize(column, 0.0); } // the relative proportion matrix + vector< vector > C1; C1.resize(row); + for (int i = 0; i < row; i++) { C1[i].resize(3, 0.0); } // statistic profiles for class1 and class 2 + vector< vector > C2; C2.resize(row); // mean[1], variance[2], standard error[3] + for (int i = 0; i < row; i++) { C2[i].resize(3, 0.0); } + vector T_statistics; T_statistics.resize(row, 1); // a place to store the true t-statistics + vector pvalues; pvalues.resize(row, 1); // place to store pvalues + vector qvalues; qvalues.resize(row, 1); // stores qvalues + + //************************************* + // convert to proportions + // generate Pmatrix + //************************************* + vector totals; totals.resize(column, 0); // sum of columns + //total[i] = total abundance for group[i] for (int i = 0; i < column; i++) { for (int j = 0; j < row; j++) { - total[i] += data[j][i]; + totals[i] += data[j][i]; } - } - - //total for first grouping - for (int i = 0; i < secondGroupingStart; i++) { total1 += total[i]; } - - //total for second grouping - for (int i = secondGroupingStart; i < column; i++) { total2 += total[i]; } - - //Creates the ratios by first finding the minimum of totals - double min = total[0]; - for (int i = 0; i < total.size(); i++) { - if (total[i] < min) { min = total[i]; } - } - - //sanity check - if (min <= 0.0) { m->mothurOut("[ERROR]: the sum of one of the columns <= 0."); m->mothurOutEndLine(); return 0; } - - //Ratio time... - for(int i = 0; i < ratio.size(); i++){ ratio[i] = total[i] / min; } - - //Change matrix into an array as received by R for compatibility - kept to be consistent with original - int count = 0; - for(int i = 0; i < column; i++){ - for(int j = 0; j < row; j++){ - pmatrix[count]=data[j][i]; - count++; + } + + for (int i = 0; i < column; i++) { + for (int j = 0; j < row; j++) { + Pmatrix[j][i] = data[j][i]/totals[i]; + } - } - - if(row == 1){ - for (int i =0; i < column; i++){ pmatrix[i] /= ratio[i]; } - }else { - count = 0; int j=-1; - - for (int i=0; i < size; i++) { - if (count % row == 0) { j++; } - pmatrix[i] /= ratio[j]; - count++; - } - } - - vector permuted_ttests; permuted_ttests.resize(row, 0.0); - vector pvalues; pvalues.resize(row, 0.0); - vector tinitial; tinitial.resize(row, 0.0); - - if (m->control_pressed) { return 1; } - - //Find the initial values for the matrix. - start(pmatrix, secondGroupingStart, tinitial, storage); - - if (m->control_pressed) { return 1; } - - // Start the calculations. - if ( (column == 2) || (secondGroupingStart < 8) || ((column-secondGroupingStart) < 8) ){ - - vector fish; fish.resize(row, 0.0); + } + + //#******************************************************************************** + //# ************************** STATISTICAL TESTING ******************************** + //#******************************************************************************** + + if (column == 2){ //# then we have a two sample comparison + //#************************************************************ + //# generate p values fisher's exact test + //#************************************************************ + double total1, total2; + //total for first grouping + for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; } + + //total for second grouping + for (int i = secondGroupingStart; i < column; i++) { total2 += totals[i]; } + + vector fish; fish.resize(row, 0.0); vector fish2; fish2.resize(row, 0.0); - + for(int i = 0; i < row; i++){ for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; } for(int j = secondGroupingStart; j < column; j++) { fish2[i] += data[i][j]; } - //vector tempData; tempData.resize(4, 0.0); double f11, f12, f21, f22; f11 = fish[i]; f12 = fish2[i]; f21 = total1 - fish[i]; f22 = total2 - fish2[i]; - double pre = 0.0; - MothurFisher fisher; - pre = fisher.fexact(f11, f12, f21, f22); - + double pre = fisher.fexact(f11, f12, f21, f22); + if (pre > 0.999999999) { pre = 1.0; } + if (m->control_pressed) { return 1; } - if (pre > 0.999999999) { pre = 1.0; } - storage[i][8] = pre; pvalues[i] = pre; } - - }else { - - testp(permuted_ttests, permuted, pmatrix, secondGroupingStart, tinitial, pvalues); - - if (m->control_pressed) { return 1; } - - // Checks to make sure the matrix isn't sparse. - vector sparse; sparse.resize(row, 0.0); - vector sparse2; sparse2.resize(row, 0.0); - - int c = 0; - + + //#************************************* + //# calculate q values from p values + //#************************************* + qvalues = calc_qvalues(pvalues); + + }else { //we have multiple subjects per population + + //#************************************* + //# generate statistics mean, var, stderr + //#************************************* + for(int i = 0; i < row; i++){ // for each taxa + //# find the mean of each group + double g1Total = 0.0; double g2Total = 0.0; + for (int j = 0; j < secondGroupingStart; j++) { g1Total += Pmatrix[i][j]; } + C1[i][0] = g1Total/(double)(secondGroupingStart); + for (int j = secondGroupingStart; j < column; j++) { g2Total += Pmatrix[i][j]; } + C2[i][0] = g2Total/(double)(column-secondGroupingStart); + + //# find the variance of each group + double g1Var = 0.0; double g2Var = 0.0; + for (int j = 0; j < secondGroupingStart; j++) { g1Var += pow((Pmatrix[i][j]-C1[i][0]), 2); } + C1[i][1] = g1Var/(double)(secondGroupingStart-1); + for (int j = secondGroupingStart; j < column; j++) { g2Var += pow((Pmatrix[i][j]-C2[i][0]), 2); } + C2[i][1] = g2Var/(double)(column-secondGroupingStart-1); + + //# find the std error of each group -std err^2 (will change to std err at end) + C1[i][2] = C1[i][1]/(double)(secondGroupingStart); + C2[i][2] = C2[i][1]/(double)(column-secondGroupingStart); + } + + //#************************************* + //# two sample t-statistics + //#************************************* + for(int i = 0; i < row; i++){ // # for each taxa + double xbar_diff = C1[i][0] - C2[i][0]; + double denom = sqrt(C1[i][2] + C2[i][2]); + T_statistics[i] = xbar_diff/denom; // calculate two sample t-statistic + } + + /*for (int i = 0; i < row; i++) { + for (int j = 0; j < 3; j++) { + cout << "C1[" << i+1 << "," << j+1 << "]=" << C1[i][j] << ";" << endl; + cout << "C2[" << i+1 << "," << j+1 << "]=" << C2[i][j] << ";" << endl; + } + cout << "T_statistics[" << i+1 << "]=" << T_statistics[i] << ";" << endl; + }*/ + //#************************************* + //# generate initial permuted p-values + //#************************************* + pvalues = permuted_pvalues(Pmatrix, T_statistics, data); + + //#************************************* + //# generate p values for sparse data + //# using fisher's exact test + //#************************************* + double total1, total2; + //total for first grouping + for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; } + + //total for second grouping + for (int i = secondGroupingStart; i < column; i++) { total2 += totals[i]; } + + vector fish; fish.resize(row, 0.0); + vector fish2; fish2.resize(row, 0.0); + for(int i = 0; i < row; i++){ - for(int j = 0; j < secondGroupingStart; j++) { sparse[i] += data[i][j]; } - if(sparse[i] < (double)secondGroupingStart) { c++; } + for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; } + for(int j = secondGroupingStart; j < column; j++) { fish2[i] += data[i][j]; } + + if ((fish[1] < secondGroupingStart) && (fish2[i] < (column-secondGroupingStart))) { + double f11, f12, f21, f22; + f11 = fish[i]; + f12 = fish2[i]; + f21 = total1 - fish[i]; + f22 = total2 - fish2[i]; - // ?<= for col - for(int j = secondGroupingStart; j < column; j++) { sparse2[i] += data[i][j]; } - if( (sparse2[i] < (double)(column-secondGroupingStart))) { c++; } + MothurFisher fisher; + double pre = fisher.fexact(f11, f12, f21, f22); + if (pre > 0.999999999) { pre = 1.0; } + + if (m->control_pressed) { return 1; } - if (c == 2) { - c=0; - double f11,f12,f21,f22; - - f11=sparse[i]; sparse[i]=0; - f12=sparse2[i]; sparse2[i]=0; - f21 = total1 - f11; - f22 = total2 - f12; - - double pre = 0.0; - - MothurFisher fisher; - pre = fisher.fexact(f11, f12, f21, f22); - - if (m->control_pressed) { return 1; } - - if (pre > 0.999999999){ - pre = 1.0; - } - - storage[i][8] = pre; - pvalues[i] = pre; - } + pvalues[i] = pre; + } } - - bflag = 1; - } - // Calculates the mean of counts (not normalized) - vector< vector > temp; temp.resize(row); - for (int i = 0; i < temp.size(); i++) { temp[i].resize(2, 0.0); } - - for (int j = 0; j < row; j++){ - if (m->control_pressed) { return 1; } - - for (int i = 0; i < secondGroupingStart; i++){ temp[j][0] += data[j][i]; } - temp[j][0] /= (double)secondGroupingStart; - - for(int i = secondGroupingStart; i < column; i++){ temp[j][1] += data[j][i]; } - temp[j][1] /= (double)(column-secondGroupingStart); - } - - for(int i = 0; i < row; i++){ - if (m->control_pressed) { return 1; } - - storage[i][3]=temp[i][0]; - storage[i][7]=temp[i][1]; - storage[i][8]=pvalues[i]; - } - - vector qvalues = calc_qvalues(pvalues); - - // BACKUP checks - cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint); - for (int i = 0; i < row; i++){ - - if (m->control_pressed) { return 1; } - - if(qvalues[i] < threshold){ - m->mothurOut("Feature " + toString((i+1)) + " is significant, q = "); - cout << qvalues[i]; - m->mothurOutJustToLog(toString(qvalues[i])); m->mothurOutEndLine(); - } - } - - // And now we write the files to a text file. + //#************************************* + //# calculate q values from p values + //#************************************* + qvalues = calc_qvalues(pvalues); + + //#************************************* + //# convert stderr^2 to std error + //#************************************* + for(int i = 0; i < row; i++){ + C1[i][2] = sqrt(C1[i][2]); + C2[i][2] = sqrt(C2[i][2]); + } + } + + // And now we write the files to a text file. struct tm *local; time_t t; t = time(NULL); local = localtime(&t); @@ -229,264 +209,149 @@ int MothurMetastats::runMetastats(string outputFileName, vector< vector ofstream out; m->openOutputFile(outputFileName, out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); - + out << "Local time and date of test: " << asctime(local) << endl; out << "# rows = " << row << ", # col = " << column << ", g = " << secondGroupingStart << endl << endl; - if (bflag == 1){ out << numPermutations << " permutations" << endl << endl; } + out << numPermutations << " permutations" << endl << endl; //output column headings - not really sure... documentation labels 9 columns, there are 10 in the output file //storage 0 = meanGroup1 - line 529, 1 = varGroup1 - line 532, 2 = err rate1 - line 534, 3 = mean of counts group1?? - line 291, 4 = meanGroup2 - line 536, 5 = varGroup2 - line 539, 6 = err rate2 - line 541, 7 = mean of counts group2?? - line 292, 8 = pvalues - line 293 - out << "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean_of_counts(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tmean_of_counts(group1)\tp-value\tq-value\n"; + out << "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tp-value\tq-value\n"; for(int i = 0; i < row; i++){ if (m->control_pressed) { out.close(); return 0; } - out << (i+1); - for(int j = 0; j < 9; j++){ out << '\t' << storage[i][j]; } - out << '\t' << qvalues[i]; - out << endl; + //if there are binlabels use them otherwise count. + if (m->binLabelsInFile.size() == row) { out << m->binLabelsInFile[i] << '\t'; } + else { out << (i+1) << '\t'; } + + out << C1[i][0] << '\t' << C1[i][1] << '\t' << C1[i][2] << '\t' << C2[i][0] << '\t' << C2[i][1] << '\t' << C2[i][2] << '\t' << pvalues[i] << '\t' << qvalues[i] << endl; } out << endl << endl; out.close(); - - return 0; - - }catch(exception& e) { - m->errorOut(e, "MothurMetastats", "runMetastats"); - exit(1); - } -} -/***********************************************************/ -//Find the initial values for the matrix -int MothurMetastats::start(vector& Imatrix, int secondGroupingStart, vector& initial, vector< vector >& storage) { - try { - - int a = row; a*=4; - - double xbardiff = 0.0; double denom = 0.0; - vector store; store.resize(a, 0.0); - vector tool; tool.resize(a, 0.0); - vector< vector > C1; C1.resize(row); - for (int i = 0; i < C1.size(); i++) { C1[i].resize(3, 0.0); } - vector< vector > C2; C2.resize(row); - for (int i = 0; i < C2.size(); i++) { C2[i].resize(3, 0.0); } - - meanvar(Imatrix, secondGroupingStart, store); - - if (m->control_pressed) { return 0; } - - //copy store into tool - tool = store; - - for (int i = 0; i < row; i++){ - C1[i][0]=tool[i]; //mean group 1 - storage[i][0]=C1[i][0]; - C1[i][1]=tool[i+row+row]; // var group 1 - storage[i][1]=C1[i][1]; - C1[i][2]=C1[i][1]/(secondGroupingStart); - storage[i][2]=sqrt(C1[i][2]); - - C2[i][0]=tool[i+row]; // mean group 2 - storage[i][4]=C2[i][0]; - C2[i][1]=tool[i+row+row+row]; // var group 2 - storage[i][5]=C2[i][1]; - C2[i][2]=C2[i][1]/(column-secondGroupingStart); - storage[i][6]=sqrt(C2[i][2]); - } - - if (m->control_pressed) { return 0; } - - for (int i = 0; i < row; i++){ - xbardiff = C1[i][0]-C2[i][0]; - denom = sqrt(C1[i][2]+C2[i][2]); - initial[i]=fabs(xbardiff/denom); - } - return 0; - - }catch(exception& e) { - m->errorOut(e, "MothurMetastats", "start"); - exit(1); - } -} -/***********************************************************/ -int MothurMetastats::meanvar(vector& pmatrix, int secondGroupingStart, vector& store) { - try { - vector temp; temp.resize(row, 0.0); - vector temp2; temp2.resize(row, 0.0); - vector var; var.resize(row, 0.0); - vector var2; var2.resize(row, 0.0); - - double a = secondGroupingStart; - double b = column - a; - int m = a * row; - int n = row * column; - - for (int i = 0; i < m; i++) { temp[i%row] += pmatrix[i]; } - for (int i = 0; i < n; i++) { temp2[i%row]+= pmatrix[i]; } - for (int i = 0; i < row; i++) { temp2[i] -= temp[i]; } - for (int i = 0; i <= row-1;i++) { - store[i] = temp[i]/a; - store[i+row]=temp2[i]/b; - } - - //That completes the mean calculations. - - for (int i = 0; i < m; i++) { var[i%row] += pow((pmatrix[i]-store[i%row]),2); } - for (int i = m; i < n; i++) { var2[i%row]+= pow((pmatrix[i]-store[(i%row)+row]),2); } - for (int i = 0; i <= row-1; i++){ - store[i+2*row]=var[i]/(a-1); - store[i+3*row]=var2[i]/(b-1); - } - - // That completes var calculations. - - return 0; - - }catch(exception& e) { - m->errorOut(e, "MothurMetastats", "meanvar"); - exit(1); - } -} -/***********************************************************/ -int MothurMetastats::testp(vector& permuted_ttests, vector& permuted, vector& Imatrix, int secondGroupingStart, vector& Tinitial, vector& ps) { - try { - - vector Tvalues; Tvalues.resize(row, 0.0); - vector counter; counter.resize(row, 0.0); - int a, b, n; - - a = numPermutations; - b = row; - n = a*b; - - for (int j = 1; j <= row; j++) { - if (m->control_pressed) { return 0; } - permute_matrix(Imatrix, permuted, secondGroupingStart, Tvalues, Tinitial, counter); - } - - for(int j = 0; j < row; j++) { - if (m->control_pressed) { return 0; } - ps[j] = ((counter[j]+1)/(double)(a+1)); - } - - return 0; - - }catch(exception& e) { - m->errorOut(e, "MothurMetastats", "testp"); - exit(1); - } -} -/***********************************************************/ -int MothurMetastats::permute_matrix(vector& Imatrix, vector& permuted, int secondGroupingStart, vector& trial_ts, vector& Tinitial, vector& counter1){ - try { - - vector y; y.resize(column, 0); - for (int i = 1; i <= column; i++){ y[i-1] = i; } - - permute_array(y); - - int f = 0; int c = 0; int k = 0; - for (int i = 0; i < column; i++){ - - if (m->control_pressed) { return 0; } - - f = y[i]; //column number - c = 1; - c *= (f-1); - c *= row; - if (f == 1){ c = 0; } // starting value position in the Imatrix - - for(int j = 1; j <= row; j++){ - permuted[k] = Imatrix[c]; - c++; k++; - } - } - - calc_twosample_ts(permuted, secondGroupingStart, trial_ts, Tinitial, counter1); - - return 0; - - }catch(exception& e) { - m->errorOut(e, "MothurMetastats", "permute_matrix"); - exit(1); - } + + return 0; + + }catch(exception& e) { + m->errorOut(e, "MothurMetastats", "runMetastats"); + exit(1); + } } /***********************************************************/ -int MothurMetastats::permute_array(vector& array) { +vector MothurMetastats::permuted_pvalues(vector< vector >& Imatrix, vector& tstats, vector< vector >& Fmatrix) { try { - static int seeded = 0; - - if (! seeded) { - seeded = 1; - srand(time(NULL)); - } - - for (int i = 0; i < array.size(); i++) { - if (m->control_pressed) { return 0; } - - int selection = rand() % (array.size() - i); - int tmp = array[i + selection]; - array[i + selection] = array[i]; - array[i] = tmp; - } - - return 0; - - }catch(exception& e) { - m->errorOut(e, "MothurMetastats", "permute_array"); - exit(1); - } + //# matrix stores tstats for each taxa(row) for each permuted trial(column) + vector ps; ps.resize(row, 0.0); //# to store the pvalues + vector< vector > permuted_ttests; permuted_ttests.resize(numPermutations); + for (int i = 0; i < numPermutations; i++) { permuted_ttests[i].resize(row, 0.0); } + + //# calculate null version of tstats using B permutations. + for (int i = 0; i < numPermutations; i++) { + permuted_ttests[i] = permute_and_calc_ts(Imatrix); + } + + //# calculate each pvalue using the null ts + if ((secondGroupingStart) < 8 || (column-secondGroupingStart) < 8){ + vector< vector > cleanedpermuted_ttests; cleanedpermuted_ttests.resize(numPermutations); //# the array pooling just the frequently observed ts + //# then pool the t's together! + //# count how many high freq taxa there are + int hfc = 1; + for (int i = 0; i < row; i++) { // # for each taxa + double group1Total = 0.0; double group2Total = 0.0; + for(int j = 0; j < secondGroupingStart; j++) { group1Total += Fmatrix[i][j]; } + for(int j = secondGroupingStart; j < column; j++) { group2Total += Fmatrix[i][j]; } + + if (group1Total >= secondGroupingStart || group2Total >= (column-secondGroupingStart)){ + hfc++; + for (int j = 0; j < numPermutations; j++) { cleanedpermuted_ttests[j].push_back(permuted_ttests[j][i]); } + } + } + + //#now for each taxa + for (int i = 0; i < row; i++) { + //number of cleanedpermuted_ttests greater than tstat[i] + int numGreater = 0; + for (int j = 0; j < numPermutations; j++) { + for (int k = 0; k < hfc; k++) { + if (cleanedpermuted_ttests[j][k] > abs(tstats[i])) { numGreater++; } + } + } + + ps[i] = (1/(double)(numPermutations*hfc))*numGreater; + } + }else{ + for (int i = 0; i < row; i++) { + //number of permuted_ttests[i] greater than tstat[i] //(sum(permuted_ttests[i,] > abs(tstats[i]))+1) + int numGreater = 1; + for (int j = 0; j < numPermutations; j++) { if (permuted_ttests[j][i] > abs(tstats[i])) { numGreater++; } } + ps[i] = (1/(double)(numPermutations+1))*numGreater; + } + } + + return ps; + + }catch(exception& e) { + m->errorOut(e, "MothurMetastats", "permuted_pvalues"); + exit(1); + } } /***********************************************************/ -int MothurMetastats::calc_twosample_ts(vector& Pmatrix, int secondGroupingStart, vector& Ts, vector& Tinitial, vector& counter) { +vector MothurMetastats::permute_and_calc_ts(vector< vector >& Imatrix) { try { - int a = row * 4; - - vector< vector > C1; C1.resize(row); - for (int i = 0; i < C1.size(); i++) { C1[i].resize(3, 0.0); } - vector< vector > C2; C2.resize(row); - for (int i = 0; i < C2.size(); i++) { C2[i].resize(3, 0.0); } - vector storage; storage.resize(a, 0.0); - vector tool; tool.resize(a, 0.0); - double xbardiff = 0.0; double denom = 0.0; - - meanvar(Pmatrix, secondGroupingStart, storage); - - for(int i = 0;i <= (a-1); i++) { - if (m->control_pressed) { return 0; } - tool[i] = storage[i]; - } - - for (int i = 0; i < row; i++){ - if (m->control_pressed) { return 0; } - C1[i][0]=tool[i]; - C1[i][1]=tool[i+row+row]; - C1[i][2]=C1[i][1]/(secondGroupingStart); - - C2[i][0]=tool[i+row]; - C2[i][1]=tool[i+row+row+row]; // var group 2 - C2[i][2]=C2[i][1]/(column-secondGroupingStart); - } - - for (int i = 0; i < row; i++){ - if (m->control_pressed) { return 0; } - xbardiff = C1[i][0]-C2[i][0]; - denom = sqrt(C1[i][2]+C2[i][2]); - Ts[i]=fabs(xbardiff/denom); - if (fabs(Ts[i])>(fabs(Tinitial[i])+.0000000000001)){ //13th place - counter[i]++; - } - } - - return 0; - - }catch(exception& e) { - m->errorOut(e, "MothurMetastats", "calc_twosample_ts"); - exit(1); - } + vector< vector > permutedMatrix = Imatrix; + + //randomize columns, ie group abundances. + for (int i = 0; i < permutedMatrix.size(); i++) { random_shuffle(permutedMatrix[i].begin(), permutedMatrix[i].end()); } + + //calc ts + vector< vector > C1; C1.resize(row); + for (int i = 0; i < row; i++) { C1[i].resize(3, 0.0); } // statistic profiles for class1 and class 2 + vector< vector > C2; C2.resize(row); // mean[1], variance[2], standard error[3] + for (int i = 0; i < row; i++) { C2[i].resize(3, 0.0); } + vector Ts; Ts.resize(row, 0.0); // a place to store the true t-statistics + + //#************************************* + //# generate statistics mean, var, stderr + //#************************************* + for(int i = 0; i < row; i++){ // for each taxa + //# find the mean of each group + double g1Total = 0.0; double g2Total = 0.0; + for (int j = 0; j < secondGroupingStart; j++) { g1Total += permutedMatrix[i][j]; } + C1[i][0] = g1Total/(double)(secondGroupingStart); + for (int j = secondGroupingStart; j < column; j++) { g2Total += permutedMatrix[i][j]; } + C2[i][0] = g2Total/(double)(column-secondGroupingStart); + + //# find the variance of each group + double g1Var = 0.0; double g2Var = 0.0; + for (int j = 0; j < secondGroupingStart; j++) { g1Var += pow((permutedMatrix[i][j]-C1[i][0]), 2); } + C1[i][1] = g1Var/(double)(secondGroupingStart-1); + for (int j = secondGroupingStart; j < column; j++) { g2Var += pow((permutedMatrix[i][j]-C2[i][0]), 2); } + C2[i][1] = g2Var/(double)(column-secondGroupingStart-1); + + //# find the std error of each group -std err^2 (will change to std err at end) + C1[i][2] = C1[i][1]/(double)(secondGroupingStart); + C2[i][2] = C2[i][1]/(double)(column-secondGroupingStart); + } + + //#************************************* + //# two sample t-statistics + //#************************************* + for(int i = 0; i < row; i++){ // # for each taxa + double xbar_diff = C1[i][0] - C2[i][0]; + double denom = sqrt(C1[i][2] + C2[i][2]); + Ts[i] = abs(xbar_diff/denom); // calculate two sample t-statistic + } + + return Ts; + + + }catch(exception& e) { + m->errorOut(e, "MothurMetastats", "permuted_ttests"); + exit(1); + } } /***********************************************************/ vector MothurMetastats::calc_qvalues(vector& pValues) { diff --git a/mothurmetastats.h b/mothurmetastats.h index d80b687..4d6cf91 100644 --- a/mothurmetastats.h +++ b/mothurmetastats.h @@ -23,9 +23,12 @@ class MothurMetastats { private: MothurOut* m; - int row, column, numPermutations; + int row, column, numPermutations, secondGroupingStart; double threshold; - + + vector permuted_pvalues(vector< vector >&, vector&, vector< vector >&); + vector permute_and_calc_ts(vector< vector >&); + int start(vector&, int, vector&, vector< vector >&); //Find the initial values for the matrix int meanvar(vector&, int, vector&); int testp(vector&, vector&, vector&, int, vector&, vector&); diff --git a/mothurout.h b/mothurout.h index 12de987..e1c8222 100644 --- a/mothurout.h +++ b/mothurout.h @@ -138,7 +138,7 @@ class MothurOut { int getRandomIndex(int); //highest int control_pressed; - bool executing, runParse, jumble, gui; + bool executing, runParse, jumble, gui, mothurCalling; //current files - if you add a new type you must edit optionParser->getParameters, get.current command and mothurOut->printCurrentFiles/clearCurrentFiles. string getPhylipFile() { return phylipfile; } @@ -219,6 +219,7 @@ class MothurOut { gui = false; printedHeaders = false; commandInputsConvertError = false; + mothurCalling = false; sharedHeaderMode = ""; } ~MothurOut(); diff --git a/optionparser.cpp b/optionparser.cpp index 1e8c445..06a900d 100644 --- a/optionparser.cpp +++ b/optionparser.cpp @@ -123,7 +123,7 @@ bool OptionParser::getNameFile(vector files) { string namefile = m->getNameFile(); bool match = false; - if (namefile != "") { + if ((namefile != "")&&(!m->mothurCalling)) { string temp = m->getRootName(m->getSimpleName(namefile)); vector rootName; m->splitAtChar(temp, rootName, '.'); diff --git a/pcrseqscommand.h b/pcrseqscommand.h new file mode 100644 index 0000000..07ca8d5 --- /dev/null +++ b/pcrseqscommand.h @@ -0,0 +1,356 @@ +#ifndef Mothur_pcrseqscommand_h +#define Mothur_pcrseqscommand_h + +// +// pcrseqscommand.h +// Mothur +// +// Created by Sarah Westcott on 3/14/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + + +#include "command.hpp" +#include "sequence.hpp" +#include "trimoligos.h" +#include "alignment.hpp" +#include "needlemanoverlap.hpp" + +class PcrSeqsCommand : public Command { +public: + PcrSeqsCommand(string); + PcrSeqsCommand(); + ~PcrSeqsCommand(){} + + vector setParameters(); + string getCommandName() { return "pcr.seqs"; } + string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; } + string getDescription() { return "pcr.seqs"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + + struct linePair { + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} + linePair() {} + }; + + vector lines; + bool getOligos(vector >&, vector >&, vector >&); + bool abort, keepprimer; + string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch; + int start, end, pdiffs, processors, length; + + vector revPrimer, outputNames; + vector primers; + + int writeAccnos(set); + int readName(set&); + int readGroup(set); + int readTax(set); + bool readOligos(); + bool readEcoli(); + int driverPcr(string, string, string, set&, linePair); + int createProcesses(string, string, string, set&); + bool findForward(Sequence&, int&, int&); + bool findReverse(Sequence&, int&, int&); + bool isAligned(string, map&); + bool compareDNASeq(string, string); + string reverseOligo(string); +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct pcrData { + string filename; + string goodFasta, badFasta, oligosfile, ecolifile, nomatch; + unsigned long long fstart; + unsigned long long fend; + int count, start, end, length; + MothurOut* m; + vector primers; + vector revPrimer; + set badSeqNames; + bool keepprimer; + + + pcrData(){} + pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector pr, vector rpr, string nm, bool kp, int st, int en, int l, unsigned long long fst, unsigned long long fen) { + filename = f; + goodFasta = gf; + badFasta = bfn; + m = mout; + oligosfile = ol; + ecolifile = ec; + primers = pr; + revPrimer = rpr; + nomatch = nm; + keepprimer = kp; + start = st; + end = en; + length = l; + fstart = fst; + fend = fen; + count = 0; + } +}; +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ + pcrData* pDataArray; + pDataArray = (pcrData*)lpParam; + + try { + ofstream goodFile; + pDataArray->m->openOutputFile(pDataArray->goodFasta, goodFile); + + ofstream badFile; + pDataArray->m->openOutputFile(pDataArray->badFasta, badFile); + + ifstream inFASTA; + pDataArray->m->openInputFile(pDataArray->filename, inFASTA); + + //print header if you are process 0 + if ((pDataArray->fstart == 0) || (pDataArray->fstart == 1)) { + inFASTA.seekg(0); + }else { //this accounts for the difference in line endings. + inFASTA.seekg(pDataArray->fstart-1); pDataArray->m->gobble(inFASTA); + } + + set lengths; + pDataArray->count = pDataArray->fend; + for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { break; } + + Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA); + + string trashCode = ""; + if (currSeq.getName() != "") { + + bool goodSeq = true; + if (pDataArray->oligosfile != "") { + map mapAligned; + //bool aligned = isAligned(currSeq.getAligned(), mapAligned); + /////////////////////////////////////////////////////////////// + bool aligned = false; + string seq = currSeq.getAligned(); + int countBases = 0; + for (int k = 0; k < seq.length(); k++) { + if (!isalpha(seq[k])) { aligned = true; } + else { mapAligned[countBases] = k; countBases++; } //maps location in unaligned -> location in aligned. + } //ie. the 3rd base may be at spot 10 in the alignment + //later when we trim we want to trim from spot 10. + /////////////////////////////////////////////////////////////// + + //process primers + if (pDataArray->primers.size() != 0) { + int primerStart = 0; int primerEnd = 0; + //bool good = findForward(currSeq, primerStart, primerEnd); + /////////////////////////////////////////////////////////////// + bool good = false; + string rawSequence = currSeq.getUnaligned(); + + for(int j=0;jprimers.size();j++){ + string oligo = pDataArray->primers[j]; + + if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; } + + if(rawSequence.length() < oligo.length()) { break; } + + //search for primer + int olength = oligo.length(); + for (int l = 0; l < rawSequence.length()-olength; l++){ + if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; } + string rawChunk = rawSequence.substr(l, olength); + //compareDNASeq(oligo, rawChunk) + //////////////////////////////////////////////////////// + bool success = 1; + for(int k=0;knomatch == "reject") { goodSeq = false; } trashCode += "f"; } + else{ + //are you aligned + if (aligned) { + if (!pDataArray->keepprimer) { currSeq.padToPos(mapAligned[primerEnd]); } + else { currSeq.padToPos(mapAligned[primerStart]); } + }else { + if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } + else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } + } + } + } + + //process reverse primers + if (pDataArray->revPrimer.size() != 0) { + int primerStart = 0; int primerEnd = 0; + bool good = false; + //findReverse(currSeq, primerStart, primerEnd); + /////////////////////////////////////////////////////////////// + string rawSequence = currSeq.getUnaligned(); + + for(int j=0;jrevPrimer.size();j++){ + string oligo = pDataArray->revPrimer[j]; + if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; } + if(rawSequence.length() < oligo.length()) { break; } + + //search for primer + int olength = oligo.length(); + for (int l = rawSequence.length()-olength; l >= 0; l--){ + + string rawChunk = rawSequence.substr(l, olength); + //compareDNASeq(oligo, rawChunk) + //////////////////////////////////////////////////////// + bool success = 1; + for(int k=0;knomatch == "reject") { goodSeq = false; } trashCode += "r"; } + else{ + //are you aligned + if (aligned) { + if (!pDataArray->keepprimer) { currSeq.padFromPos(mapAligned[primerStart]); } + else { currSeq.padFromPos(mapAligned[primerEnd]); } + } + else { + if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); } + else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); } + } + } + } + }else if (pDataArray->ecolifile != "") { + //make sure the seqs are aligned + lengths.insert(currSeq.getAligned().length()); + if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; } + else if (currSeq.getAligned().length() != pDataArray->length) { + pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break; + }else { + currSeq.padToPos(pDataArray->start); + currSeq.padFromPos(pDataArray->end); + } + }else{ //using start and end to trim + //make sure the seqs are aligned + lengths.insert(currSeq.getAligned().length()); + if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; } + else { + if (pDataArray->start != -1) { currSeq.padToPos(pDataArray->start); } + if (pDataArray->end != -1) { + if (pDataArray->end > currSeq.getAligned().length()) { pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; } + else { + currSeq.padFromPos(pDataArray->end); + } + } + } + } + + if(goodSeq == 1) { currSeq.printSequence(goodFile); } + else { + pDataArray->badSeqNames.insert(currSeq.getName()); + currSeq.setName(currSeq.getName() + '|' + trashCode); + currSeq.printSequence(badFile); + } + } + + //report progress + if((i+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine(); } + } + //report progress + if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } + + goodFile.close(); + inFASTA.close(); + badFile.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction"); + exit(1); + } +} + +#endif + +/**************************************************************************************************/ + + + +#endif diff --git a/phylotypecommand.cpp b/phylotypecommand.cpp index f402f2a..2d2db08 100644 --- a/phylotypecommand.cpp +++ b/phylotypecommand.cpp @@ -232,6 +232,7 @@ int PhylotypeCommand::execute(){ ListVector list; list.setLabel(level); + //go through nodes and build listvector for (itCurrent = currentNodes.begin(); itCurrent != currentNodes.end(); itCurrent++) { @@ -244,6 +245,7 @@ int PhylotypeCommand::execute(){ //make the names compatable with listvector string name = ""; for (int i = 0; i < names.size(); i++) { + if (names[i] != "unknown") { if (namefile != "") { map::iterator itNames = namemap.find(names[i]); //make sure this name is in namefile @@ -255,9 +257,8 @@ int PhylotypeCommand::execute(){ } } name = name.substr(0, name.length()-1); //rip off extra ',' - //add bin to list vector - list.push_back(name); + if (name != "") { list.push_back(name); } //caused by unknown } //print listvector diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp new file mode 100644 index 0000000..7f34a03 --- /dev/null +++ b/prcseqscommand.cpp @@ -0,0 +1,1027 @@ +// +// prcseqscommand.cpp +// Mothur +// +// Created by Sarah Westcott on 3/14/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "pcrseqscommand.h" + +//********************************************************************************************************************** +vector PcrSeqsCommand::setParameters(){ + try { + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); + CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); + CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli); + CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart); + CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend); + CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string PcrSeqsCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The pcr.seqs command reads a fasta file ...\n"; + + helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; + helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "getHelpString"); + exit(1); + } +} + + +//********************************************************************************************************************** + +PcrSeqsCommand::PcrSeqsCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["taxonomy"] = tempOutNames; + outputTypes["group"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand"); + exit(1); + } +} +//*************************************************************************************************************** + +PcrSeqsCommand::PcrSeqsCommand(string option) { + try { + + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["taxonomy"] = tempOutNames; + outputTypes["group"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("oligos"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["oligos"] = inputDir + it->second; } + } + + it = parameters.find("ecoli"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["ecoli"] = inputDir + it->second; } + } + + it = parameters.find("taxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["taxonomy"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not found") { + fastafile = m->getFastaFile(); + if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } + }else if (fastafile == "not open") { fastafile = ""; abort = true; } + else { m->setFastaFile(fastafile); } + + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + string temp; + temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; } + keepprimer = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "oligos", true); + if (temp == "not found"){ oligosfile = ""; } + else if(temp == "not open"){ oligosfile = ""; abort = true; } + else { oligosfile = temp; m->setOligosFile(oligosfile); } + + ecolifile = validParameter.validFile(parameters, "ecoli", true); + if (ecolifile == "not found"){ ecolifile = ""; } + else if(ecolifile == "not open"){ ecolifile = ""; abort = true; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not found"){ namefile = ""; } + else if(namefile == "not open"){ namefile = ""; abort = true; } + else { m->setNameFile(namefile); } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not found"){ groupfile = ""; } + else if(groupfile == "not open"){ groupfile = ""; abort = true; } + else { m->setGroupFile(groupfile); } + + taxfile = validParameter.validFile(parameters, "taxonomy", true); + if (taxfile == "not found"){ taxfile = ""; } + else if(taxfile == "not open"){ taxfile = ""; abort = true; } + else { m->setTaxonomyFile(taxfile); } + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, start); + + temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, end); + + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + m->mothurConvert(temp, processors); + + nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; } + + if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; } + + //didnt set anything + if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) { + m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true; + } + + if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end != -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; } + + if ((ecolifile != "") && (start != -1) && (end != -1)) { + m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true; + } + + + if ((oligosfile != "") && (ecolifile != "")) { + m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true; + } + + //check to make sure you didn't forget the name file by mistake + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } + + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand"); + exit(1); + } +} +//*************************************************************************************************************** + +int PcrSeqsCommand::execute(){ + try{ + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + int start = time(NULL); + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } + string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta"; + outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); + + string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta"; + outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); + + length = 0; + if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; } + if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; } + + vector positions; + int numFastaSeqs = 0; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + positions = m->divideFile(fastafile, processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } +#else + if (processors == 1) { + lines.push_back(linePair(0, 1000)); + }else { + positions = m->setFilePosFasta(fastafile, numFastaSeqs); + if (positions.size() < processors) { processors = positions.size(); } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); + } + } +#endif + if (m->control_pressed) { return 0; } + + set badNames; + if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); } + else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); } + + if (m->control_pressed) { return 0; } + + writeAccnos(badNames); + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (namefile != "") { readName(badNames); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (groupfile != "") { readGroup(badNames); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (taxfile != "") { readTax(badNames); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + m->mothurOutEndLine(); + + //set fasta file as new current fastafile + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + + itTypes = outputTypes.find("group"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } + } + + itTypes = outputTypes.find("accnos"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); } + } + + itTypes = outputTypes.find("taxonomy"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } + } + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); + m->mothurOutEndLine(); + + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "execute"); + exit(1); + } +} +/**************************************************************************************************/ +int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set& badSeqNames) { + try { + + vector processIDS; + int process = 1; + int num = 0; + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]); + + //pass numSeqs to parent + ofstream out; + string tempFile = filename + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + out << num << '\t' << badSeqNames.size() << endl; + for (set::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) { + out << (*it) << endl; + } + out.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]); + + //force parent to wait until all the processes are done + for (int i=0;iopenInputFile(tempFile, in); + int numBadNames = 0; string name = ""; + if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); } + for (int j = 0; j < numBadNames; j++) { + in >> name; m->gobble(in); + badSeqNames.insert(name); + } + in.close(); m->mothurRemove(tempFile); + + m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName); + m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName); + m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp")); + } + #else + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the sumScreenData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to allow both threads to add info to badSeqNames. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; icount; + for (set::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + + for (int i = 0; i < processIDS.size(); i++) { + m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName); + m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName); + m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp")); + } + +#endif + + return num; + + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "createProcesses"); + exit(1); + } +} + +//********************************************************************************************************************** +int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set& badSeqNames, linePair filePos){ + try { + ofstream goodFile; + m->openOutputFile(goodFasta, goodFile); + + ofstream badFile; + m->openOutputFile(badFasta, badFile); + + ifstream inFASTA; + m->openInputFile(filename, inFASTA); + + inFASTA.seekg(filePos.start); + + bool done = false; + int count = 0; + set lengths; + + while (!done) { + + if (m->control_pressed) { break; } + + Sequence currSeq(inFASTA); m->gobble(inFASTA); + + string trashCode = ""; + if (currSeq.getName() != "") { + + bool goodSeq = true; + if (oligosfile != "") { + map mapAligned; + bool aligned = isAligned(currSeq.getAligned(), mapAligned); + + //process primers + if (primers.size() != 0) { + int primerStart = 0; int primerEnd = 0; + bool good = findForward(currSeq, primerStart, primerEnd); + + if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; } + else{ + //are you aligned + if (aligned) { + if (!keepprimer) { currSeq.padToPos(mapAligned[primerEnd]); } + else { currSeq.padToPos(mapAligned[primerStart]); } + }else { + if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } + else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } + } + } + } + + //process reverse primers + if (revPrimer.size() != 0) { + int primerStart = 0; int primerEnd = 0; + bool good = findReverse(currSeq, primerStart, primerEnd); + if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; } + else{ + //are you aligned + if (aligned) { + if (!keepprimer) { currSeq.padFromPos(mapAligned[primerStart]); } + else { currSeq.padFromPos(mapAligned[primerEnd]); } + } + else { + if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); } + else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); } + } + } + } + }else if (ecolifile != "") { + //make sure the seqs are aligned + lengths.insert(currSeq.getAligned().length()); + if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } + else if (currSeq.getAligned().length() != length) { + m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break; + }else { + currSeq.padToPos(start); + currSeq.padFromPos(end); + } + }else{ //using start and end to trim + //make sure the seqs are aligned + lengths.insert(currSeq.getAligned().length()); + if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } + else { + if (start != -1) { currSeq.padToPos(start); } + if (end != -1) { + if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; } + else { + currSeq.padFromPos(end); + } + } + } + } + + if(goodSeq == 1) { currSeq.printSequence(goodFile); } + else { + badSeqNames.insert(currSeq.getName()); + currSeq.setName(currSeq.getName() + '|' + trashCode); + currSeq.printSequence(badFile); + } + count++; + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + unsigned long long pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos.end)) { break; } +#else + if (inFASTA.eof()) { break; } +#endif + + //report progress + if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + } + //report progress + if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + + badFile.close(); + goodFile.close(); + inFASTA.close(); + + return count; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "driverPcr"); + exit(1); + } +} +//********************************************************************/ +bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){ + try { + + string rawSequence = seq.getUnaligned(); + + for(int j=0;jcontrol_pressed) { primerStart = 0; primerEnd = 0; return false; } + string rawChunk = rawSequence.substr(j, olength); + if(compareDNASeq(oligo, rawChunk)) { + primerStart = j; + primerEnd = primerStart + olength; + return true; + } + + } + } + + primerStart = 0; primerEnd = 0; + return false; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripForward"); + exit(1); + } +} +//******************************************************************/ +bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ + try { + + string rawSequence = seq.getUnaligned(); + + for(int i=0;i= 0; j--){ + if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } + string rawChunk = rawSequence.substr(j, olength); + + if(compareDNASeq(oligo, rawChunk)) { + primerStart = j; + primerEnd = primerStart + olength; + return true; + } + + } + } + + primerStart = 0; primerEnd = 0; + return false; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "findReverse"); + exit(1); + } +} +//********************************************************************/ +bool PcrSeqsCommand::isAligned(string seq, map& aligned){ + try { + bool isAligned = false; + + int countBases = 0; + for (int i = 0; i < seq.length(); i++) { + if (!isalpha(seq[i])) { isAligned = true; } + else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned. + } //ie. the 3rd base may be at spot 10 in the alignment + //later when we trim we want to trim from spot 10. + return isAligned; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "isAligned"); + exit(1); + } +} +//********************************************************************/ +string PcrSeqsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "reverseOligo"); + exit(1); + } +} + +//*************************************************************************************************************** +bool PcrSeqsCommand::readOligos(){ + try { + ifstream inOligos; + m->openInputFile(oligosfile, inOligos); + + string type, oligo, group; + + while(!inOligos.eof()){ + + inOligos >> type; + + if(type[0] == '#'){ //ignore + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + m->gobble(inOligos); + }else{ + m->gobble(inOligos); + //make type case insensitive + for(int i=0;i> oligo; + + for(int i=0;i> group; + }else if((type == "LINKER")||(type == "SPACER")) {;} + else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; } + } + m->gobble(inOligos); + } + inOligos.close(); + + if ((primers.size() == 0) && (revPrimer.size() == 0)) { + m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine(); + m->control_pressed = true; + return false; + } + + return true; + + }catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "readOligos"); + exit(1); + } +} +//*************************************************************************************************************** +bool PcrSeqsCommand::readEcoli(){ + try { + ifstream in; + m->openInputFile(ecolifile, in); + + //read seq + if (!in.eof()){ + Sequence ecoli(in); + length = ecoli.getAligned().length(); + start = ecoli.getStartPos(); + end = ecoli.getEndPos(); + }else { in.close(); m->control_pressed = true; return false; } + in.close(); + + return true; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "readEcoli"); + exit(1); + } + +} +//*************************************************************************************************************** +int PcrSeqsCommand::writeAccnos(set badNames){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos"; + outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName); + + ofstream out; + m->openOutputFile(outputFileName, out); + + for (set::iterator it = badNames.begin(); it != badNames.end(); it++) { + if (m->control_pressed) { break; } + out << (*it) << endl; + } + + out.close(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "writeAccnos"); + exit(1); + } + +} +//******************************************************************/ +bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){ + try { + bool success = 1; + int length = oligo.length(); + + for(int i=0;ierrorOut(e, "PcrSeqsCommand", "compareDNASeq"); + exit(1); + } + +} +//*************************************************************************************************************** +int PcrSeqsCommand::readName(set& names){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(namefile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(namefile, in); + string name, firstCol, secondCol; + + bool wroteSomething = false; + int removedCount = 0; + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> firstCol; m->gobble(in); + in >> secondCol; + + string savedSecond = secondCol; + vector parsedNames; + m->splitAtComma(secondCol, parsedNames); + + vector validSecond; validSecond.clear(); + for (int i = 0; i < parsedNames.size(); i++) { + if (names.count(parsedNames[i]) == 0) { + validSecond.push_back(parsedNames[i]); + } + } + + if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone + for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); } + removedCount += parsedNames.size(); + }else { + out << firstCol << '\t' << savedSecond << endl; + wroteSomething = true; + } + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } + outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "readName"); + exit(1); + } +} +//********************************************************************************************************************** +int PcrSeqsCommand::readGroup(set names){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(groupfile, in); + string name, group; + + bool wroteSomething = false; + int removedCount = 0; + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; //read from first column + in >> group; //read from second column + + //if this name is in the accnos file + if (names.count(name) == 0) { + wroteSomething = true; + out << name << '\t' << group << endl; + }else { removedCount++; } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } + outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine(); + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "readGroup"); + exit(1); + } +} +//********************************************************************************************************************** +int PcrSeqsCommand::readTax(set names){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile); + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(taxfile, in); + string name, tax; + + bool wroteSomething = false; + int removedCount = 0; + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; //read from first column + in >> tax; //read from second column + + //if this name is in the accnos file + if (names.count(name) == 0) { + wroteSomething = true; + out << name << '\t' << tax << endl; + }else { removedCount++; } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } + outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "readTax"); + exit(1); + } +} +/**************************************************************************************/ + + diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 3208087..23d7386 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -217,14 +217,15 @@ int PreClusterCommand::execute(){ m->mothurOutEndLine(); m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - + m->mothurCalling = true; + Command* uniqueCommand = new DeconvoluteCommand(inputString); uniqueCommand->execute(); map > filenames = uniqueCommand->getOutputFiles(); delete uniqueCommand; - + m->mothurCalling = false; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->renameFile(filenames["fasta"][0], newFastaFile); diff --git a/rarefactcommand.cpp b/rarefactcommand.cpp index 47d55a0..dabcef4 100644 --- a/rarefactcommand.cpp +++ b/rarefactcommand.cpp @@ -351,7 +351,7 @@ int RareFactCommand::execute(){ rDisplays.push_back(new RareDisplay(new NSeqs(), new ThreeColumnFile(fileNameRoot+"r_nseqs"))); outputNames.push_back(fileNameRoot+"r_nseqs"); outputTypes["r_nseqs"].push_back(fileNameRoot+"r_nseqs"); } - file2Group[outputNames.size()-1] = groups[p]; + if (inputFileNames.size() > 1) { file2Group[outputNames.size()-1] = groups[p]; } } } @@ -473,7 +473,8 @@ vector RareFactCommand::createGroupFile(vector& outputNames, map vector newFileNames; //find different types of files - map > typesFiles; + map > typesFiles; + map temp; for (int i = 0; i < outputNames.size(); i++) { string extension = m->getExtension(outputNames[i]); @@ -485,7 +486,8 @@ vector RareFactCommand::createGroupFile(vector& outputNames, map newLine += "\tGroup" + labels.substr(labels.find_first_of('\t')); - typesFiles[extension].push_back(outputNames[i]+"_"+file2Group[i]); + temp[outputNames[i]] = file2Group[i]; + typesFiles[extension] = temp; string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension; @@ -499,23 +501,23 @@ vector RareFactCommand::createGroupFile(vector& outputNames, map //for each type create a combo file map lineToNumber; - for (map >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) { + for (map >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) { ofstream out; string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first; m->openOutputFileAppend(combineFileName, out); newFileNames.push_back(combineFileName); - vector thisTypesFiles = it->second; + map thisTypesFiles = it->second; //open each type summary file map > files; //maps file name to lines in file int maxLines = 0; int numColumns = 0; - for (int i=0; i::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) { - string thisfilename = thisTypesFiles[i].substr(0, thisTypesFiles[i].find_last_of('_')); - string group = thisTypesFiles[i].substr(thisTypesFiles[i].find_last_of('_')+1); + string thisfilename = itFileNameGroup->first; + string group = itFileNameGroup->second; ifstream temp; m->openInputFile(thisfilename, temp); @@ -524,15 +526,6 @@ vector RareFactCommand::createGroupFile(vector& outputNames, map m->getline(temp); m->gobble(temp); vector thisFilesLines; - //string fileNameRoot = m->getRootName(thisTypesFiles[i]); - //map::iterator itName = nameMap.find(fileNameRoot); - //string group = ""; - //if (itName != nameMap.end()) { - // group = itName->second; - //}else { - // group = "not found" + i; - // m->mothurOut("[ERROR]: can't parse filename."); m->mothurOutEndLine(); - //} thisFilesLines.push_back(group); int count = 1; @@ -552,7 +545,7 @@ vector RareFactCommand::createGroupFile(vector& outputNames, map m->gobble(temp); } - files[thisTypesFiles[i]] = thisFilesLines; + files[thisfilename] = thisFilesLines; //save longest file for below if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); } @@ -566,17 +559,19 @@ vector RareFactCommand::createGroupFile(vector& outputNames, map for (int k = 1; k < maxLines; k++) { //grab data for each group - for (int i=0; i::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) { + + string thisfilename = itFileNameGroup->first; + map::iterator itLine = lineToNumber.find(k); if (itLine != lineToNumber.end()) { string output = toString(itLine->second); - if (k < files[thisTypesFiles[i]].size()) { - string line = files[thisTypesFiles[i]][k]; + if (k < files[thisfilename].size()) { + string line = files[thisfilename][k]; output = line.substr(0, line.find_first_of('\t')); - output += '\t' + files[thisTypesFiles[i]][0] + '\t' + line.substr(line.find_first_of('\t')); + output += '\t' + files[thisfilename][0] + '\t' + line.substr(line.find_first_of('\t')); }else{ - output += '\t' + files[thisTypesFiles[i]][0] + '\t'; + output += '\t' + files[thisfilename][0] + '\t'; for (int h = 0; h < numColumns; h++) { output += "NA\t"; } diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 09ff9b5..7ac910d 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -1358,7 +1358,7 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); } // Allocate memory for thread data. - sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension, &badSeqNames); + sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension); pDataArray.push_back(tempSum); //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier @@ -1375,6 +1375,7 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st //Close all thread handles and free memory allocations. for(int i=0; i < pDataArray.size(); i++){ num += pDataArray[i]->count; + for (set::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); } CloseHandle(hThreadArray[i]); delete pDataArray[i]; } diff --git a/screenseqscommand.h b/screenseqscommand.h index cbeed46..291d8e6 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -107,11 +107,11 @@ struct sumScreenData { int count; MothurOut* m; string goodFName, badAccnosFName, filename; - set* badSeqNames; + set badSeqNames; sumScreenData(){} - sumScreenData(int s, int e, int a, int h, int minl, int maxl, string f, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf, set* bn) { + sumScreenData(int s, int e, int a, int h, int minl, int maxl, string f, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) { startPos = s; endPos = e; minLength = minl; @@ -124,7 +124,6 @@ struct sumScreenData { m = mout; start = st; end = en; - badSeqNames = bn; count = 0; } }; @@ -233,7 +232,7 @@ static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){ } else{ badAccnosFile << currSeq.getName() << endl; - pDataArray->badSeqNames->insert(currSeq.getName()); + pDataArray->badSeqNames.insert(currSeq.getName()); } } diff --git a/sharedcommand.cpp b/sharedcommand.cpp index de48158..63f83e1 100644 --- a/sharedcommand.cpp +++ b/sharedcommand.cpp @@ -572,8 +572,8 @@ int SharedCommand::createMisMatchFile() { m->openOutputFile(outputMisMatchName, outMisMatch); - map listNames; - map::iterator itList; + set listNames; + set::iterator itList; //go through list and if group returns "not found" output it for (int i = 0; i < SharedList->getNumBins(); i++) { @@ -581,26 +581,19 @@ int SharedCommand::createMisMatchFile() { string names = SharedList->get(i); - while (names.find_first_of(',') != -1) { - string name = names.substr(0,names.find_first_of(',')); - names = names.substr(names.find_first_of(',')+1, names.length()); + vector binNames; + m->splitAtComma(names, binNames); + + for (int j = 0; j < binNames.size(); j++) { + string name = binNames[j]; string group = groupMap->getGroup(name); if(group == "not found") { outMisMatch << name << endl; } itList = listNames.find(name); if (itList != listNames.end()) { m->mothurOut(name + " is in your list file more than once. Sequence names must be unique. please correct."); m->mothurOutEndLine(); } - else { listNames[name] = name; } + else { listNames.insert(name); } } - - //get last name - string group = groupMap->getGroup(names); - if(group == "not found") { outMisMatch << names << endl; } - - itList = listNames.find(names); - if (itList != listNames.end()) { m->mothurOut(names + " is in your list file more than once. Sequence names must be unique. please correct."); m->mothurOutEndLine(); } - else { listNames[names] = names; } - } outMisMatch.close(); @@ -621,9 +614,12 @@ int SharedCommand::createMisMatchFile() { string names = SharedList->get(i); - while (names.find_first_of(',') != -1) { - string name = names.substr(0,names.find_first_of(',')); - names = names.substr(names.find_first_of(',')+1, names.length()); + vector binNames; + m->splitAtComma(names, binNames); + + for (int j = 0; j < binNames.size(); j++) { + + string name = binNames[j]; itList = namesInList.find(name); if (itList != namesInList.end()) { m->mothurOut(name + " is in your list file more than once. Sequence names must be unique. please correct."); m->mothurOutEndLine(); } @@ -631,12 +627,6 @@ int SharedCommand::createMisMatchFile() { namesInList[name] = name; } - - itList = namesInList.find(names); - if (itList != namesInList.end()) { m->mothurOut(names + " is in your list file more than once. Sequence names must be unique. please correct."); m->mothurOutEndLine(); } - - //get last name - namesInList[names] = names; } //get names of sequences in groupfile @@ -674,13 +664,12 @@ int SharedCommand::ListGroupSameSeqs() { int error = 0; vector groupMapsSeqs = groupMap->getNamesSeqs(); - + set groupNamesSeqs; for(int i = 0; i < groupMapsSeqs.size(); i++) { groupNamesSeqs.insert(groupMapsSeqs[i]); } - //go through list and if group returns "not found" output it for (int i = 0; i < SharedList->getNumBins(); i++) { if (m->control_pressed) { return 0; } diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 78bd73b..f22975e 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -67,6 +67,13 @@ ShhherCommand::ShhherCommand(){ ShhherCommand::ShhherCommand(string option) { try { + +#ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &ncpus); + + if(pid == 0){ +#endif abort = false; calledHelp = false; //allow user to run help @@ -314,6 +321,9 @@ ShhherCommand::ShhherCommand(string option) { } } +#ifdef USE_MPI + } +#endif } catch(exception& e) { m->errorOut(e, "ShhherCommand", "ShhherCommand"); @@ -328,9 +338,7 @@ int ShhherCommand::execute(){ int tag = 1976; MPI_Status status; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - MPI_Comm_size(MPI_COMM_WORLD, &ncpus); - + if(pid == 0){ for(int i=1;icontrol_pressed) { return 0; } getJointLookUp(); if (m->control_pressed) { return 0; } + vector flowFileVector; + if(flowFilesFileName != "not found"){ + string fName; + + ifstream flowFilesFile; + m->openInputFile(flowFilesFileName, flowFilesFile); + while(flowFilesFile){ + fName = m->getline(flowFilesFile); + flowFileVector.push_back(fName); + m->gobble(flowFilesFile); + } + } + else{ + flowFileVector.push_back(flowFileName); + } + int numFiles = flowFileVector.size(); for(int i=1;i 0){ qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl; - + int j=4; //need to get past the first four bases while(qualities[i][j] != -1){ - qualityFile << qualities[i][j] << ' '; - j++; + //cout << i << '\t' << j << '\t' << qualities[i][j] << endl; + qualityFile << qualities[i][j] << ' '; + j++; + } qualityFile << endl; } diff --git a/shhhseqscommand.cpp b/shhhseqscommand.cpp index f6bfd81..5c6359e 100644 --- a/shhhseqscommand.cpp +++ b/shhhseqscommand.cpp @@ -695,14 +695,15 @@ int ShhhSeqsCommand::deconvoluteResults(string fastaFile, string nameFile){ string inputString = "fasta=" + fastaFile + ", name=" + nameFile; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - + m->mothurCalling = true; + Command* uniqueCommand = new DeconvoluteCommand(inputString); uniqueCommand->execute(); map > filenames = uniqueCommand->getOutputFiles(); delete uniqueCommand; - + m->mothurCalling = false; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); string newnameFile = filenames["name"][0]; diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index c352feb..d4e2c75 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -531,14 +531,16 @@ int SubSampleCommand::getSubSampleFasta() { string inputString = "fasta=" + outputFileName; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - + m->mothurCalling = true; + Command* uniqueCommand = new DeconvoluteCommand(inputString); uniqueCommand->execute(); map > filenames = uniqueCommand->getOutputFiles(); delete uniqueCommand; - + m->mothurCalling = false; + outputTypes["name"].push_back(filenames["name"][0]); outputNames.push_back(filenames["name"][0]); m->mothurRemove(outputFileName); outputFileName = filenames["fasta"][0]; diff --git a/trialswap2.h b/trialswap2.h index ece071b..6e68e95 100644 --- a/trialswap2.h +++ b/trialswap2.h @@ -50,4 +50,6 @@ private: }; -#endif \ No newline at end of file +#endif + + diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp index 00c4d94..f2d592a 100644 --- a/trimflowscommand.cpp +++ b/trimflowscommand.cpp @@ -387,7 +387,7 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN int count = 0; bool moreSeqs = 1; - TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer); + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); while(moreSeqs) { @@ -407,7 +407,7 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN success = 0; trashCode += 'l'; } - + cout << currSeq.getName() << endl << currSeq.getUnaligned() << endl; int primerIndex = 0; int barcodeIndex = 0; @@ -542,9 +542,8 @@ void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ } else if(type == "REVERSE"){ - Sequence oligoRC("reverse", oligo); - oligoRC.reverseComplement(); - revPrimer.push_back(oligoRC.getUnaligned()); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); } else if(type == "BARCODE"){ oligosFile >> group; @@ -633,6 +632,47 @@ void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ exit(1); } } +//********************************************************************/ +string TrimFlowsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "reverseOligo"); + exit(1); + } +} + /**************************************************************************************************/ vector TrimFlowsCommand::getFlowFileBreaks() { diff --git a/trimflowscommand.h b/trimflowscommand.h index f4faacb..2594815 100644 --- a/trimflowscommand.h +++ b/trimflowscommand.h @@ -49,7 +49,8 @@ private: vector getFlowFileBreaks(); int createProcessesCreateTrim(string, string, string, string, vector >); int driverCreateTrim(string, string, string, string, vector >, linePair*); - + string reverseOligo(string); + vector outputNames; set filesToRemove; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index b0fa7a0..5d4eb39 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -1243,9 +1243,10 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< primerNameVector.push_back(group); } else if(type == "REVERSE"){ - Sequence oligoRC("reverse", oligo); - oligoRC.reverseComplement(); - revPrimer.push_back(oligoRC.getUnaligned()); + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); } else if(type == "BARCODE"){ inOligos >> group; @@ -1468,6 +1469,46 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){ } } +//********************************************************************/ +string TrimSeqsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "reverseOligo"); + exit(1); + } +} //*************************************************************************************************************** diff --git a/trimseqscommand.h b/trimseqscommand.h index 780b4c7..9ba64c4 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -51,6 +51,7 @@ private: bool cullLength(Sequence&); bool cullHomoP(Sequence&); bool cullAmbigs(Sequence&); + string reverseOligo(string); bool abort, createGroup; string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir; -- 2.39.2