A7E9B98D12D37EC400DA6239 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87C12D37EC400DA6239 /* weighted.cpp */; };
A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */; };
A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87F12D37EC400DA6239 /* whittaker.cpp */; };
+ A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */; };
A7FA10021302E097003860FE /* mantelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FA10011302E096003860FE /* mantelcommand.cpp */; };
A7FC480E12D788F20055BC5C /* linearalgebra.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FC480D12D788F20055BC5C /* linearalgebra.cpp */; };
A7FC486712D795D60055BC5C /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FC486612D795D60055BC5C /* pcacommand.cpp */; };
A7FE7C401330EA1000F7B327 /* getcurrentcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FE7C3F1330EA1000F7B327 /* getcurrentcommand.cpp */; };
A7FE7E6D13311EA400F7B327 /* setcurrentcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FE7E6C13311EA400F7B327 /* setcurrentcommand.cpp */; };
+ A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FF19F1140FFDA500AD216D /* trimoligos.cpp */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = SOURCE_ROOT; };
A7E9B87F12D37EC400DA6239 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = "<group>"; };
A7E9B88012D37EC400DA6239 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = "<group>"; };
+ A7F9F5CD141A5E500032F693 /* sequenceparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequenceparser.h; sourceTree = "<group>"; };
+ A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequenceparser.cpp; sourceTree = "<group>"; };
A7FA10001302E096003860FE /* mantelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mantelcommand.h; sourceTree = "<group>"; };
A7FA10011302E096003860FE /* mantelcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mantelcommand.cpp; sourceTree = "<group>"; };
A7FC480C12D788F20055BC5C /* linearalgebra.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = linearalgebra.h; sourceTree = "<group>"; };
A7FE7C3F1330EA1000F7B327 /* getcurrentcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcurrentcommand.cpp; sourceTree = "<group>"; };
A7FE7E6B13311EA400F7B327 /* setcurrentcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = setcurrentcommand.h; sourceTree = "<group>"; };
A7FE7E6C13311EA400F7B327 /* setcurrentcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = setcurrentcommand.cpp; sourceTree = "<group>"; };
+ A7FF19F0140FFDA500AD216D /* trimoligos.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trimoligos.h; sourceTree = "<group>"; };
+ A7FF19F1140FFDA500AD216D /* trimoligos.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trimoligos.cpp; sourceTree = "<group>"; };
C6A0FF2C0290799A04C91782 /* mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = mothur.1; sourceTree = "<group>"; };
/* End PBXFileReference section */
A7E9B82D12D37EC400DA6239 /* singlelinkage.cpp */,
A7E9B83012D37EC400DA6239 /* slibshuff.cpp */,
A7E9B83112D37EC400DA6239 /* slibshuff.h */,
+ A7FF19F0140FFDA500AD216D /* trimoligos.h */,
+ A7FF19F1140FFDA500AD216D /* trimoligos.cpp */,
A7E9B87412D37EC400DA6239 /* validcalculator.cpp */,
A7E9B87512D37EC400DA6239 /* validcalculator.h */,
A7E9B87612D37EC400DA6239 /* validparameter.cpp */,
A7E9B7E412D37EC400DA6239 /* sffinfocommand.h */,
A7E9B7E312D37EC400DA6239 /* sffinfocommand.cpp */,
A7E9B7F312D37EC400DA6239 /* sharedcommand.h */,
- A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */,
A7E9B82812D37EC400DA6239 /* shhhercommand.h */,
A7E9B82712D37EC400DA6239 /* shhhercommand.cpp */,
+ A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */,
A7E9B84012D37EC400DA6239 /* splitabundcommand.h */,
A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */,
A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */,
A7E9B7DC12D37EC400DA6239 /* sequence.hpp */,
A7E9B7DD12D37EC400DA6239 /* sequencedb.cpp */,
A7E9B7DE12D37EC400DA6239 /* sequencedb.h */,
+ A7F9F5CD141A5E500032F693 /* sequenceparser.h */,
+ A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */,
A7E9B80412D37EC400DA6239 /* sharedlistvector.cpp */,
A7E9B80512D37EC400DA6239 /* sharedlistvector.h */,
A7E9B80E12D37EC400DA6239 /* sharedordervector.h */,
A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */,
A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */,
A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */,
+ A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */,
+ A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
#ifdef USE_MPI
int pid, numSeqsPerProcessor;
int tag = 2001;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPIWroteAccnos = false;
MPI_Status status;
#else
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
positions = m->divideFile(candidateFileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); }
delete candidateSeq;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (inFASTA.eof()) { break; }
}
//**********************************************************************************************************************
#ifdef USE_MPI
-int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector<unsigned long int>& MPIPos){
+int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector<unsigned long long>& MPIPos){
try {
string outputString = "";
MPI_Status statusReport;
private:
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<int> processIDS; //processid
vector<linePair*> lines;
void appendReportFiles(string, string);
#ifdef USE_MPI
- int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+ int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
#endif
string candidateFileName, templateFileName, distanceFileName, search, align, outputDir;
string align;
string search;
bool flip;
- unsigned long int start;
- unsigned long int end;
+ unsigned long long start;
+ unsigned long long end;
MothurOut* m;
//AlignmentDB* templateDB;
float match, misMatch, gapOpen, gapExtend, threshold;
int count, kmerSize, threadID;
alignData(){}
- alignData(string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long int st, unsigned long int en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) {
+ alignData(string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long long st, unsigned long long en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) {
alignFName = a;
reportFName = r;
accnosFName = ac;
#ifdef USE_MPI
int pid, processors;
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
MPI_Status status;
MPI_File inMPI;
//link designMap to rows/columns in distance matrix
map<string, vector<int> > origGroupSampleMap;
for(int i=0;i<sampleNames.size();i++){
- origGroupSampleMap[designMap->getGroup(sampleNames[i])].push_back(i);
+ string group = designMap->getGroup(sampleNames[i]);
+
+ if (group == "not found") {
+ m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }else { origGroupSampleMap[group].push_back(i); }
+
}
int numGroups = origGroupSampleMap.size();
+ if (m->control_pressed) { delete designMap; return 0; }
+
//create a new filename
ofstream AMOVAFile;
string AMOVAFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName)) + "amova";
//link designMap to rows/columns in distance matrix
map<string, vector<int> > origGroupSampleMap;
for(int i=0;i<sampleNames.size();i++){
- origGroupSampleMap[designMap->getGroup(sampleNames[i])].push_back(i);
+ string group = designMap->getGroup(sampleNames[i]);
+
+ if (group == "not found") {
+ m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }else { origGroupSampleMap[group].push_back(i); }
}
int numGroups = origGroupSampleMap.size();
+ if (m->control_pressed) { delete designMap; return 0; }
+
//create a new filename
ofstream ANOSIMFile;
string ANOSIMFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName)) + "anosim";
threadID = tid;
string baseName = tempFile;
+
if (baseName == "saved") { baseName = rdb->getSavedReference(); }
string baseTName = tfile;
}
//if you want to save, but you dont need to calculate then just read
- if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood) {
+ if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood && (tempFile != "saved")) {
ifstream saveIn;
m->openInputFile(tempFile, saveIn);
#ifdef USE_MPI
int pid, num, num2, processors;
- vector<unsigned long int> positions;
- vector<unsigned long int> positions2;
+ vector<unsigned long long> positions;
+ vector<unsigned long long> positions2;
MPI_Status status;
MPI_File inMPI;
string line = m->getline(in); m->gobble(in);
in >> numKmers; m->gobble(in);
-
+ //cout << threadID << '\t' << line << '\t' << numKmers << &in << '\t' << &inNum << '\t' << genusNodes.size() << endl;
//initialze probabilities
wordGenusProb.resize(numKmers);
//read version
string line2 = m->getline(inNum); m->gobble(inNum);
-
+ //cout << threadID << '\t' << line2 << '\t' << this << endl;
while (inNum) {
inNum >> zeroCountProb[count] >> num[count];
count++;
m->gobble(inNum);
+ //cout << threadID << '\t' << count << endl;
}
inNum.close();
-
+ //cout << threadID << '\t' << "here1 " << &wordGenusProb << '\t' << &num << endl; //
+ //cout << threadID << '\t' << &genusTotals << '\t' << endl;
+ //cout << threadID << '\t' << genusNodes.size() << endl;
while(in) {
in >> kmer;
-
+ //cout << threadID << '\t' << kmer << endl;
//set them all to zero value
for (int i = 0; i < genusNodes.size(); i++) {
wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
}
-
+ //cout << threadID << '\t' << num[kmer] << "here" << endl;
//get probs for nonzero values
for (int i = 0; i < num[kmer]; i++) {
in >> name >> prob;
m->gobble(in);
}
in.close();
-
+ //cout << threadID << '\t' << "here" << endl;
#endif
}
catch(exception& e) {
numSeqsPerProcessor = iters / processors;
//each process hits this only once
- unsigned long int startPos = pid * numSeqsPerProcessor;
+ unsigned long long startPos = pid * numSeqsPerProcessor;
if(pid == processors - 1){
numSeqsPerProcessor = iters - pid * numSeqsPerProcessor;
}
int numSeqsPerProcessor = iters / processors;
for (int i = 0; i < processors; i++) {
- unsigned long int startPos = i * numSeqsPerProcessor;
+ unsigned long long startPos = i * numSeqsPerProcessor;
if(i == processors - 1){
numSeqsPerProcessor = iters - i * numSeqsPerProcessor;
}
private:
struct linePair {
- unsigned long int start;
+ unsigned long long start;
int num;
- linePair(unsigned long int i, int j) : start(i), num(j) {}
+ linePair(unsigned long long i, int j) : start(i), num(j) {}
};
vector<linePair> lines;
#ifdef USE_MPI
int pid, processors;
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
int numSeqs;
int tag = 2001;
};
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
linePair(){}
};
int pid, numSeqsPerProcessor;
int tag = 2001;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPI_Status status;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
outHeader.close();
- vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
+ vector<unsigned long long> positions = m->divideFile(fastaFileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
delete candidateSeq;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (inFASTA.eof()) { break; }
}
//**********************************************************************************************************************
#ifdef USE_MPI
-int ChimeraCcodeCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long int>& MPIPos){
+int ChimeraCcodeCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long long>& MPIPos){
try {
MPI_Status status;
private:
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<int> processIDS; //processid
vector<linePair*> lines;
int createProcesses(string, string, string);
#ifdef USE_MPI
- int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+ int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
#endif
bool abort, filter, save;
int pid, numSeqsPerProcessor;
int tag = 2001;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPI_Status status;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
- vector<unsigned long int> positions = m->divideFile(fastaFileNames[i], processors);
+ vector<unsigned long long> positions = m->divideFile(fastaFileNames[i], processors);
for (int s = 0; s < (positions.size()-1); s++) {
lines.push_back(new linePair(positions[s], positions[(s+1)]));
delete candidateSeq;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (inFASTA.eof()) { break; }
}
//**********************************************************************************************************************
#ifdef USE_MPI
-int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long int>& MPIPos){
+int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos){
try {
MPI_File outAccMPI;
MPI_Status status;
private:
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<int> processIDS; //processid
int createProcesses(string, string);
#ifdef USE_MPI
- int driverMPI(int, int, MPI_File&, MPI_File&, vector<unsigned long int>&);
+ int driverMPI(int, int, MPI_File&, MPI_File&, vector<unsigned long long>&);
#endif
bool abort, svg, save;
#ifdef USE_MPI
int pid, numSeqsPerProcessor;
int tag = 2001;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPI_Status status;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
MPI_File_close(&outMPIAccnos);
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
- vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
+ vector<unsigned long long> positions = m->divideFile(fastaFileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
delete candidateSeq;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (inFASTA.eof()) { break; }
}
//**********************************************************************************************************************
#ifdef USE_MPI
-int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long int>& MPIPos){
+int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long long>& MPIPos){
try {
MPI_Status status;
ReferenceDB* rdb;
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<int> processIDS; //processid
int createProcesses(string, string, string);
#ifdef USE_MPI
- int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+ int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
#endif
bool abort, filter, save;
string trimFastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.fasta";
//create chimera here if you are mac or linux because fork will copy for you. Create in create processes instead if you are windows.
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || USE_MPI
if (templatefile != "self") { //you want to run slayer with a reference template
chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand());
}else {
#ifdef USE_MPI
int pid, numSeqsPerProcessor;
int tag = 2001;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPI_Status status;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
#else
//break up file
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
positions = m->divideFile(fastaFileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); }
#endif
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || USE_MPI
delete chimera;
#endif
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (inFASTA.eof()) { break; }
}
//**********************************************************************************************************************
#ifdef USE_MPI
-int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector<unsigned long int>& MPIPos){
+int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector<unsigned long long>& MPIPos){
try {
MPI_Status status;
int pid;
private:
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<int> processIDS; //processid
map<string, int> sortFastaFile(string, string);
#ifdef USE_MPI
- int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+ int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
#endif
bool abort, realign, trim, trimera, save;
string blastlocation;
bool trimera;
bool trim, realign;
- unsigned long int start;
- unsigned long int end;
+ unsigned long long start;
+ unsigned long long end;
int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted;
MothurOut* m;
float divR;
int threadId;
slayerData(){}
- slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long int st, unsigned long int en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
+ slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
outputFName = o;
fasta = fa;
accnos = ac;
m->mothurOut("Generating search database... "); cout.flush();
#ifdef USE_MPI
int pid, processors;
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
int tag = 2001;
MPI_Status status;
#ifdef USE_MPI
int pid, num, processors;
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
int tag = 2001;
MPI_Status status;
*/
#include "classifyseqscommand.h"
-#include "sequence.hpp"
-#include "bayesian.h"
-#include "phylotree.h"
-#include "phylosummary.h"
-#include "knn.h"
temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; }
save = m->isTrue(temp);
+ //this is so the threads can quickly load the reference data
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #else
+ if (processors != 1) { save = true; }
+ #endif
rdb->save = save;
if (save) { //clear out old references
rdb->clearMemory();
}
if (m->control_pressed) { delete classify; return 0; }
-
for (int s = 0; s < fastaFileNames.size(); s++) {
#ifdef USE_MPI
int pid, numSeqsPerProcessor;
int tag = 2001;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPI_Status status;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
- if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; }
+ if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; }
if (pid == 0) { //you are the root process
#else
- vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
+ vector<unsigned long long> positions;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ positions = m->divideFile(fastaFileNames[s], processors);
+ for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); }
+#else
+ if (processors == 1) {
+ lines.push_back(new linePair(0, 1000));
+ }else {
+ positions = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs);
- for (int i = 0; i < (positions.size()-1); i++) {
- lines.push_back(new linePair(positions[i], positions[(i+1)]));
- }
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+ }
+#endif
if(processors == 1){
numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
- }
- else{
- processIDS.resize(0);
-
+ }else{
numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
-
}
- #else
- numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
- #endif
#endif
m->mothurOutEndLine();
}
delete classify;
+
return 0;
}
catch(exception& e) {
int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
try {
+
+ int num = 0;
+ processIDS.clear();
+
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
int process = 1;
- int num = 0;
//loop through and create all the processes you want
while (process != processors) {
if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
in.close(); m->mothurRemove(m->getFullPathName(tempFile));
}
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the alignData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<classifyData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+ // Allocate memory for thread data.
+ string extension = "";
+ if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
+
+ classifyData* tempclass = new classifyData(probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i);
+ pDataArray.push_back(tempclass);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i] = CreateThread(NULL, 0, MyClassThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+
+ }
+
+ //parent does its part
+ num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", filename);
+ processIDS.push_back((processors-1));
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+ #endif
for(int i=0;i<processIDS.size();i++){
appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
}
return num;
-#endif
+
}
catch(exception& e) {
m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
delete candidateSeq;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (inFASTA.eof()) { break; }
}
//**********************************************************************************************************************
#ifdef USE_MPI
-int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long int>& MPIPos){
+int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long long>& MPIPos){
try {
MPI_Status statusNew;
MPI_Status statusTemp;
#include "command.hpp"
#include "classify.h"
#include "referencedb.h"
+#include "sequence.hpp"
+#include "bayesian.h"
+#include "phylotree.h"
+#include "phylosummary.h"
+#include "knn.h"
+
//KNN and Bayesian methods modeled from algorithms in
//Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences
private:
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<int> processIDS; //processid
int MPIReadNamesFile(string);
#ifdef USE_MPI
- int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+ int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
#endif
};
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+typedef struct classifyData {
+ string taxFName;
+ string tempTFName;
+ string filename;
+ string search, taxonomyFileName, templateFileName, method;
+ unsigned long long start;
+ unsigned long long end;
+ MothurOut* m;
+ float match, misMatch, gapOpen, gapExtend;
+ int count, kmerSize, threadID, cutoff, iters, numWanted;
+ bool probs;
+
+ classifyData(){}
+ classifyData(bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid) {
+ taxonomyFileName = tx;
+ templateFileName = te;
+ taxFName = a;
+ tempTFName = r;
+ filename = f;
+ search = se;
+ method = me;
+ m = mout;
+ start = st;
+ end = en;
+ match = ma;
+ misMatch = misMa;
+ gapOpen = gapO;
+ gapExtend = gapE;
+ kmerSize = ks;
+ cutoff = cut;
+ iters = i;
+ numWanted = numW;
+ threadID = tid;
+ probs = p;
+ count = 0;
+ }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){
+ classifyData* pDataArray;
+ pDataArray = (classifyData*)lpParam;
+
+ try {
+ ofstream outTax;
+ pDataArray->m->openOutputFile(pDataArray->taxFName, outTax);
+
+ ofstream outTaxSimple;
+ pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple);
+
+ ifstream inFASTA;
+ pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
+
+ string taxonomy;
+
+ //print header if you are process 0
+ if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
+ inFASTA.seekg(0);
+ }else { //this accounts for the difference in line endings.
+ inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
+ }
+
+ pDataArray->count = pDataArray->end;
+
+ //make classify
+ Classify* myclassify;
+ if(pDataArray->method == "bayesian"){ myclassify = new Bayesian("saved", "saved", pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID); }
+ else if(pDataArray->method == "knn"){ myclassify = new Knn("saved", "saved", pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); }
+ else {
+ pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
+ pDataArray->m->mothurOutEndLine();
+ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID);
+ }
+
+ if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
+
+ int count = 0;
+ for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+
+ if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
+
+ Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
+
+ if (candidateSeq->getName() != "") {
+
+ taxonomy = myclassify->getTaxonomy(candidateSeq);
+
+ if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; }
+
+ if (taxonomy != "bad seq") {
+ //output confidence scores or not
+ if (pDataArray->probs) {
+ outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+ }else{
+ outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
+ }
+
+ outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
+ }
+ count++;
+ }
+ delete candidateSeq;
+ //report progress
+ if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
+
+ }
+ //report progress
+ if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
+
+ delete myclassify;
+ inFASTA.close();
+ outTax.close();
+ outTaxSimple.close();
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "ClassifySeqsCommand", "MyClassThreadFunction");
+ exit(1);
+ }
+}
+#endif
+
+
+
+
#endif
cout.flush();
print_start = false;
}
-
+
if(previousDist <= 0.0000){
printData("unique");
}
string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
string newFastaFile = fileroot + "fragclust.fasta";
- string newNamesFile = fileroot + "names";
+ string newNamesFile = fileroot + "fragclust.names";
if (m->control_pressed) { return 0; }
map<string, string>::iterator itStrings;
set<string> nameInFastaFile; //for sanity checking
set<string>::iterator itname;
+ vector<string> nameFileOrder;
int count = 0;
while (!in.eof()) {
m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine();
}else {
sequenceStrings[seq.getAligned()] = itNames->second;
+ nameFileOrder.push_back(seq.getAligned());
}
- }else { sequenceStrings[seq.getAligned()] = seq.getName(); }
+ }else { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); }
}else { //this is a dup
if (oldNameMapFName != "") {
itNames = nameMap.find(seq.getName());
ofstream outNames;
m->openOutputFile(outNameFile, outNames);
- for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) {
+ for (int i = 0; i < nameFileOrder.size(); i++) {
+ //for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) {
if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); m->mothurRemove(outNameFile); return 0; }
- //get rep name
- int pos = (itStrings->second).find_first_of(',');
+ itStrings = sequenceStrings.find(nameFileOrder[i]);
- if (pos == string::npos) { // only reps itself
- outNames << itStrings->second << '\t' << itStrings->second << endl;
- }else {
- outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl;
- }
+ if (itStrings != sequenceStrings.end()) {
+ //get rep name
+ int pos = (itStrings->second).find_first_of(',');
+
+ if (pos == string::npos) { // only reps itself
+ outNames << itStrings->second << '\t' << itStrings->second << endl;
+ }else {
+ outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl;
+ }
+ }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; }
}
outNames.close();
//do your part
string outputMyPart;
- unsigned long int mySize;
+ unsigned long long mySize;
if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
else { driverMPI(start, end, outputFile, mySize, output); }
//wait on chidren
for(int b = 1; b < processors; b++) {
- unsigned long int fileSize;
+ unsigned long long fileSize;
if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); delete distCalculator; return 0; }
MPI_File_close(&outMPI);
}else { //you are a child process
//do your part
- unsigned long int size;
+ unsigned long long size;
if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
else { driver(0, numSeqs, outputFile, "square"); }
}else{ //you have multiple processors
- unsigned long int numDists = 0;
+ unsigned long long numDists = 0;
if (output == "square") {
numDists = numSeqs * numSeqs;
}
/**************************************************************************************************/
/////// need to fix to work with calcs and sequencedb
-int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
+int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size){
try {
ValidCalculators validCalculator;
Dist* distCalculator;
}
/**************************************************************************************************/
/////// need to fix to work with calcs and sequencedb
-int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
+int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size, string square){
try {
ValidCalculators validCalculator;
Dist* distCalculator;
#ifdef USE_MPI
int driverMPI(int, int, MPI_File&, float);
- int driverMPI(int, int, string, unsigned long int&);
- int driverMPI(int, int, string, unsigned long int&, string);
+ int driverMPI(int, int, string, unsigned long long&);
+ int driverMPI(int, int, string, unsigned long long&, string);
#endif
//int convertMatrix(string);
#ifdef USE_MPI
int pid, numSeqsPerProcessor, num;
int tag = 2001;
- vector<unsigned long int>MPIPos;
+ vector<unsigned long long>MPIPos;
MPI_Status status;
MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
- vector<unsigned long int> positions = m->divideFile(fastafileNames[s], processors);
+ vector<unsigned long long> positions = m->divideFile(fastafileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
}
#ifdef USE_MPI
/**************************************************************************************/
-int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long int>& MPIPos) {
+int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos) {
try {
string outputString = "";
int count = 0;
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = in.tellg();
+ unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (in.eof()) { break; }
#ifdef USE_MPI
int pid, numSeqsPerProcessor, num;
int tag = 2001;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPI_Status status;
MPI_File inMPI;
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
- vector<unsigned long int> positions = m->divideFile(fastafileNames[s], processors);
+ vector<unsigned long long> positions = m->divideFile(fastafileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = in.tellg();
+ unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (in.eof()) { break; }
}
#ifdef USE_MPI
/**************************************************************************************/
-int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& inMPI, vector<unsigned long int>& MPIPos) {
+int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& inMPI, vector<unsigned long long>& MPIPos) {
try {
MPI_Status status;
\r
private:\r
struct linePair {\r
- unsigned long int start;\r
- unsigned long int end;\r
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}\r
+ unsigned long long start;\r
+ unsigned long long end;\r
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}\r
};\r
\r
vector<linePair*> lines;\r
int driverRunFilter(string, string, string, linePair*);\r
int driverCreateFilter(Filters& F, string filename, linePair* line);\r
#ifdef USE_MPI\r
- int driverMPIRun(int, int, MPI_File&, MPI_File&, vector<unsigned long int>&);\r
- int MPICreateFilter(int, int, Filters&, MPI_File&, vector<unsigned long int>&); \r
+ int driverMPIRun(int, int, MPI_File&, MPI_File&, vector<unsigned long long>&);\r
+ int MPICreateFilter(int, int, Filters&, MPI_File&, vector<unsigned long long>&); \r
#endif\r
\r
};\r
bool FlowData::getNext(ifstream& flowFile){
try {
-
- string lengthString;
- string flowString;
-
- flowFile >> seqName >> endFlow;
- for(int i=0;i<numFlows;i++) { flowFile >> flowData[i]; }
-
- updateEndFlow();
+ flowFile >> seqName >> endFlow;
+ //cout << "in Flowdata " + seqName << endl;
+ for(int i=0;i<numFlows;i++) { flowFile >> flowData[i]; }
+ //cout << "in Flowdata read " << seqName + " done" << endl;
+ updateEndFlow();
translateFlow();
m->gobble(flowFile);
//link designMap to rows/columns in distance matrix
map<string, vector<int> > origGroupSampleMap;
for(int i=0;i<sampleNames.size();i++){
- origGroupSampleMap[designMap->getGroup(sampleNames[i])].push_back(i);
+ string group = designMap->getGroup(sampleNames[i]);
+
+ if (group == "not found") {
+ m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }else { origGroupSampleMap[group].push_back(i); }
}
int numGroups = origGroupSampleMap.size();
+ if (m->control_pressed) { delete designMap; return 0; }
+
//create a new filename
ofstream HOMOVAFile;
string HOMOVAFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName)) + "homova";
USEREADLINE ?= yes
CYGWIN_BUILD ?= no
USECOMPRESSION ?= no
-MOTHUR_FILES="\"Enter_your_default_path_here\""
+MOTHUR_FILES="\"/Users/Sarahswork/desktop/release\""
RELEASE_DATE = "\"7/25/2011\""
VERSION = "\"1.21.0\""
# Optimize to level 3:
-CXXFLAGS += -O3
+CXXFLAGS += -03 -g
ifeq ($(strip $(64BIT_VERSION)),yes)
#if you are using centos uncomment the following lines
}
}
/**************************************************************************************************/
-vector<unsigned long int> MothurOut::setFilePosFasta(string filename, int& num) {
+vector<unsigned long long> MothurOut::setFilePosFasta(string filename, int& num) {
try {
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
ifstream inFASTA;
- openInputFile(filename, inFASTA);
+ //openInputFile(filename, inFASTA);
+ inFASTA.open(filename.c_str(), ios::binary);
string input;
+ unsigned long long count = 0;
while(!inFASTA.eof()){
- input = getline(inFASTA);
- if (input.length() != 0) {
- if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
+ //input = getline(inFASTA);
+ //cout << input << '\t' << inFASTA.tellg() << endl;
+ //if (input.length() != 0) {
+ // if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); cout << (pos - input.length() - 1) << endl; }
+ //}
+ //gobble(inFASTA); //has to be here since windows line endings are 2 characters and mess up the positions
+ char c = inFASTA.get(); count++;
+ if (c == '>') {
+ positions.push_back(count-1);
+ //cout << count << endl;
}
- gobble(inFASTA); //has to be here since windows line endings are 2 characters and mess up the positions
}
inFASTA.close();
fclose (pFile);
}*/
- unsigned long int size = positions[(positions.size()-1)];
+ unsigned long long size = positions[(positions.size()-1)];
ifstream in;
openInputFile(filename, in);
in.close();
positions.push_back(size);
+ positions[0] = 0;
return positions;
}
}
}
/**************************************************************************************************/
-vector<unsigned long int> MothurOut::setFilePosEachLine(string filename, int& num) {
+vector<unsigned long long> MothurOut::setFilePosEachLine(string filename, int& num) {
try {
filename = getFullPathName(filename);
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
ifstream in;
- openInputFile(filename, in);
-
+ //openInputFile(filename, in);
+ in.open(filename.c_str(), ios::binary);
+
string input;
+ unsigned long long count = 0;
+ positions.push_back(0);
+
while(!in.eof()){
- unsigned long int lastpos = in.tellg();
- input = getline(in);
- if (input.length() != 0) {
- unsigned long int pos = in.tellg();
- if (pos != -1) { positions.push_back(pos - input.length() - 1); }
- else { positions.push_back(lastpos); }
+ //unsigned long long lastpos = in.tellg();
+ //input = getline(in);
+ //if (input.length() != 0) {
+ //unsigned long long pos = in.tellg();
+ //if (pos != -1) { positions.push_back(pos - input.length() - 1); }
+ //else { positions.push_back(lastpos); }
+ //}
+ //gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions
+
+
+ //getline counting reads
+ char d = in.get(); count++;
+ while ((d != '\n') && (d != '\r') && (d != '\f') && (d != in.eof())) {
+ //get next character
+ d = in.get();
+ count++;
+ }
+
+ if (!in.eof()) {
+ d=in.get(); count++;
+ while(isspace(d) && (d != in.eof())) { d=in.get(); count++;}
}
- gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions
+ positions.push_back(count-1);
+ cout << count-1 << endl;
}
in.close();
- num = positions.size();
+ num = positions.size()-1;
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (filename.c_str(),"rb");
fclose (pFile);
}
- positions.push_back(size);
+ positions[(positions.size()-1)] = size;
return positions;
}
}
/**************************************************************************************************/
-vector<unsigned long int> MothurOut::divideFile(string filename, int& proc) {
+vector<unsigned long long> MothurOut::divideFile(string filename, int& proc) {
try{
- vector<unsigned long int> filePos;
+ vector<unsigned long long> filePos;
filePos.push_back(0);
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
filename = getFullPathName(filename);
}
//estimate file breaks
- unsigned long int chunkSize = 0;
+ unsigned long long chunkSize = 0;
chunkSize = size / proc;
//file to small to divide by processors
//for each process seekg to closest file break and search for next '>' char. make that the filebreak
for (int i = 0; i < proc; i++) {
- unsigned long int spot = (i+1) * chunkSize;
+ unsigned long long spot = (i+1) * chunkSize;
ifstream in;
openInputFile(filename, in);
in.seekg(spot);
//look for next '>'
- unsigned long int newSpot = spot;
+ unsigned long long newSpot = spot;
while (!in.eof()) {
char c = in.get();
if (c == '>') { in.putback(c); newSpot = in.tellg(); break; }
}
//there was not another sequence before the end of the file
- unsigned long int sanityPos = in.tellg();
+ unsigned long long sanityPos = in.tellg();
if (sanityPos == -1) { break; }
else { filePos.push_back(newSpot); }
int MothurOut::divideFile(string filename, int& proc, vector<string>& files) {
try{
- vector<unsigned long int> filePos = divideFile(filename, proc);
+ vector<unsigned long long> filePos = divideFile(filename, proc);
for (int i = 0; i < (filePos.size()-1); i++) {
ifstream in;
openInputFile(filename, in);
in.seekg(filePos[i]);
- unsigned long int size = filePos[(i+1)] - filePos[i];
+ unsigned long long size = filePos[(i+1)] - filePos[i];
char* chunk = new char[size];
in.read(chunk, size);
in.close();
//functions from mothur.h
//file operations
- vector<unsigned long int> divideFile(string, int&);
+ vector<unsigned long long> divideFile(string, int&);
int divideFile(string, int&, vector<string>&);
- vector<unsigned long int> setFilePosEachLine(string, int&);
- vector<unsigned long int> setFilePosFasta(string, int&);
+ vector<unsigned long long> setFilePosEachLine(string, int&);
+ vector<unsigned long long> setFilePosFasta(string, int&);
string sortFile(string, string);
void appendFiles(string, string);
int renameFile(string, string); //oldname, newname
//do your part
string outputMyPart;
- unsigned long int mySize;
+ unsigned long long mySize;
if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
else { driverMPI(start, end, outputFile, mySize, output); }
//wait on chidren
for(int b = 1; b < processors; b++) {
- unsigned long int fileSize;
+ unsigned long long fileSize;
if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); delete distCalculator; return 0; }
MPI_File_close(&outMPI);
}else { //you are a child process
//do your part
- unsigned long int size;
+ unsigned long long size;
if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
}
/**************************************************************************************************/
/////// need to fix to work with calcs and sequencedb
-int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
+int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size){
try {
MPI_Status status;
}
/**************************************************************************************************/
/////// need to fix to work with calcs and sequencedb
-int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
+int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size, string square){
try {
MPI_Status status;
#ifdef USE_MPI
int driverMPI(int, int, MPI_File&, float);
- int driverMPI(int, int, string, unsigned long int&);
- int driverMPI(int, int, string, unsigned long int&, string);
+ int driverMPI(int, int, string, unsigned long long&);
+ int driverMPI(int, int, string, unsigned long long&, string);
#endif
string fastaFileName, align, calc, outputDir, output;
#ifdef USE_MPI
int pid, num, processors;
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
MPI_Status status;
MPI_File inMPI;
*/
#include "preclustercommand.h"
+#include "sequenceparser.h"
+#include "deconvolutecommand.h"
//**********************************************************************************************************************
inline bool comparePriority(seqPNode first, seqPNode second) { return (first.numIdentical > second.numIdentical); }
try {
CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+ CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs);
+ CommandParameter pbygroup("bygroup", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pbygroup);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
helpString += "The pre.cluster command outputs a new fasta and name file.\n";
helpString += "The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n";
helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
+ helpString += "The group parameter allows you to provide a group file. \n";
helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
+ helpString += "The bygroup parameter allows you to indicate you whether you want to cluster sequences only within each group, default=T, when a groupfile is given. \n";
helpString += "The pre.cluster command should be in the following format: \n";
helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["name"] = inputDir + it->second; }
}
+
+ it = parameters.find("group");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["group"] = inputDir + it->second; }
+ }
}
//check for required parameters
namefile = validParameter.validFile(parameters, "name", true);
if (namefile == "not found") { namefile = ""; }
else if (namefile == "not open") { abort = true; }
- else { readNameFile(); m->setNameFile(namefile); }
+ else { m->setNameFile(namefile); }
+
+ groupfile = validParameter.validFile(parameters, "group", true);
+ if (groupfile == "not found") { groupfile = ""; }
+ else if (groupfile == "not open") { abort = true; groupfile = ""; }
+ else { m->setGroupFile(groupfile); }
- string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; }
+ string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; }
convert(temp, diffs);
+
+ temp = validParameter.validFile(parameters, "bygroup", false);
+ if (temp == "not found") {
+ if (groupfile != "") { temp = "T"; }
+ else { temp = "F"; }
+ }
+ bygroup = m->isTrue(temp);
+
+ if (bygroup && (groupfile == "")) { m->mothurOut("You cannot set bygroup=T, unless you provide a groupfile."); m->mothurOutEndLine(); abort=true; }
+
}
}
int start = time(NULL);
- //reads fasta file and return number of seqs
- int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
+ string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
+ string newFastaFile = fileroot + "precluster" + m->getExtension(fastafile);
+ string newNamesFile = fileroot + "precluster.names";
- if (m->control_pressed) { return 0; }
-
- if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0; }
- if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; }
+ if (bygroup) {
+ //clear out old files
+ ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close();
+ ofstream outNames; m->openOutputFile(newNamesFile, outNames); outNames.close();
+
+ //parse fasta and name file by group
+ SequenceParser* parser;
+ if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); }
+ else { parser = new SequenceParser(groupfile, fastafile); }
+
+ vector<string> groups = parser->getNamesOfGroups();
+
+ //precluster each group
+ for (int i = 0; i < groups.size(); i++) {
+
+ start = time(NULL);
+
+ if (m->control_pressed) { delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
+
+ m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
+
+ map<string, string> thisNameMap;
+ if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); }
+ vector<Sequence> thisSeqs = parser->getSeqs(groups[i]);
+
+ //fill alignSeqs with this groups info.
+ int numSeqs = loadSeqs(thisNameMap, thisSeqs);
+
+ if (m->control_pressed) { delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
+
+ if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; }
+
+ int count = process();
+
+ if (m->control_pressed) { delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
+
+ m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
+ m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+ printData(newFastaFile, newNamesFile);
+
+ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
+
+ }
+
+ //run unique.seqs for deconvolute results
+ string inputString = "fasta=" + newFastaFile;
+ if (namefile != "") { inputString += ", name=" + newNamesFile; }
+ m->mothurOutEndLine();
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+ m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
+
+ Command* uniqueCommand = new DeconvoluteCommand(inputString);
+ uniqueCommand->execute();
+
+ map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+
+ delete uniqueCommand;
+
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+
+ newNamesFile = filenames["name"][0];
+ newFastaFile = filenames["fasta"][0];
+
+ }else {
+ if (namefile != "") { readNameFile(); }
+
+ //reads fasta file and return number of seqs
+ int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
- //clear sizes since you only needed this info to build the alignSeqs seqPNode structs
-// sizes.clear();
+ if (m->control_pressed) { return 0; }
+ if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0; }
+ if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; }
+
+ int count = process();
+
+ if (m->control_pressed) { return 0; }
+
+ m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
+ m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+ printData(newFastaFile, newNamesFile);
+
+ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
+ }
+
+ if (m->control_pressed) { m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
+
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ m->mothurOut(newFastaFile); m->mothurOutEndLine(); outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
+ m->mothurOut(newNamesFile); m->mothurOutEndLine(); outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile);
+ m->mothurOutEndLine();
+
+ //set fasta file as new current fastafile
+ string current = "";
+ itTypes = outputTypes.find("fasta");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
+ }
+
+ itTypes = outputTypes.find("name");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PreClusterCommand", "execute");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int PreClusterCommand::process(){
+ try {
+
//sort seqs by number of identical seqs
sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
-
+
int count = 0;
-
+ int numSeqs = alignSeqs.size();
+
//think about running through twice...
for (int i = 0; i < numSeqs; i++) {
//merge
alignSeqs[i].names += ',' + alignSeqs[j].names;
alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
-
+
alignSeqs[j].active = 0;
alignSeqs[j].numIdentical = 0;
count++;
}
}//end if j active
}//end if i != j
-
- //remove from active list
+
+ //remove from active list
alignSeqs[i].active = 0;
}//end if active i
if(i % 100 == 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
}
- if(numSeqs % 100 != 0) { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
-
-
- string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
-
- string newFastaFile = fileroot + "precluster" + m->getExtension(fastafile);
- string newNamesFile = fileroot + "precluster.names";
-
- if (m->control_pressed) { return 0; }
-
- m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
- m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
- printData(newFastaFile, newNamesFile);
-
- m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
-
- if (m->control_pressed) { m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
-
- m->mothurOutEndLine();
- m->mothurOut("Output File Names: "); m->mothurOutEndLine();
- m->mothurOut(newFastaFile); m->mothurOutEndLine(); outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
- m->mothurOut(newNamesFile); m->mothurOutEndLine(); outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile);
- m->mothurOutEndLine();
-
- //set fasta file as new current fastafile
- string current = "";
- itTypes = outputTypes.find("fasta");
- if (itTypes != outputTypes.end()) {
- if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
- }
-
- itTypes = outputTypes.find("name");
- if (itTypes != outputTypes.end()) {
- if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
- }
+ if(numSeqs % 100 != 0) { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
- return 0;
+ return count;
}
catch(exception& e) {
- m->errorOut(e, "PreClusterCommand", "execute");
+ m->errorOut(e, "PreClusterCommand", "process");
exit(1);
}
}
-
/**************************************************************************************************/
-
-//this requires the names and fasta file to be in the same order
-
int PreClusterCommand::readFASTA(){
try {
//ifstream inNames;
exit(1);
}
}
-
+/**************************************************************************************************/
+int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs){
+ try {
+ length = 0;
+ alignSeqs.clear();
+ map<string, string>::iterator it;
+ bool error = false;
+
+ for (int i = 0; i < thisSeqs.size(); i++) {
+
+ if (m->control_pressed) { return 0; }
+
+ if (namefile != "") {
+ it = thisName.find(thisSeqs[i].getName());
+
+ //should never be true since parser checks for this
+ if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
+ else{
+ //get number of reps
+ int numReps = 0;
+ for(int j=0;j<(it->second).length();j++){
+ if((it->second)[j] == ','){ numReps++; }
+ }
+
+ seqPNode tempNode(numReps, thisSeqs[i], it->second);
+ alignSeqs.push_back(tempNode);
+ if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); }
+ }
+ }else { //no names file, you are identical to yourself
+ seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
+ alignSeqs.push_back(tempNode);
+ if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); }
+ }
+ }
+
+ //sanity check
+ if (error) { m->control_pressed = true; }
+
+ thisSeqs.clear();
+
+ return alignSeqs.size();
+ }
+
+ catch(exception& e) {
+ m->errorOut(e, "PreClusterCommand", "loadSeqs");
+ exit(1);
+ }
+}
+
/**************************************************************************************************/
int PreClusterCommand::calcMisMatches(string seq1, string seq2){
ofstream outFasta;
ofstream outNames;
- m->openOutputFile(newfasta, outFasta);
- m->openOutputFile(newname, outNames);
+ if (bygroup) {
+ m->openOutputFileAppend(newfasta, outFasta);
+ m->openOutputFileAppend(newname, outNames);
+ }else {
+ m->openOutputFile(newfasta, outFasta);
+ m->openOutputFile(newname, outNames);
+ }
-
for (int i = 0; i < alignSeqs.size(); i++) {
if (alignSeqs[i].numIdentical != 0) {
alignSeqs[i].seq.printSequence(outFasta);
private:
int diffs, length;
- bool abort;
- string fastafile, namefile, outputDir;
+ bool abort, bygroup;
+ string fastafile, namefile, outputDir, groupfile;
vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
map<string, string> names; //represents the names file first column maps to second column
map<string, int> sizes; //this map a seq name to the number of identical seqs in the names file
//int readNamesFASTA();
int calcMisMatches(string, string);
void printData(string, string); //fasta filename, names file name
+ int process();
+ int loadSeqs(map<string, string>&, vector<Sequence>&);
};
/************************************************************/
CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap", "sobs", "", "", "",true,false); parameters.push_back(pcalc);
CommandParameter pabund("abund", "Number", "", "10", "", "", "",false,false); parameters.push_back(pabund);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+ CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pgroupmode);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
helpString += "Example rarefaction.single(label=unique-.01-.03, iters=10000, freq=10, calc=sobs-rchao-race-rjack-rbootstrap-rshannon-rnpshannon-rsimpson).\n";
helpString += "The default values for iters is 1000, freq is 100, and calc is rarefaction which calculates the rarefaction curve for the observed richness.\n";
validCalculator.printCalc("rarefaction");
+ helpString += "If you are running rarefaction.single with a shared file and would like your results collated in one file, set groupmode=t. (Default=true).\n";
helpString += "The label parameter is used to analyze specific labels in your input.\n";
helpString += "Note: No spaces between parameter labels (i.e. freq), '=' and parameters (i.e.yourFreq).\n";
return helpString;
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
convert(temp, processors);
+
+ temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; }
+ groupMode = m->isTrue(temp);
}
}
}
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ //create summary file containing all the groups data for each label - this function just combines the info from the files already created.
+ if ((sharedfile != "") && (groupMode)) { outputNames = createGroupFile(outputNames); }
+
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
m->mothurOutEndLine();
}
}
//**********************************************************************************************************************
+vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames) {
+ try {
+
+ vector<string> newFileNames;
+
+ //find different types of files
+ map<string, vector<string> > typesFiles;
+ for (int i = 0; i < outputNames.size(); i++) {
+ string extension = m->getExtension(outputNames[i]);
+
+ ifstream in;
+ m->openInputFile(outputNames[i], in);
+
+ string labels = m->getline(in);
+ string newLine = labels.substr(0, labels.find_first_of('\t'));
+
+ newLine += "\tGroup" + labels.substr(labels.find_first_of('\t'));
+
+ typesFiles[extension].push_back(outputNames[i]);
+
+ string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+
+ //print headers
+ ofstream out;
+ m->openOutputFile(combineFileName, out);
+ out << newLine << endl;
+ out.close();
+
+ }
+
+ //for each type create a combo file
+ for (map<string, vector<string> >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) {
+
+ ofstream out;
+ string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first;
+ m->openOutputFileAppend(combineFileName, out);
+ newFileNames.push_back(combineFileName);
+
+ vector<string> thisTypesFiles = it->second;
+
+ //open each type summary file
+ map<string, vector<string> > files; //maps file name to lines in file
+ int maxLines = 0;
+ for (int i=0; i<thisTypesFiles.size(); i++) {
+
+ ifstream temp;
+ m->openInputFile(thisTypesFiles[i], temp);
+
+ //read through first line - labels
+ m->getline(temp); m->gobble(temp);
+
+ vector<string> thisFilesLines;
+ string group = m->getRootName(thisTypesFiles[i]);
+ group = group.substr(0, group.length()-1);
+ group = group.substr(group.find_last_of('.')+1);
+
+ thisFilesLines.push_back(group);
+ while (!temp.eof()){
+
+ string thisLine = m->getline(temp);
+
+ thisFilesLines.push_back(thisLine);
+
+ m->gobble(temp);
+ }
+
+ files[thisTypesFiles[i]] = thisFilesLines;
+
+ //save longest file for below
+ if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); }
+
+ temp.close();
+ m->mothurRemove(thisTypesFiles[i]);
+ }
+
+
+ //for each label
+ for (int k = 1; k < maxLines; k++) {
+
+ //grab data for each group
+ for (int i=0; i<thisTypesFiles.size(); i++) {
+
+ string output = "NA\t";
+ if (k < files[thisTypesFiles[i]].size()) {
+ string line = files[thisTypesFiles[i]][k];
+ output = line.substr(0, line.find_first_of('\t'));
+ output += '\t' + files[thisTypesFiles[i]][0] + '\t' + line.substr(line.find_first_of('\t'));
+ }else{
+ output += '\t' + files[thisTypesFiles[i]][0] + '\t';
+ }
+ out << output << endl;
+ }
+ }
+
+ out.close();
+
+ }
+
+ //return combine file name
+ return newFileNames;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RareFactCommand", "createGroupFile");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
vector<string> RareFactCommand::parseSharedFile(string filename) {
try {
vector<string> filenames;
int nIters, abund, processors;
float freq;
- bool abort, allLines;
+ bool abort, allLines, groupMode;
set<string> labels; //holds labels to be used
string label, calc, sharedfile, listfile, rabundfile, sabundfile, format, inputfile;
vector<string> Estimators;
string outputDir;
vector<string> parseSharedFile(string);
+ vector<string> createGroupFile(vector<string>&);
};
#endif
if (abort == true) { if (calledHelp) { return 0; } return 2; }
//if the user want to optimize we need to know the 90% mark
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
//use the namefile to optimize correctly
if (namefile != "") { nameMap = m->readNames(namefile); }
#ifdef USE_MPI
int pid, numSeqsPerProcessor;
int tag = 2001;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPI_Status status;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
}
}
//***************************************************************************************************************
-int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
+int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
try {
vector<int> startPosition;
vector<int> ambigBases;
vector<int> longHomoPolymer;
- vector<unsigned long int> positions = m->divideFile(fastafile, processors);
+ vector<unsigned long long> positions = m->divideFile(fastafile, processors);
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = in.tellg();
+ unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (in.eof()) { break; }
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (inFASTA.eof()) { break; }
}
//**********************************************************************************************************************
#ifdef USE_MPI
-int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long int>& MPIPos, set<string>& badSeqNames){
+int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, set<string>& badSeqNames){
try {
string outputString = "";
MPI_Status statusGood;
private:
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<int> processIDS; //processid
int createProcesses(string, string, string, set<string>&);
#ifdef USE_MPI
- int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&, set<string>&);
+ int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, set<string>&);
#endif
bool abort;
map<string, int> nameMap;
int readNames();
- int getSummary(vector<unsigned long int>&);
+ int getSummary(vector<unsigned long long>&);
int createProcessesCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string);
int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, linePair*);
};
if(namesFileName != ""){ weights = getWeights(); }
- vector<unsigned long int> fastaFilePos;
- vector<unsigned long int> qFilePos;
- vector<unsigned long int> reportFilePos;
+ vector<unsigned long long> fastaFilePos;
+ vector<unsigned long long> qFilePos;
+ vector<unsigned long long> reportFilePos;
setLines(queryFileName, qualFileName, reportFileName, fastaFilePos, qFilePos, reportFilePos);
index++;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = queryFile.tellg();
+ unsigned long long pos = queryFile.tellg();
if ((pos == -1) || (pos >= line.end)) { break; }
#else
if (queryFile.eof()) { break; }
}
/**************************************************************************************************/
-int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos, vector<unsigned long int>& rfileFilePos) {
+int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos, vector<unsigned long long>& rfileFilePos) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
//set file positions for fasta file
map<string, int>::iterator it = firstSeqNames.find(sname);
if(it != firstSeqNames.end()) { //this is the start of a new chunk
- unsigned long int pos = inQual.tellg();
+ unsigned long long pos = inQual.tellg();
qfileFilePos.push_back(pos - input.length() - 1);
firstSeqNames.erase(it);
}
//get last file position of qfile
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (qfilename.c_str(),"rb");
map<string, int>::iterator it = firstSeqNamesReport.find(sname);
if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk
- unsigned long int pos = inR.tellg();
+ unsigned long long pos = inR.tellg();
rfileFilePos.push_back(pos - input.length() - 1);
firstSeqNamesReport.erase(it);
}
//get last file position of qfile
FILE * rFile;
- unsigned long int sizeR;
+ unsigned long long sizeR;
//get num bytes in file
rFile = fopen (rfilename.c_str(),"rb");
fastaFilePos.push_back(0); qfileFilePos.push_back(0);
//get last file position of fastafile
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (filename.c_str(),"rb");
ReferenceDB* rdb;
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<int> processIDS; //processid
void printErrorQuality(map<char, vector<int> >);
void printQualityFR(vector<vector<int> >, vector<vector<int> >);
- int setLines(string, string, string, vector<unsigned long int>&, vector<unsigned long int>&, vector<unsigned long int>&);
+ int setLines(string, string, string, vector<unsigned long long>&, vector<unsigned long long>&, vector<unsigned long long>&);
int driver(string, string, string, string, string, string, linePair, linePair, linePair);
int createProcesses(string, string, string, string, string, string);
int tag = 2001;
int startTag = 1; int endTag = 2; int lengthTag = 3; int baseTag = 4; int lhomoTag = 5;
int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
- vector<unsigned long int> MPIPos;
+ vector<unsigned long long> MPIPos;
MPI_Status status;
MPI_Status statusOut;
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
- vector<unsigned long int> positions;
+ vector<unsigned long long> positions;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
positions = m->divideFile(fastafile, processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); }
if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; }
m->mothurOutEndLine();
- m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer"); m->mothurOutEndLine();
- m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); m->mothurOutEndLine();
- m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); m->mothurOutEndLine();
- m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); m->mothurOutEndLine();
- m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); m->mothurOutEndLine();
- m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); m->mothurOutEndLine();
- m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); m->mothurOutEndLine();
- m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); m->mothurOutEndLine();
+ m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine();
+ m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0]) + "\t" + toString(1)); m->mothurOutEndLine();
+ m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25]) + "\t" + toString(ptile0_25+1)); m->mothurOutEndLine();
+ m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25]) + "\t" + toString(ptile25+1)); m->mothurOutEndLine();
+ m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50]) + "\t" + toString(ptile50+1)); m->mothurOutEndLine();
+ m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine();
+ m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine();
+ m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine();
if (namefile == "") { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); }
else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); }
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = in.tellg();
+ unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
if (in.eof()) { break; }
}
#ifdef USE_MPI
/**************************************************************************************/
-int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long int>& MPIPos) {
+int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos) {
try {
int pid;
map<string, int> nameMap;
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
vector<linePair*> lines;
int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, string, linePair*);
#ifdef USE_MPI
- int MPICreateSummary(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+ int MPICreateSummary(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, MPI_File&, MPI_File&, vector<unsigned long long>&);
#endif
vector<int>* longHomoPolymer;
string filename;
string sumFile;
- unsigned long int start;
- unsigned long int end;
+ unsigned long long start;
+ unsigned long long end;
int count;
MothurOut* m;
string namefile;
seqSumData(){}
- seqSumData(vector<int>* s, vector<int>* e, vector<int>* l, vector<int>* a, vector<int>* h, string f, string sf, MothurOut* mout, unsigned long int st, unsigned long int en, string na, map<string, int> nam) {
+ seqSumData(vector<int>* s, vector<int>* e, vector<int>* l, vector<int>* a, vector<int>* h, string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, string na, map<string, int> nam) {
startPosition = s;
endPosition = e;
seqLength = l;
--- /dev/null
+/*
+ * sequenceParser.cpp
+ * Mothur
+ *
+ * Created by westcott on 9/9/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "sequenceParser.h"
+
+
+/************************************************************/
+SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
+ try {
+
+ m = MothurOut::getInstance();
+ int error;
+
+ //read group file
+ groupMap = new GroupMap(groupFile);
+ error = groupMap->readMap();
+
+ if (error == 1) { m->control_pressed = true; }
+
+ //initialize maps
+ vector<string> namesOfGroups = groupMap->getNamesOfGroups();
+ for (int i = 0; i < namesOfGroups.size(); i++) {
+ vector<Sequence> temp;
+ map<string, string> tempMap;
+ seqs[namesOfGroups[i]] = temp;
+ nameMapPerGroup[namesOfGroups[i]] = tempMap;
+ }
+
+ //read fasta file making sure each sequence is in the group file
+ ifstream in;
+ m->openInputFile(fastaFile, in);
+
+ map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
+ while (!in.eof()) {
+
+ if (m->control_pressed) { break; }
+
+ Sequence seq(in); m->gobble(in);
+
+ if (seq.getName() != "") {
+
+ string group = groupMap->getGroup(seq.getName());
+ if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
+ else {
+ seqs[group].push_back(seq);
+ seqName[seq.getName()] = seq.getAligned();
+ }
+ }
+ }
+ in.close();
+
+ if (error == 1) { m->control_pressed = true; }
+
+ //read name file
+ ifstream inName;
+ m->openInputFile(nameFile, inName);
+
+ string first, second;
+ int countName = 0;
+ while(!inName.eof()) {
+
+ if (m->control_pressed) { break; }
+
+ inName >> first; m->gobble(inName);
+ inName >> second; m->gobble(inName);
+
+ vector<string> names;
+ m->splitAtChar(second, names, ',');
+
+ //get aligned string for these seqs from the fasta file
+ string alignedString = "";
+ map<string, string>::iterator itAligned = seqName.find(names[0]);
+ if (itAligned == seqName.end()) {
+ error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
+ }else {
+ alignedString = itAligned->second;
+ }
+
+ //separate by group - parse one line in name file
+ map<string, string> splitMap; //group -> name1,name2,...
+ map<string, string>::iterator it;
+ for (int i = 0; i < names.size(); i++) {
+
+ string group = groupMap->getGroup(names[i]);
+ if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
+ else {
+
+ it = splitMap.find(group);
+ if (it != splitMap.end()) { //adding seqs to this group
+ (it->second) += "," + names[i];
+ countName++;
+ }else { //first sighting of this group
+ splitMap[group] = names[i];
+ countName++;
+
+ //is this seq in the fasta file?
+ if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
+ Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
+ seqs[group].push_back(tempSeq);
+ }
+ }
+ }
+ }
+
+
+ //fill nameMapPerGroup - holds all lines in namefile separated by group
+ for (it = splitMap.begin(); it != splitMap.end(); it++) {
+ //grab first name
+ string firstName = "";
+ for(int i = 0; i < (it->second).length(); i++) {
+ if (((it->second)[i]) != ',') {
+ firstName += ((it->second)[i]);
+ }else { break; }
+ }
+
+ //group1 -> seq1 -> seq1,seq2,seq3
+ nameMapPerGroup[it->first][firstName] = it->second;
+ }
+ }
+
+ inName.close();
+
+ if (error == 1) { m->control_pressed = true; }
+
+ if (countName != (groupMap->getNumSeqs())) {
+ m->mothurOutEndLine();
+ m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
+ m->mothurOutEndLine();
+ m->control_pressed = true;
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceParser", "SequenceParser");
+ exit(1);
+ }
+}
+/************************************************************/
+SequenceParser::SequenceParser(string groupFile, string fastaFile) {
+ try {
+
+ m = MothurOut::getInstance();
+ int error;
+
+ //read group file
+ groupMap = new GroupMap(groupFile);
+ error = groupMap->readMap();
+
+ if (error == 1) { m->control_pressed = true; }
+
+ //initialize maps
+ vector<string> namesOfGroups = groupMap->getNamesOfGroups();
+ for (int i = 0; i < namesOfGroups.size(); i++) {
+ vector<Sequence> temp;
+ seqs[namesOfGroups[i]] = temp;
+ }
+
+ //read fasta file making sure each sequence is in the group file
+ ifstream in;
+ m->openInputFile(fastaFile, in);
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { break; }
+
+ Sequence seq(in); m->gobble(in);
+
+ if (seq.getName() != "") {
+
+ string group = groupMap->getGroup(seq.getName());
+ if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); }
+ else { seqs[group].push_back(seq); count++; }
+ }
+ }
+ in.close();
+
+ if (error == 1) { m->control_pressed = true; }
+
+ if (count != (groupMap->getNumSeqs())) {
+ m->mothurOutEndLine();
+ m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
+ if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
+ m->mothurOutEndLine();
+ m->control_pressed = true;
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceParser", "SequenceParser");
+ exit(1);
+ }
+}
+/************************************************************/
+SequenceParser::~SequenceParser(){ delete groupMap; }
+/************************************************************/
+int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
+/************************************************************/
+vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
+/************************************************************/
+bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
+/************************************************************/
+string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
+/************************************************************/
+int SequenceParser::getNumSeqs(string g){
+ try {
+ map<string, vector<Sequence> >::iterator it;
+ int num = 0;
+
+ it = seqs.find(g);
+ if(it == seqs.end()) {
+ m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
+ }else {
+ num = (it->second).size();
+ }
+
+ return num;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceParser", "getNumSeqs");
+ exit(1);
+ }
+}
+/************************************************************/
+vector<Sequence> SequenceParser::getSeqs(string g){
+ try {
+ map<string, vector<Sequence> >::iterator it;
+ vector<Sequence> seqForThisGroup;
+
+ it = seqs.find(g);
+ if(it == seqs.end()) {
+ m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+ }else {
+ seqForThisGroup = it->second;
+ }
+
+ return seqForThisGroup;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceParser", "getSeqs");
+ exit(1);
+ }
+}
+/************************************************************/
+map<string, string> SequenceParser::getNameMap(string g){
+ try {
+ map<string, map<string, string> >::iterator it;
+ map<string, string> nameMapForThisGroup;
+
+ it = nameMapPerGroup.find(g);
+ if(it == nameMapPerGroup.end()) {
+ m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
+ }else {
+ nameMapForThisGroup = it->second;
+ }
+
+ return nameMapForThisGroup;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SequenceParser", "getNameMap");
+ exit(1);
+ }
+}
+/************************************************************/
+
+
+
--- /dev/null
+#ifndef SEQUENCEPARSER_H
+#define SEQUENCEPARSER_H
+
+/*
+ * sequenceParser.h
+ * Mothur
+ *
+ * Created by westcott on 9/9/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "sequence.hpp"
+#include "groupmap.h"
+
+/* This class reads a fasta and group file with a namesfile as optional and parses the data by group.
+
+ Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences.
+ This is because when we parse the name file we make a unique for each group instead of 1 unique for all
+ groups.
+
+ */
+
+class SequenceParser {
+
+ public:
+
+ SequenceParser(string, string); //group, fasta - file mismatches will set m->control_pressed = true
+ SequenceParser(string, string, string); //group, fasta, name - file mismatches will set m->control_pressed = true
+ ~SequenceParser();
+
+ //general operations
+ int getNumGroups();
+ vector<string> getNamesOfGroups();
+ bool isValidGroup(string); //return true if string is a valid group
+ string getGroup(string); //returns group of a specific sequence
+
+ int getNumSeqs(string); //returns the number of unique sequences in a specific group
+ vector<Sequence> getSeqs(string); //returns unique sequences in a specific group
+ map<string, string> getNameMap(string); //returns seqName -> namesOfRedundantSeqs separated by commas for a specific group - the name file format, but each line is parsed by group.
+
+ private:
+
+ GroupMap* groupMap;
+ MothurOut* m;
+
+ int numSeqs;
+ map<string, vector<Sequence> > seqs; //a vector for each group
+ map<string, map<string, string> > nameMapPerGroup; //nameMap for each group
+};
+
+#endif
+
//read offset
char buffer2 [8];
in.read(buffer2, 8);
- header.indexOffset = be_int8(*(unsigned long int *)(&buffer2));
+ header.indexOffset = be_int8(*(unsigned long long *)(&buffer2));
//read index length
char buffer3 [4];
delete[] tempBuffer2;
/* Pad to 8 chars */
- unsigned long int spotInFile = in.tellg();
- unsigned long int spot = (spotInFile + 7)& ~7; // ~ inverts
+ unsigned long long spotInFile = in.tellg();
+ unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts
in.seekg(spot);
}else{
decodeName(header.timestamp, header.region, header.xy, header.name);
/* Pad to 8 chars */
- unsigned long int spotInFile = in.tellg();
- unsigned long int spot = (spotInFile + 7)& ~7;
+ unsigned long long spotInFile = in.tellg();
+ unsigned long long spot = (spotInFile + 7)& ~7;
in.seekg(spot);
}else{
}
/* Pad to 8 chars */
- unsigned long int spotInFile = in.tellg();
- unsigned long int spot = (spotInFile + 7)& ~7;
+ unsigned long long spotInFile = in.tellg();
+ unsigned long long spot = (spotInFile + 7)& ~7;
in.seekg(spot);
}else{
struct CommonHeader {
unsigned int magicNumber;
string version;
- unsigned long int indexOffset;
+ unsigned long long indexOffset;
unsigned int indexLength;
unsigned int numReads;
unsigned short headerLength;
void SharedRAbundVector::setGroupIndex(int vIndex) { index = vIndex; }
/***********************************************************************/
int SharedRAbundVector::getNumBins(){
- return numBins;
+ return numBins;
}
/***********************************************************************/
vector<string> Groups = m->getGroups();
vector<string> allGroups = m->getAllGroups();
util->setGroups(Groups, allGroups);
+ m->setGroups(Groups);
bool remove = false;
for (int i = 0; i < lookup.size(); i++) {
#include "sparsematrix.hpp"
#include <cfloat>
-//**********************************************************************************************************************
-
-#define NUMBINS 1000
-#define HOMOPS 10
-#define MIN_COUNT 0.1
-#define MIN_WEIGHT 0.1
-#define MIN_TAU 0.0001
-#define MIN_ITER 10
//**********************************************************************************************************************
vector<string> ShhherCommand::setParameters(){
try {
processors = ncpus;
m->mothurOut("\nGetting preliminary data...\n");
- getSingleLookUp();
- getJointLookUp();
+ getSingleLookUp(); if (m->control_pressed) { return 0; }
+ getJointLookUp(); if (m->control_pressed) { return 0; }
vector<string> flowFileVector;
if(flowFilesFileName != "not found"){
ifstream flowFilesFile;
m->openInputFile(flowFilesFileName, flowFilesFile);
while(flowFilesFile){
- flowFilesFile >> fName;
+ fName = m->getline(flowFilesFile);
flowFileVector.push_back(fName);
m->gobble(flowFilesFile);
}
}
for(int i=0;i<numFiles;i++){
+
+ if (m->control_pressed) { break; }
+
double begClock = clock();
- unsigned long int begTime = time(NULL);
+ unsigned long long begTime = time(NULL);
flowFileName = flowFileVector[i];
m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
m->mothurOut("Reading flowgrams...\n");
getFlowData();
+
+ if (m->control_pressed) { break; }
m->mothurOut("Identifying unique flowgrams...\n");
getUniques();
+
+ if (m->control_pressed) { break; }
m->mothurOut("Calculating distances between flowgrams...\n");
char fileName[1024];
string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
+ if (m->control_pressed) { break; }
+
int done;
for(int i=1;i<ncpus;i++){
MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
string namesFileName = createNamesFile();
+ if (m->control_pressed) { break; }
+
m->mothurOut("\nClustering flowgrams...\n");
string listFileName = cluster(distFileName, namesFileName);
-
+
+ if (m->control_pressed) { break; }
+
getOTUData(listFileName);
m->mothurRemove(distFileName);
m->mothurRemove(namesFileName);
m->mothurRemove(listFileName);
+ if (m->control_pressed) { break; }
+
initPyroCluster();
+ if (m->control_pressed) { break; }
+
for(int i=1;i<ncpus;i++){
MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
}
+ if (m->control_pressed) { break; }
double maxDelta = 0;
int iter = 0;
m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
-
+
double cycClock = clock();
- unsigned long int cycTime = time(NULL);
+ unsigned long long cycTime = time(NULL);
fill();
+
+ if (m->control_pressed) { break; }
int total = singleTau.size();
for(int i=1;i<ncpus;i++){
}
}
- maxDelta = getNewWeights();
- double nLL = getLikelihood();
- checkCentroids();
+ maxDelta = getNewWeights(); if (m->control_pressed) { break; }
+ double nLL = getLikelihood(); if (m->control_pressed) { break; }
+ checkCentroids(); if (m->control_pressed) { break; }
for(int i=1;i<ncpus;i++){
MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
}
+ if (m->control_pressed) { break; }
+
m->mothurOut("\nFinalizing...\n");
fill();
+
+ if (m->control_pressed) { break; }
+
setOTUs();
vector<int> otuCounts(numOTUs, 0);
for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
calcCentroidsDriver(0, numOTUs);
- writeQualities(otuCounts);
- writeSequences(otuCounts);
- writeNames(otuCounts);
- writeClusters(otuCounts);
- writeGroups();
+ if (m->control_pressed) { break; }
+
+ writeQualities(otuCounts); if (m->control_pressed) { break; }
+ writeSequences(otuCounts); if (m->control_pressed) { break; }
+ writeNames(otuCounts); if (m->control_pressed) { break; }
+ writeClusters(otuCounts); if (m->control_pressed) { break; }
+ writeGroups(); if (m->control_pressed) { break; }
m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
for(int i=0;i<numFiles;i++){
+
+ if (m->control_pressed) { break; }
+
//Now into the pyrodist part
bool live = 1;
int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
-
+
+ if (m->control_pressed) { break; }
+
int done = 1;
MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
while(live){
+ if (m->control_pressed) { break; }
+
MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
singleTau.assign(total, 0.0000);
seqNumber.assign(total, 0);
MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
}
}
- }
+ }
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
MPI_Barrier(MPI_COMM_WORLD);
double begClock = clock();
for(int i=startSeq;i<stopSeq;i++){
+
+ if (m->control_pressed) { break; }
+
for(int j=0;j<i;j++){
float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
}
}
- m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
-
string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
if(pid != 0){ fDistFileName += ".temp." + toString(pid); }
+
+ if (m->control_pressed) { return fDistFileName; }
+
+ m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
ofstream distFile(fDistFileName.c_str());
distFile << outStream.str();
try {
if (abort == true) { return 0; }
- getSingleLookUp();
- getJointLookUp();
+ getSingleLookUp(); if (m->control_pressed) { return 0; }
+ getJointLookUp(); if (m->control_pressed) { return 0; }
+
vector<string> flowFileVector;
if(flowFilesFileName != "not found"){
string fName;
ifstream flowFilesFile;
m->openInputFile(flowFilesFileName, flowFilesFile);
while(flowFilesFile){
- flowFilesFile >> fName;
+ fName = m->getline(flowFilesFile);
flowFileVector.push_back(fName);
m->gobble(flowFilesFile);
}
for(int i=0;i<numFiles;i++){
+
+ if (m->control_pressed) { break; }
+
flowFileName = flowFileVector[i];
m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
m->mothurOut("Reading flowgrams...\n");
getFlowData();
+ if (m->control_pressed) { break; }
+
m->mothurOut("Identifying unique flowgrams...\n");
getUniques();
+ if (m->control_pressed) { break; }
m->mothurOut("Calculating distances between flowgrams...\n");
string distFileName = createDistFile(processors);
string namesFileName = createNamesFile();
-
+
+ if (m->control_pressed) { break; }
+
m->mothurOut("\nClustering flowgrams...\n");
string listFileName = cluster(distFileName, namesFileName);
+
+ if (m->control_pressed) { break; }
+
getOTUData(listFileName);
+ if (m->control_pressed) { break; }
+
m->mothurRemove(distFileName);
m->mothurRemove(namesFileName);
m->mothurRemove(listFileName);
initPyroCluster();
+ if (m->control_pressed) { break; }
+
double maxDelta = 0;
int iter = 0;
double begClock = clock();
- unsigned long int begTime = time(NULL);
+ unsigned long long begTime = time(NULL);
m->mothurOut("\nDenoising flowgrams...\n");
m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
+ if (m->control_pressed) { break; }
+
double cycClock = clock();
- unsigned long int cycTime = time(NULL);
+ unsigned long long cycTime = time(NULL);
fill();
+ if (m->control_pressed) { break; }
+
calcCentroids();
- maxDelta = getNewWeights();
- double nLL = getLikelihood();
+ if (m->control_pressed) { break; }
+
+ maxDelta = getNewWeights(); if (m->control_pressed) { break; }
+ double nLL = getLikelihood(); if (m->control_pressed) { break; }
checkCentroids();
+ if (m->control_pressed) { break; }
+
calcNewDistances();
-
+
+ if (m->control_pressed) { break; }
+
iter++;
m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
}
+ if (m->control_pressed) { break; }
+
m->mothurOut("\nFinalizing...\n");
fill();
+
+ if (m->control_pressed) { break; }
+
setOTUs();
+ if (m->control_pressed) { break; }
+
vector<int> otuCounts(numOTUs, 0);
for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
- calcCentroidsDriver(0, numOTUs);
- writeQualities(otuCounts);
- writeSequences(otuCounts);
- writeNames(otuCounts);
- writeClusters(otuCounts);
- writeGroups();
+ calcCentroidsDriver(0, numOTUs); if (m->control_pressed) { break; }
+ writeQualities(otuCounts); if (m->control_pressed) { break; }
+ writeSequences(otuCounts); if (m->control_pressed) { break; }
+ writeNames(otuCounts); if (m->control_pressed) { break; }
+ writeClusters(otuCounts); if (m->control_pressed) { break; }
+ writeGroups(); if (m->control_pressed) { break; }
m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
}
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+
if(compositeFASTAFileName != ""){
outputNames.push_back(compositeFASTAFileName);
outputNames.push_back(compositeNamesFileName);
flowFile >> numFlowCells;
int index = 0;//pcluster
while(!flowFile.eof()){
+
+ if (m->control_pressed) { break; }
+
flowFile >> seqName >> currentNumFlowCells;
lengths.push_back(currentNumFlowCells);
numSeqs = seqNameVector.size();
for(int i=0;i<numSeqs;i++){
+
+ if (m->control_pressed) { break; }
+
int iNumFlowCells = i * numFlowCells;
for(int j=lengths[i];j<numFlowCells;j++){
flowDataIntI[iNumFlowCells + j] = 0;
m->openInputFile(lookupFileName, lookUpFile);
for(int i=0;i<HOMOPS;i++){
+
+ if (m->control_pressed) { break; }
+
float logFracFreq;
lookUpFile >> logFracFreq;
jointLookUp.resize(NUMBINS * NUMBINS, 0);
for(int i=0;i<NUMBINS;i++){
+
+ if (m->control_pressed) { break; }
+
for(int j=0;j<NUMBINS;j++){
double minSum = 100000000;
for(int i=0;i<HOMOPS;i++){//loop signal strength
+
+ if (m->control_pressed) { break; }
+
float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
}
vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
for(int i=0;i<numSeqs;i++){
+
+ if (m->control_pressed) { break; }
+
int index = 0;
vector<short> current(numFlowCells);
uniqueLengths.resize(numUniques);
flowDataPrI.resize(numSeqs * numFlowCells, 0);
- for(int i=0;i<flowDataPrI.size();i++) { flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
+ for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "getUniques");
float dist = 0;
for(int i=0;i<minLength;i++){
+
+ if (m->control_pressed) { break; }
+
int flowAIntI = flowDataIntI[ANumFlowCells + i];
float flowAPrI = flowDataPrI[ANumFlowCells + i];
double begClock = clock();
for(int i=startSeq;i<stopSeq;i++){
+
+ if (m->control_pressed) { break; }
+
for(int j=0;j<i;j++){
float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
m->mothurOutEndLine();
}
}
- m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
- m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- m->mothurOutEndLine();
ofstream distFile(distFileName.c_str());
distFile << outStream.str();
distFile.close();
+
+ if (m->control_pressed) {}
+ else {
+ m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
+ m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+ m->mothurOutEndLine();
+ }
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "flowDistParentFork");
string ShhherCommand::createDistFile(int processors){
try{
+//////////////////////// until I figure out the shared memory issue //////////////////////
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+ processors=1;
+#endif
+//////////////////////// until I figure out the shared memory issue //////////////////////
+
string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
- unsigned long int begTime = time(NULL);
+ unsigned long long begTime = time(NULL);
double begClock = clock();
-
- vector<int> start;
- vector<int> end;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ if (numSeqs < processors){ processors = 1; }
+
if(processors == 1) { flowDistParentFork(fDistFileName, 0, numUniques); }
+
else{ //you have multiple processors
- if (numSeqs < processors){ processors = 1; }
-
vector<int> start(processors, 0);
vector<int> end(processors, 0);
+ int process = 1;
+ vector<int> processIDs;
+
for (int i = 0; i < processors; i++) {
start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
}
- int process = 1;
- vector<int> processIDs;
-
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+
//loop through and create all the processes you want
while (process != processors) {
int pid = fork();
int temp = processIDs[i];
wait(&temp);
}
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the flowDistParentForkData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<flowDistParentForkData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for(int i = 0; i < processors-1; i++){
+ // Allocate memory for thread data.
+ string extension = extension = toString(i) + ".temp";
+
+ flowDistParentForkData* tempdist = new flowDistParentForkData((fDistFileName + extension), mapUniqueToSeq, mapSeqToUnique, lengths, flowDataIntI, flowDataPrI, jointLookUp, m, start[i+1], end[i+1], numFlowCells, cutoff, i);
+ pDataArray.push_back(tempdist);
+ processIDs.push_back(i);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i] = CreateThread(NULL, 0, MyflowDistParentForkThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ //parent does its part
+ flowDistParentFork(fDistFileName, start[0], end[0]);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+#endif
//append and remove temp files
for (int i=0;i<processIDs.size();i++) {
}
-#else
- flowDistParentFork(fDistFileName, 0, numUniques);
-#endif
-
m->mothurOutEndLine();
-
m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
-
return fDistFileName;
}
catch(exception& e) {
m->openOutputFile(nameFileName, nameFile);
for(int i=0;i<numUniques;i++){
+
+ if (m->control_pressed) { break; }
+
// nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
}
double clusterCutoff = cutoff;
while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+ if (m->control_pressed) { break; }
+
cluster->update(clusterCutoff);
}
string singleOTU = "";
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
listFile >> singleOTU;
void ShhherCommand::initPyroCluster(){
try{
+
+ if (numOTUs < processors) { processors = 1; }
+
dist.assign(numSeqs * numOTUs, 0);
change.assign(numOTUs, 1);
centroids.assign(numOTUs, -1);
try {
int index = 0;
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
cumNumSeqs[i] = index;
for(int j=0;j<nSeqsPerOTU[i];j++){
seqNumber[index] = aaP[i][j];
try{
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
+
if(processors == 1) {
calcCentroidsDriver(0, numOTUs);
}
try{
-
for(int i=start;i<finish;i++){
+ if (m->control_pressed) { break; }
+
double count = 0;
int position = 0;
int minFlowGram = 100000000;
int flowBValue = flow * numFlowCells;
double dist = 0;
-
+
for(int i=0;i<length;i++){
dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
flowAValue++;
for(int i=0;i<numOTUs;i++){
+ if (m->control_pressed) { break; }
+
double difference = weight[i];
weight[i] = 0;
string hold;
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
for(int j=0;j<nSeqsPerOTU[i];j++){
int index = cumNumSeqs[i] + j;
int nI = seqIndex[index];
}
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
if(unique[i] == 1){
for(int j=i+1;j<numOTUs;j++){
if(unique[j] == 1){
try{
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
+
if(processors == 1) {
calcNewDistancesParent(0, numSeqs);
}
exit(0);
}
}
-
+
//parent does its part
calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
int total = seqIndex.size();
seqIndex.clear();
singleTau.clear();
-
-
for(int i=startSeq;i<stopSeq;i++){
+
+ if (m->control_pressed) { break; }
+
double offset = 1e8;
int indexOffset = i * numOTUs;
child_singleTau.resize(0);
for(int i=startSeq;i<stopSeq;i++){
+
+ if (m->control_pressed) { break; }
+
double offset = 1e8;
int indexOffset = i * numOTUs;
vector<double> newTau(numOTUs,0);
vector<double> norms(numSeqs, 0);
nSeqsPerOTU.assign(numOTUs, 0);
-
+
for(int i=startSeq;i<stopSeq;i++){
- int indexOffset = i * numOTUs;
+ if (m->control_pressed) { break; }
+
+ int indexOffset = i * numOTUs;
+
double offset = 1e8;
for(int j=0;j<numOTUs;j++){
+
if(weight[j] > MIN_WEIGHT && change[j] == 1){
dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
}
-
+
if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
offset = dist[indexOffset + j];
}
}
-
+
for(int j=0;j<numOTUs;j++){
if(weight[j] > MIN_WEIGHT){
newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
newTau[j] = 0.0;
}
}
-
+
for(int j=0;j<numOTUs;j++){
newTau[j] /= norms[i];
}
-
+
for(int j=0;j<numOTUs;j++){
if(newTau[j] > MIN_TAU){
nSeqsPerOTU[j]++;
}
}
+
}
+
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
for(int j=0;j<nSeqsPerOTU[i];j++){
int index = cumNumSeqs[i] + j;
double tauValue = singleTau[seqNumber[index]];
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
int index = 0;
int base = 0;
vector<string> names(numOTUs, "");
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
int index = centroids[i];
if(otuCounts[i] > 0){
m->openOutputFile(nameFileName, nameFile);
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
if(otuCounts[i] > 0){
nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
m->openOutputFile(groupFileName, groupFile);
for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { break; }
groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
}
groupFile.close();
string bases = flowOrder;
for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) {
+ break;
+ }
//output the translated version of the centroid sequence for the otu
if(otuCounts[i] > 0){
int index = centroids[i];
#include "mothur.h"
#include "command.hpp"
+//**********************************************************************************************************************
+
+#define NUMBINS 1000
+#define HOMOPS 10
+#define MIN_COUNT 0.1
+#define MIN_WEIGHT 0.1
+#define MIN_TAU 0.0001
+#define MIN_ITER 10
+//**********************************************************************************************************************
class ShhherCommand : public Command {
};
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+typedef struct flowDistParentForkData {
+ string distFileName;
+ vector<int> mapUniqueToSeq;
+ vector<int> mapSeqToUnique;
+ vector<int> lengths;
+ vector<short> flowDataIntI;
+ vector<double> flowDataPrI;
+ vector<double> jointLookUp;
+ MothurOut* m;
+ int threadID, startSeq, stopSeq, numFlowCells;
+ float cutoff;
+
+ flowDistParentForkData(){}
+ flowDistParentForkData(string d, vector<int> mapU, vector<int> mapS, vector<int> l, vector<short> flowD, vector<double> flowDa, vector<double> j, MothurOut* mout, int st, int sp, int n, float cut, int tid) {
+ distFileName = d;
+ mapUniqueToSeq = mapU;
+ mapSeqToUnique = mapS;
+ lengths = l;
+ flowDataIntI = flowD;
+ flowDataPrI = flowDa;
+ jointLookUp = j;
+ m = mout;
+ startSeq = st;
+ stopSeq = sp;
+ numFlowCells = n;
+ cutoff= cut;
+ threadID = tid;
+ }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyflowDistParentForkThreadFunction(LPVOID lpParam){
+ flowDistParentForkData* pDataArray;
+ pDataArray = (flowDistParentForkData*)lpParam;
+
+ try {
+ ostringstream outStream;
+ outStream.setf(ios::fixed, ios::floatfield);
+ outStream.setf(ios::dec, ios::basefield);
+ outStream.setf(ios::showpoint);
+ outStream.precision(6);
+
+ int begTime = time(NULL);
+ double begClock = clock();
+ string tempOut = "start and end = " + toString(pDataArray->startSeq) +'\t' + toString(pDataArray->stopSeq) + "-";
+ cout << tempOut << endl;
+
+ for(int i=pDataArray->startSeq;i<pDataArray->stopSeq;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+ cout << "thread i = " << i << endl;
+ for(int j=0;j<i;j++){
+
+ cout << "thread j = " << j << endl;
+ float flowDistance = 0.0;
+ ////////////////// calcPairwiseDist ///////////////////
+ //needed because this is a static global function that can't see the classes internal functions
+
+ int minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[i]]];
+ if(pDataArray->lengths[pDataArray->mapUniqueToSeq[j]] < minLength){ minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[j]]]; }
+
+ int ANumFlowCells = pDataArray->mapUniqueToSeq[i] * pDataArray->numFlowCells;
+ int BNumFlowCells = pDataArray->mapUniqueToSeq[j] * pDataArray->numFlowCells;
+
+ for(int k=0;k<minLength;k++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int flowAIntI = pDataArray->flowDataIntI[ANumFlowCells + k];
+ float flowAPrI = pDataArray->flowDataPrI[ANumFlowCells + k];
+
+ int flowBIntI = pDataArray->flowDataIntI[BNumFlowCells + k];
+ float flowBPrI = pDataArray->flowDataPrI[BNumFlowCells + k];
+ flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
+ }
+
+ flowDistance /= (float) minLength;
+ //cout << flowDistance << endl;
+ ////////////////// end of calcPairwiseDist ///////////////////
+
+ if(flowDistance < 1e-6){
+ outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
+ }
+ else if(flowDistance <= pDataArray->cutoff){
+ outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << flowDistance << endl;
+ }
+ }
+ if(i % 100 == 0){
+ pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
+ pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+ pDataArray->m->mothurOutEndLine();
+ }
+ }
+
+ ofstream distFile(pDataArray->distFileName.c_str());
+ distFile << outStream.str();
+ distFile.close();
+
+ if (pDataArray->m->control_pressed) {}
+ else {
+ pDataArray->m->mothurOut(toString(pDataArray->stopSeq-1) + "\t" + toString(time(NULL) - begTime));
+ pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+ pDataArray->m->mothurOutEndLine();
+ }
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "ShhherCommand", "MyflowDistParentForkThreadFunction");
+ exit(1);
+ }
+}
+#endif
+
#endif
if(sobs != 0){
double simnum=0.0000;
- for(unsigned long int i=1;i<=maxRank;i++){
+ for(unsigned long long i=1;i<=maxRank;i++){
simnum += (double)(rank->get(i)*i*(i-1));
}
simpson = simnum / (sampled*(sampled-1));
- for(unsigned long int i=1;i<=maxRank;i++){
+ for(unsigned long long i=1;i<=maxRank;i++){
double piI = (double) i / (double)sampled;
firstTerm += rank->get(i) * pow(piI, 3);
secondTerm += rank->get(i) * pow(piI, 2);
#include "subsamplecommand.h"
#include "sharedutilities.h"
+#include "deconvolutecommand.h"
//**********************************************************************************************************************
vector<string> SubSampleCommand::setParameters(){
set<string> subset; //dont want repeat sequence names added
if (persample) {
- for (int i = 0; i < Groups.size(); i++) {
-
- //randomly select a subset of those names from this group to include in the subsample
- for (int j = 0; j < size; j++) {
-
- if (m->control_pressed) { return 0; }
+ //initialize counts
+ map<string, int> groupCounts;
+ for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
+
+ for (int j = 0; j < names.size(); j++) {
- //get random sequence to add, making sure we have not already added it
- bool done = false;
- int myrand;
- while (!done) {
- myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-
- if (subset.count(names[myrand]) == 0) {
-
- string group = groupMap->getGroup(names[myrand]);
- if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-
- if (group == Groups[i]) { subset.insert(names[myrand]); break; }
- }
- }
- }
+ if (m->control_pressed) { return 0; }
+
+ string group = groupMap->getGroup(names[j]);
+ if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+ else{
+ if (groupCounts[group] < size) { subset.insert(names[j]); }
+ }
}
}else {
//randomly select a subset of those names to include in the subsample
- for (int j = 0; j < size; j++) {
+ //since names was randomly shuffled just grab the next one
+ for (int j = 0; j < names.size(); j++) {
if (m->control_pressed) { return 0; }
- //get random sequence to add, making sure we have not already added it
- bool done = false;
- int myrand;
- while (!done) {
- myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
+ if (groupfile != "") { //if there is a groupfile given fill in group info
+ string group = groupMap->getGroup(names[j]);
+ if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
- if (subset.count(names[myrand]) == 0) {
-
- if (groupfile != "") { //if there is a groupfile given fill in group info
- string group = groupMap->getGroup(names[myrand]);
- if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-
- if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
- if (m->inUsersGroups(group, Groups)) {
- subset.insert(names[myrand]); break;
- }
- }else{
- subset.insert(names[myrand]); break;
- }
- }else{ //save everyone, group
- subset.insert(names[myrand]); break;
- }
+ if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
+ if (m->inUsersGroups(group, Groups)) {
+ subset.insert(names[j]);
+ }
+ }else{
+ subset.insert(names[j]);
}
- }
+ }else{ //save everyone, group
+ subset.insert(names[j]);
+ }
+
+ //do we have enough??
+ if (subset.size() == size) { break; }
}
}
ofstream out;
m->openOutputFile(outputFileName, out);
- outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName);
//read through fasta file outputting only the names on the subsample list
ifstream in;
m->mothurOut("[ERROR]: The subset selected contained " + toString(subset.size()) + " sequences, but I only found " + toString(count) + " of those in the fastafile."); m->mothurOutEndLine();
}
+ if (namefile != "") {
+ m->mothurOut("Deconvoluting subsampled fasta file... "); m->mothurOutEndLine();
+
+ //use unique.seqs to create new name and fastafile
+ string inputString = "fasta=" + outputFileName;
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+ m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
+
+ Command* uniqueCommand = new DeconvoluteCommand(inputString);
+ uniqueCommand->execute();
+
+ map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+
+ delete uniqueCommand;
+
+ outputTypes["name"].push_back(filenames["name"][0]); outputNames.push_back(filenames["name"][0]);
+ m->mothurRemove(outputFileName);
+ outputFileName = filenames["fasta"][0];
+
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+
+ m->mothurOut("Done."); m->mothurOutEndLine();
+ }
+
+ outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
//if a groupfile is provided read through the group file only outputting the names on the subsample list
if (groupfile != "") {
if (m->control_pressed) { delete order; out.close(); return 0; }
//get random number to sample from order between 0 and thisSize-1.
- int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
+ //don't need this because of the random shuffle above
+ //int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
- int bin = order->get(myrand);
+ int bin = order->get(j);
int abund = thislookup[i]->getAbundance(bin);
thislookup[i]->set(bin, (abund+1), thisgroup);
//randomly select a subset of those names to include in the subsample
set<string> subset; //dont want repeat sequence names added
if (persample) {
- for (int i = 0; i < Groups.size(); i++) {
+ //initialize counts
+ map<string, int> groupCounts;
+ for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
+
+ for (int j = 0; j < names.size(); j++) {
- for (int j = 0; j < size; j++) {
-
- if (m->control_pressed) { break; }
-
- //get random sequence to add, making sure we have not already added it
- bool done = false;
- int myrand;
- while (!done) {
- myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0));
-
- if (subset.count(names[myrand]) == 0) { //you are not already added
- if (groupMap->getGroup(names[myrand]) == Groups[i]) { subset.insert(names[myrand]); break; }
- }
- }
- }
+ if (m->control_pressed) { return 0; }
+
+ string group = groupMap->getGroup(names[j]);
+ if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+ else{
+ if (groupCounts[group] < size) { subset.insert(names[j]); }
+ }
}
}else{
for (int j = 0; j < size; j++) {
if (m->control_pressed) { break; }
- //get random sequence to add, making sure we have not already added it
- bool done = false;
- int myrand;
- while (!done) {
- myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0));
-
- if (subset.count(names[myrand]) == 0) { subset.insert(names[myrand]); break; }
- }
+ subset.insert(names[j]);
}
}
if (m->control_pressed) { delete order; return 0; }
- //get random number to sample from order between 0 and thisSize-1.
- int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-
- int bin = order->get(myrand);
+ int bin = order->get(j);
int abund = rabund->get(bin);
rabund->set(bin, (abund+1));
if (m->control_pressed) { delete order; return 0; }
- //get random number to sample from order between 0 and thisSize-1.
- int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-
- int bin = order->get(myrand);
+ int bin = order->get(j);
int abund = rabund->get(bin);
rabund->set(bin, (abund+1));
if (abort == true) { if (calledHelp) { return 0; } return 2; }
- command += " > ./commandScreen.output 2>&1";
- system(command.c_str());
+ //if command contains a redirect don't add the redirect
+ bool usedRedirect = false;
+ if ((command.find('>')) == string::npos) {
+ command += " > ./commandScreen.output 2>&1";
+ usedRedirect = true;
+ }
- ifstream in;
- string filename = "./commandScreen.output";
- m->openInputFile(filename, in, "no error");
+ system(command.c_str());
- string output = "";
- while(char c = in.get()){
- if(in.eof()) { break; }
- else { output += c; }
+ if (usedRedirect) {
+ ifstream in;
+ string filename = "./commandScreen.output";
+ m->openInputFile(filename, in, "no error");
+
+ string output = "";
+ while(char c = in.get()){
+ if(in.eof()) { break; }
+ else { output += c; }
+ }
+ in.close();
+
+ m->mothurOut(output); m->mothurOutEndLine();
+ m->mothurRemove(filename);
}
- in.close();
- m->mothurOut(output); m->mothurOutEndLine();
- m->mothurRemove(filename);
-
return 0;
}
#include "trimflowscommand.h"
#include "needlemanoverlap.hpp"
+
//**********************************************************************************************************************
vector<string> TrimFlowsCommand::setParameters(){
try {
outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
}
- vector<unsigned long int> flowFilePos = getFlowFileBreaks();
+ vector<unsigned long long> flowFilePos;
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ flowFilePos = getFlowFileBreaks();
for (int i = 0; i < (flowFilePos.size()-1); i++) {
lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
}
-
+ #else
+ ifstream in; m->openInputFile(flowFileName, in); in >> numFlows; in.close();
+ ///////////////////////////////////////// until I fix multiple processors for windows //////////////////
+ processors = 1;
+ ///////////////////////////////////////// until I fix multiple processors for windows //////////////////
+ if (processors == 1) {
+ lines.push_back(new linePair(0, 1000));
+ }else {
+ int numFlowLines;
+ flowFilePos = m->setFilePosEachLine(flowFileName, numFlowLines);
+ flowFilePos.erase(flowFilePos.begin() + 1); numFlowLines--;
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFlowLines / processors;
+ cout << numSeqsPerProcessor << '\t' << numFlowLines << endl;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFlowLines - i * numSeqsPerProcessor; }
+ lines.push_back(new linePair(flowFilePos[startIndex], numSeqsPerProcessor));
+ cout << flowFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
+ }
+ }
+ #endif
+
vector<vector<string> > barcodePrimerComboFileNames;
if(oligoFileName != ""){
getOligos(barcodePrimerComboFileNames);
}
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
if(processors == 1){
driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
}else{
createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames);
}
-#else
- driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
-#endif
if (m->control_pressed) { return 0; }
for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
//***************************************************************************************************************
-int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames, linePair* line){
+int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > thisBarcodePrimerComboFileNames, linePair* line){
try {
-
ofstream trimFlowFile;
m->openOutputFile(trimFlowFileName, trimFlowFile);
trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
ofstream scrapFlowFile;
m->openOutputFile(scrapFlowFileName, scrapFlowFile);
scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
-
- if(line->start == 4){
+
+ ofstream fastaFile;
+ if(fasta){ m->openOutputFile(fastaFileName, fastaFile); }
+
+ ifstream flowFile;
+ m->openInputFile(flowFileName, flowFile);
+
+ flowFile.seekg(line->start);
+
+ if(line->start == 0){
+ flowFile >> numFlows; m->gobble(flowFile);
scrapFlowFile << maxFlows << endl;
trimFlowFile << maxFlows << endl;
if(allFiles){
- for(int i=0;i<barcodePrimerComboFileNames.size();i++){
- for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
- // barcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
+ for(int i=0;i<thisBarcodePrimerComboFileNames.size();i++){
+ for(int j=0;j<thisBarcodePrimerComboFileNames[0].size();j++){
ofstream temp;
- m->openOutputFile(barcodePrimerComboFileNames[i][j], temp);
+ m->openOutputFile(thisBarcodePrimerComboFileNames[i][j], temp);
temp << maxFlows << endl;
temp.close();
}
}
FlowData flowData(numFlows, signal, noise, maxHomoP, flowOrder);
-
- ofstream fastaFile;
- if(fasta){ m->openOutputFile(fastaFileName, fastaFile); }
-
- ifstream flowFile;
- m->openInputFile(flowFileName, flowFile);
-
- flowFile.seekg(line->start);
-
+ //cout << " driver flowdata address " << &flowData << &flowFile << endl;
int count = 0;
bool moreSeqs = 1;
-
+
+ TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
+
while(moreSeqs) {
+ //cout << "driver " << count << endl;
+
+
+ if (m->control_pressed) { break; }
int success = 1;
int currentSeqDiffs = 0;
string trashCode = "";
- flowData.getNext(flowFile);
+ flowData.getNext(flowFile);
+ //cout << "driver good bit " << flowFile.good() << endl;
flowData.capFlows(maxFlows);
Sequence currSeq = flowData.getSequence();
+
if(!flowData.hasMinFlows(minFlows)){ //screen to see if sequence is of a minimum number of flows
success = 0;
trashCode += 'l';
int barcodeIndex = 0;
if(barcodes.size() != 0){
- success = stripBarcode(currSeq, barcodeIndex);
+ success = trimOligos.stripBarcode(currSeq, barcodeIndex);
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqDiffs += success; }
}
if(numFPrimers != 0){
- success = stripForward(currSeq, primerIndex);
+ success = trimOligos.stripForward(currSeq, primerIndex);
if(success > pdiffs) { trashCode += 'f'; }
else{ currentSeqDiffs += success; }
}
if (currentSeqDiffs > tdiffs) { trashCode += 't'; }
if(numRPrimers != 0){
- success = stripReverse(currSeq);
+ success = trimOligos.stripReverse(currSeq);
if(!success) { trashCode += 'r'; }
}
-
+
if(trashCode.length() == 0){
flowData.printFlows(trimFlowFile);
if(allFiles){
ofstream output;
- m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
+ m->openOutputFileAppend(thisBarcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
flowData.printFlows(output);
}
count++;
-
+ //cout << "driver" << '\t' << currSeq.getName() << endl;
//report progress
if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = flowFile.tellg();
+ unsigned long long pos = flowFile.tellg();
if ((pos == -1) || (pos >= line->end)) { break; }
#else
exit(1);
}
}
-
-//***************************************************************************************************************
-
-int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
- try {
-
- string rawSequence = seq.getUnaligned();
- int success = bdiffs + 1; //guilty until proven innocent
-
- //can you find the barcode
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
-
- success = 0;
- break;
- }
- }
-
- //if you found the barcode or if you don't want to allow for diffs
- if ((bdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
- Alignment* alignment;
- if (barcodes.size() > 0) {
- map<string,int>::iterator it=barcodes.begin();
-
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
-
- }else{ alignment = NULL; }
-
- //can you find the barcode
- int minDiff = 1e6;
- int minCount = 1;
- int minGroup = -1;
- int minPos = 0;
-
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
- // int length = oligo.length();
-
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10;
- break;
- }
-
- //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
-
- int alnLength = oligo.length();
-
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
- oligo = oligo.substr(0,alnLength);
- temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=0;i<alnLength;i++){
- if(temp[i] != '-'){
- minPos++;
- }
- }
- }
- else if(numDiff == minDiff){
- minCount++;
- }
-
- }
-
- if(minDiff > bdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
- exit(1);
- }
-
-}
-
-//***************************************************************************************************************
-
-int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
- try {
-
- string rawSequence = seq.getUnaligned();
- int success = pdiffs + 1; //guilty until proven innocent
-
- //can you find the primer
- for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
- success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
- success = 0;
- break;
- }
- }
-
- //if you found the barcode or if you don't want to allow for diffs
- if ((pdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
- Alignment* alignment;
- if (primers.size() > 0) {
- map<string,int>::iterator it=primers.begin();
-
- for(it;it!=primers.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
-
- }else{ alignment = NULL; }
-
- //can you find the barcode
- int minDiff = 1e6;
- int minCount = 1;
- int minGroup = -1;
- int minPos = 0;
-
- for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
- string oligo = it->first;
- // int length = oligo.length();
-
- if(rawSequence.length() < maxLength){
- success = pdiffs + 100;
- break;
- }
-
- //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
-
- int alnLength = oligo.length();
-
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
- oligo = oligo.substr(0,alnLength);
- temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=0;i<alnLength;i++){
- if(temp[i] != '-'){
- minPos++;
- }
- }
- }
- else if(numDiff == minDiff){
- minCount++;
- }
-
- }
-
- if(minDiff > pdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimFlowsCommand", "stripForward");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-
-bool TrimFlowsCommand::stripReverse(Sequence& seq){
- try {
-
- string rawSequence = seq.getUnaligned();
- bool success = 0; //guilty until proven innocent
-
- for(int i=0;i<numRPrimers;i++){
- string oligo = revPrimer[i];
-
- if(rawSequence.length() < oligo.length()){
- success = 0;
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
- seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
- success = 1;
- break;
- }
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimFlowsCommand", "stripReverse");
- exit(1);
- }
-}
-
-
-//***************************************************************************************************************
-
-bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
- try {
- bool success = 1;
- int length = oligo.length();
-
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
-
- if(success == 0) { break; }
- }
- else{
- success = 1;
- }
- }
-
- return success;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
- exit(1);
- }
-
-}
-
-//***************************************************************************************************************
-
-int TrimFlowsCommand::countDiffs(string oligo, string seq){
- try {
-
- int length = oligo.length();
- int countDiffs = 0;
-
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
- }
-
- }
-
- return countDiffs;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimFlowsCommand", "countDiffs");
- exit(1);
- }
-
-}
-
/**************************************************************************************************/
-
-vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
+vector<unsigned long long> TrimFlowsCommand::getFlowFileBreaks() {
try{
- vector<unsigned long int> filePos;
+ vector<unsigned long long> filePos;
filePos.push_back(0);
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (flowFileName.c_str(),"rb");
}
//estimate file breaks
- unsigned long int chunkSize = 0;
+ unsigned long long chunkSize = 0;
chunkSize = size / processors;
//file too small to divide by processors
//for each process seekg to closest file break and search for next '>' char. make that the filebreak
for (int i = 0; i < processors; i++) {
- unsigned long int spot = (i+1) * chunkSize;
+ unsigned long long spot = (i+1) * chunkSize;
ifstream in;
m->openInputFile(flowFileName, in);
string dummy = m->getline(in);
//there was not another sequence before the end of the file
- unsigned long int sanityPos = in.tellg();
+ unsigned long long sanityPos = in.tellg();
// if (sanityPos == -1) { break; }
// else { filePos.push_back(newSpot); }
m->openInputFile(flowFileName, in);
in >> numFlows;
m->gobble(in);
- unsigned long int spot = in.tellg();
- filePos[0] = spot;
+ //unsigned long long spot = in.tellg();
+ //filePos[0] = spot;
in.close();
processors = (filePos.size() - 1);
int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames){
try {
+ processIDS.clear();
+ int exitCommand = 1;
+
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
int process = 1;
- int exitCommand = 1;
- processIDS.clear();
//loop through and create all the processes you want
while (process != processors) {
int temp = processIDS[i];
wait(&temp);
}
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the trimFlowData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<trimFlowData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+ // Allocate memory for thread data.
+ string extension = "";
+ if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
+
+ vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
+ if(allFiles){
+ for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
+ for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
+ tempBarcodePrimerComboFileNames[i][j] += extension;
+ ofstream temp;
+ m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
+ temp.close();
+
+ }
+ }
+ }
+
+ trimFlowData* tempflow = new trimFlowData(flowFileName, (trimFlowFileName + extension), (scrapFlowFileName + extension), fastaFileName, flowOrder, tempBarcodePrimerComboFileNames, barcodes, primers, revPrimer, fasta, allFiles, lines[i]->start, lines[i]->end, m, signal, noise, numFlows, maxFlows, minFlows, maxHomoP, tdiffs, bdiffs, pdiffs, i);
+ pDataArray.push_back(tempflow);
+
+ //MyTrimFlowThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i] = CreateThread(NULL, 0, MyTrimFlowThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ //using the main process as a worker saves time and memory
+ ofstream temp;
+ m->openOutputFile(trimFlowFileName, temp);
+ temp.close();
+
+ m->openOutputFile(scrapFlowFileName, temp);
+ temp.close();
+
+ if(fasta){
+ m->openOutputFile(fastaFileName, temp);
+ temp.close();
+ }
+ vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
+ if(allFiles){
+ for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
+ for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
+ tempBarcodePrimerComboFileNames[i][j] += toString(processors-1) + ".temp";
+ ofstream temp;
+ m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
+ temp.close();
+
+ }
+ }
+ }
+
+ //do my part - do last piece because windows is looking for eof
+ int num = driverCreateTrim(flowFileName, (trimFlowFileName + toString(processors-1) + ".temp"), (scrapFlowFileName + toString(processors-1) + ".temp"), (fastaFileName + toString(processors-1) + ".temp"), tempBarcodePrimerComboFileNames, lines[processors-1]);
+ processIDS.push_back((processors-1));
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+
+#endif
//append files
m->mothurOutEndLine();
for(int i=0;i<processIDS.size();i++){
}
return exitCommand;
-#endif
+
}
catch(exception& e) {
m->errorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim");
#include "sequence.hpp"
#include "flowdata.h"
#include "groupmap.h"
+#include "trimoligos.h"
class TrimFlowsCommand : public Command {
public:
bool abort;
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
int comboStarts;
vector<int> processIDS; //processid
vector<linePair*> lines;
- vector<unsigned long int> getFlowFileBreaks();
+ vector<unsigned long long> getFlowFileBreaks();
int createProcessesCreateTrim(string, string, string, string, vector<vector<string> >);
int driverCreateTrim(string, string, string, string, vector<vector<string> >, linePair*);
set<string> filesToRemove;
void getOligos(vector<vector<string> >&); //a rewrite of what is in trimseqscommand.h
- int stripBarcode(Sequence&, int&); //largely redundant with trimseqscommand.h
- int stripForward(Sequence&, int&); //largely redundant with trimseqscommand.h
- bool stripReverse(Sequence&); //largely redundant with trimseqscommand.h
- bool compareDNASeq(string, string); //largely redundant with trimseqscommand.h
- int countDiffs(string, string); //largely redundant with trimseqscommand.h
bool allFiles;
int processors;
};
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+typedef struct trimFlowData {
+ string flowFileName;
+ string trimFlowFileName;
+ string scrapFlowFileName;
+ string fastaFileName;
+ string flowOrder;
+ vector<vector<string> > barcodePrimerComboFileNames;
+ map<string, int> barcodes;
+ map<string, int> primers;
+ vector<string> revPrimer;
+ bool fasta, allFiles;
+ unsigned long long start;
+ unsigned long long end;
+ MothurOut* m;
+ float signal, noise;
+ int numFlows, maxFlows, minFlows, maxHomoP, tdiffs, bdiffs, pdiffs, threadID, count;
+
+ trimFlowData(){}
+ trimFlowData(string ff, string tf, string sf, string f, string fo, vector<vector<string> > bfn, map<string, int> bar, map<string, int> pri, vector<string> rev, bool fa, bool al, unsigned long long st, unsigned long long en, MothurOut* mout, float sig, float n, int numF, int maxF, int minF, int maxH, int td, int bd, int pd, int tid) {
+ flowFileName = ff;
+ trimFlowFileName = tf;
+ scrapFlowFileName = sf;
+ fastaFileName = f;
+ flowOrder = fo;
+ barcodePrimerComboFileNames = bfn;
+ barcodes = bar;
+ primers = pri;
+ revPrimer = rev;
+ fasta = fa;
+ allFiles = al;
+ start = st;
+ end = en;
+ m = mout;
+ signal = sig;
+ noise = n;
+ numFlows = numF;
+ maxFlows = maxF;
+ minFlows = minF;
+ maxHomoP = maxH;
+ tdiffs = td;
+ bdiffs = bd;
+ pdiffs = pd;
+ threadID = tid;
+ }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyTrimFlowThreadFunction(LPVOID lpParam){
+ trimFlowData* pDataArray;
+ pDataArray = (trimFlowData*)lpParam;
+
+ try {
+ ofstream trimFlowFile;
+ pDataArray->m->openOutputFile(pDataArray->trimFlowFileName, trimFlowFile);
+ trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
+
+ ofstream scrapFlowFile;
+ pDataArray->m->openOutputFile(pDataArray->scrapFlowFileName, scrapFlowFile);
+ scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
+
+ ofstream fastaFile;
+ if(pDataArray->fasta){ pDataArray->m->openOutputFile(pDataArray->fastaFileName, fastaFile); }
+
+ ifstream flowFile;
+ pDataArray->m->openInputFile(pDataArray->flowFileName, flowFile);
+
+ flowFile.seekg(pDataArray->start);
+
+ if(pDataArray->start == 0){
+ flowFile >> pDataArray->numFlows; pDataArray->m->gobble(flowFile);
+ scrapFlowFile << pDataArray->maxFlows << endl;
+ trimFlowFile << pDataArray->maxFlows << endl;
+ if(pDataArray->allFiles){
+ for(int i=0;i<pDataArray->barcodePrimerComboFileNames.size();i++){
+ for(int j=0;j<pDataArray->barcodePrimerComboFileNames[0].size();j++){
+ ofstream temp;
+ pDataArray->m->openOutputFile(pDataArray->barcodePrimerComboFileNames[i][j], temp);
+ temp << pDataArray->maxFlows << endl;
+ temp.close();
+ }
+ }
+ }
+ }
+
+ FlowData flowData(pDataArray->numFlows, pDataArray->signal, pDataArray->noise, pDataArray->maxHomoP, pDataArray->flowOrder);
+ cout << " thread flowdata address " << &flowData << '\t' << &flowFile << endl;
+ TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer);
+
+ pDataArray->count = pDataArray->end;
+ cout << pDataArray->threadID << '\t' << pDataArray->count << endl;
+ int count = 0;
+ for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+
+ if (pDataArray->m->control_pressed) { break; }
+ cout << pDataArray->threadID << '\t' << count << endl;
+ int success = 1;
+ int currentSeqDiffs = 0;
+ string trashCode = "";
+
+ flowData.getNext(flowFile);
+ cout << "thread good bit " << flowFile.good() << endl;
+ flowData.capFlows(pDataArray->maxFlows);
+
+ Sequence currSeq = flowData.getSequence();
+ if(!flowData.hasMinFlows(pDataArray->minFlows)){ //screen to see if sequence is of a minimum number of flows
+ success = 0;
+ trashCode += 'l';
+ }
+
+ int primerIndex = 0;
+ int barcodeIndex = 0;
+
+ if(pDataArray->barcodes.size() != 0){
+ success = trimOligos.stripBarcode(currSeq, barcodeIndex);
+ if(success > pDataArray->bdiffs) { trashCode += 'b'; }
+ else{ currentSeqDiffs += success; }
+ }
+
+ if(pDataArray->primers.size() != 0){
+ success = trimOligos.stripForward(currSeq, primerIndex);
+ if(success > pDataArray->pdiffs) { trashCode += 'f'; }
+ else{ currentSeqDiffs += success; }
+ }
+
+ if (currentSeqDiffs > pDataArray->tdiffs) { trashCode += 't'; }
+
+ if(pDataArray->revPrimer.size() != 0){
+ success = trimOligos.stripReverse(currSeq);
+ if(!success) { trashCode += 'r'; }
+ }
+
+ if(trashCode.length() == 0){
+
+ flowData.printFlows(trimFlowFile);
+
+ if(pDataArray->fasta) { currSeq.printSequence(fastaFile); }
+
+ if(pDataArray->allFiles){
+ ofstream output;
+ pDataArray->m->openOutputFileAppend(pDataArray->barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
+ output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
+
+ flowData.printFlows(output);
+ output.close();
+ }
+ }
+ else{
+ flowData.printFlows(scrapFlowFile, trashCode);
+ }
+
+ count++;
+ cout << pDataArray->threadID << '\t' << currSeq.getName() << endl;
+ //report progress
+ if((count) % 10000 == 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); }
+
+ }
+ //report progress
+ if((count) % 10000 != 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); }
+
+ trimFlowFile.close();
+ scrapFlowFile.close();
+ flowFile.close();
+ if(pDataArray->fasta){ fastaFile.close(); }
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "TrimFlowsCommand", "MyTrimFlowsThreadFunction");
+ exit(1);
+ }
+}
+#endif
+
#endif
--- /dev/null
+/*
+ * trimoligos.cpp
+ * Mothur
+ *
+ * Created by westcott on 9/1/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "trimoligos.h"
+#include "alignment.hpp"
+#include "needlemanoverlap.hpp"
+
+
+/********************************************************************/
+//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
+TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
+ try {
+ m = MothurOut::getInstance();
+
+ pdiffs = p;
+ bdiffs = b;
+
+ barcodes = br;
+ primers = pr;
+ revPrimer = r;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "TrimOligos");
+ exit(1);
+ }
+}
+/********************************************************************/
+TrimOligos::~TrimOligos() {}
+//*******************************************************************/
+int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ int success = bdiffs + 1; //guilty until proven innocent
+
+ //can you find the barcode
+ for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+ string oligo = it->first;
+ if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+ group = it->second;
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
+
+ if(qual.getName() != ""){
+ qual.trimQScores(oligo.length(), -1);
+ }
+
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((bdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (barcodes.size() > 0) {
+ map<string,int>::iterator it=barcodes.begin();
+
+ for(it;it!=barcodes.end();it++){
+ if(it->first.length() > maxLength){
+ maxLength = it->first.length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minGroup = -1;
+ int minPos = 0;
+
+ for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+ string oligo = it->first;
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
+ }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minGroup = it->second;
+ minPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > bdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
+ else{ //use the best match
+ group = minGroup;
+ seq.setUnaligned(rawSequence.substr(minPos));
+
+ if(qual.getName() != ""){
+ qual.trimQScores(minPos, -1);
+ }
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripBarcode");
+ exit(1);
+ }
+
+}
+//*******************************************************************/
+int TrimOligos::stripBarcode(Sequence& seq, int& group){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ int success = bdiffs + 1; //guilty until proven innocent
+
+ //can you find the barcode
+ for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+ string oligo = it->first;
+ if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+ group = it->second;
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
+
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((bdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (barcodes.size() > 0) {
+ map<string,int>::iterator it=barcodes.begin();
+
+ for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+ if(it->first.length() > maxLength){
+ maxLength = it->first.length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minGroup = -1;
+ int minPos = 0;
+
+ for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+ string oligo = it->first;
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
+ }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minGroup = it->second;
+ minPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > bdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
+ else{ //use the best match
+ group = minGroup;
+ seq.setUnaligned(rawSequence.substr(minPos));
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripBarcode");
+ exit(1);
+ }
+
+}
+//********************************************************************/
+int TrimOligos::stripForward(Sequence& seq, int& group){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ int success = pdiffs + 1; //guilty until proven innocent
+
+ //can you find the primer
+ for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
+ string oligo = it->first;
+ if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
+ success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+ group = it->second;
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((pdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (primers.size() > 0) {
+ map<string,int>::iterator it=primers.begin();
+
+ for(it;it!=primers.end();it++){
+ if(it->first.length() > maxLength){
+ maxLength = it->first.length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minGroup = -1;
+ int minPos = 0;
+
+ for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
+ string oligo = it->first;
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){
+ success = pdiffs + 100;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
+ }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minGroup = it->second;
+ minPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > pdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
+ else{ //use the best match
+ group = minGroup;
+ seq.setUnaligned(rawSequence.substr(minPos));
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripForward");
+ exit(1);
+ }
+}
+//*******************************************************************/
+int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){
+ try {
+ string rawSequence = seq.getUnaligned();
+ int success = pdiffs + 1; //guilty until proven innocent
+
+ //can you find the primer
+ for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
+ string oligo = it->first;
+ if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
+ success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+ group = it->second;
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
+ if(qual.getName() != ""){
+ qual.trimQScores(oligo.length(), -1);
+ }
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((pdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (primers.size() > 0) {
+ map<string,int>::iterator it=primers.begin();
+
+ for(it;it!=primers.end();it++){
+ if(it->first.length() > maxLength){
+ maxLength = it->first.length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minGroup = -1;
+ int minPos = 0;
+
+ for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
+ string oligo = it->first;
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){
+ success = pdiffs + 100;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
+ }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minGroup = it->second;
+ minPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > pdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
+ else{ //use the best match
+ group = minGroup;
+ seq.setUnaligned(rawSequence.substr(minPos));
+ if(qual.getName() != ""){
+ qual.trimQScores(minPos, -1);
+ }
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripForward");
+ exit(1);
+ }
+}
+//******************************************************************/
+bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){
+ try {
+ string rawSequence = seq.getUnaligned();
+ bool success = 0; //guilty until proven innocent
+
+ for(int i=0;i<revPrimer.size();i++){
+ string oligo = revPrimer[i];
+
+ if(rawSequence.length() < oligo.length()){
+ success = 0;
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
+ seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
+ if(qual.getName() != ""){
+ qual.trimQScores(-1, rawSequence.length()-oligo.length());
+ }
+ success = 1;
+ break;
+ }
+ }
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripReverse");
+ exit(1);
+ }
+}
+//******************************************************************/
+bool TrimOligos::stripReverse(Sequence& seq){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ bool success = 0; //guilty until proven innocent
+
+ for(int i=0;i<revPrimer.size();i++){
+ string oligo = revPrimer[i];
+
+ if(rawSequence.length() < oligo.length()){
+ success = 0;
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
+ seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
+ success = 1;
+ break;
+ }
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripReverse");
+ exit(1);
+ }
+}
+
+//******************************************************************/
+bool TrimOligos::compareDNASeq(string oligo, string seq){
+ try {
+ bool success = 1;
+ int length = oligo.length();
+
+ for(int i=0;i<length;i++){
+
+ if(oligo[i] != seq[i]){
+ if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
+ else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
+ else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
+ else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
+ else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
+ else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
+ else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
+ else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
+ else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
+ else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
+ else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
+ else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
+
+ if(success == 0) { break; }
+ }
+ else{
+ success = 1;
+ }
+ }
+
+ return success;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "compareDNASeq");
+ exit(1);
+ }
+
+}
+//********************************************************************/
+int TrimOligos::countDiffs(string oligo, string seq){
+ try {
+
+ int length = oligo.length();
+ int countDiffs = 0;
+
+ for(int i=0;i<length;i++){
+
+ if(oligo[i] != seq[i]){
+ if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
+ else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
+ else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
+ else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
+ else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
+ else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
+ else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
+ else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
+ else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
+ else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
+ else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
+ else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
+ }
+
+ }
+
+ return countDiffs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "countDiffs");
+ exit(1);
+ }
+}
+/********************************************************************/
+
+
+
--- /dev/null
+#ifndef TRIMOLIGOS_H
+#define TRIMOLIGOS_H
+
+/*
+ * trimoligos.h
+ * Mothur
+ *
+ * Created by westcott on 9/1/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "sequence.hpp"
+#include "qualityscores.h"
+
+
+class TrimOligos {
+
+ public:
+ TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
+ ~TrimOligos();
+
+ int stripBarcode(Sequence&, int&);
+ int stripBarcode(Sequence&, QualityScores&, int&);
+
+ int stripForward(Sequence&, int&);
+ int stripForward(Sequence&, QualityScores&, int&);
+
+ bool stripReverse(Sequence&);
+ bool stripReverse(Sequence&, QualityScores&);
+
+
+ private:
+ int pdiffs, bdiffs;
+
+ map<string, int> barcodes;
+ map<string, int> primers;
+ vector<string> revPrimer;
+
+ MothurOut* m;
+
+ bool compareDNASeq(string, string);
+ int countDiffs(string, string);
+};
+
+#endif
+
#include "trimseqscommand.h"
#include "needlemanoverlap.hpp"
+#include "trimoligos.h"
//**********************************************************************************************************************
vector<string> TrimSeqsCommand::setParameters(){
}
}
- vector<unsigned long int> fastaFilePos;
- vector<unsigned long int> qFilePos;
+ vector<unsigned long long> fastaFilePos;
+ vector<unsigned long long> qFilePos;
setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
int count = 0;
bool moreSeqs = 1;
+ TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
while (moreSeqs) {
int primerIndex = 0;
if(barcodes.size() != 0){
- success = stripBarcode(currSeq, currQual, barcodeIndex);
+ success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
if(numFPrimers != 0){
- success = stripForward(currSeq, currQual, primerIndex);
+ success = trimOligos.stripForward(currSeq, currQual, primerIndex);
if(success > pdiffs) { trashCode += 'f'; }
else{ currentSeqsDiffs += success; }
}
if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
if(numRPrimers != 0){
- success = stripReverse(currSeq, currQual);
+ success = trimOligos.stripReverse(currSeq, currQual);
if(!success) { trashCode += 'r'; }
}
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= line->end)) { break; }
#else
/**************************************************************************************************/
-int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
+int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
//set file positions for fasta file
map<string, int>::iterator it = firstSeqNames.find(sname);
if(it != firstSeqNames.end()) { //this is the start of a new chunk
- unsigned long int pos = inQual.tellg();
+ unsigned long long pos = inQual.tellg();
qfileFilePos.push_back(pos - input.length() - 1);
firstSeqNames.erase(it);
}
//get last file position of qfile
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (qfilename.c_str(),"rb");
fastaFilePos.push_back(0); qfileFilePos.push_back(0);
//get last file position of fastafile
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (filename.c_str(),"rb");
exit(1);
}
}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
- try {
-
- string rawSequence = seq.getUnaligned();
- int success = bdiffs + 1; //guilty until proven innocent
-
- //can you find the barcode
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
-
- if(qual.getName() != ""){
- qual.trimQScores(oligo.length(), -1);
- }
-
- success = 0;
- break;
- }
- }
-
- //if you found the barcode or if you don't want to allow for diffs
- if ((bdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
- Alignment* alignment;
- if (barcodes.size() > 0) {
- map<string,int>::iterator it=barcodes.begin();
-
- for(it;it!=barcodes.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
-
- }else{ alignment = NULL; }
-
- //can you find the barcode
- int minDiff = 1e6;
- int minCount = 1;
- int minGroup = -1;
- int minPos = 0;
-
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
-// int length = oligo.length();
-
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10;
- break;
- }
-
- //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
-
- int alnLength = oligo.length();
-
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
- oligo = oligo.substr(0,alnLength);
- temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=0;i<alnLength;i++){
- if(temp[i] != '-'){
- minPos++;
- }
- }
- }
- else if(numDiff == minDiff){
- minCount++;
- }
-
- }
-
- if(minDiff > bdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
-
- if(qual.getName() != ""){
- qual.trimQScores(minPos, -1);
- }
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
- exit(1);
- }
-
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
- try {
- string rawSequence = seq.getUnaligned();
- int success = pdiffs + 1; //guilty until proven innocent
-
- //can you find the primer
- for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
- success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
- if(qual.getName() != ""){
- qual.trimQScores(oligo.length(), -1);
- }
- success = 0;
- break;
- }
- }
-
- //if you found the barcode or if you don't want to allow for diffs
- if ((pdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
- Alignment* alignment;
- if (primers.size() > 0) {
- map<string,int>::iterator it=primers.begin();
-
- for(it;it!=primers.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
-
- }else{ alignment = NULL; }
-
- //can you find the barcode
- int minDiff = 1e6;
- int minCount = 1;
- int minGroup = -1;
- int minPos = 0;
-
- for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
- string oligo = it->first;
-// int length = oligo.length();
-
- if(rawSequence.length() < maxLength){
- success = pdiffs + 100;
- break;
- }
-
- //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
-
- int alnLength = oligo.length();
-
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
- oligo = oligo.substr(0,alnLength);
- temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=0;i<alnLength;i++){
- if(temp[i] != '-'){
- minPos++;
- }
- }
- }
- else if(numDiff == minDiff){
- minCount++;
- }
-
- }
-
- if(minDiff > pdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
- if(qual.getName() != ""){
- qual.trimQScores(minPos, -1);
- }
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripForward");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-
-bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
- try {
- string rawSequence = seq.getUnaligned();
- bool success = 0; //guilty until proven innocent
-
- for(int i=0;i<numRPrimers;i++){
- string oligo = revPrimer[i];
-
- if(rawSequence.length() < oligo.length()){
- success = 0;
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
- seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
- if(qual.getName() != ""){
- qual.trimQScores(-1, rawSequence.length()-oligo.length());
- }
- success = 1;
- break;
- }
- }
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripReverse");
- exit(1);
- }
-}
-
//***************************************************************************************************************
bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
}
}
-
-//***************************************************************************************************************
-
-bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
- try {
- bool success = 1;
- int length = oligo.length();
-
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
-
- if(success == 0) { break; }
- }
- else{
- success = 1;
- }
- }
-
- return success;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
- exit(1);
- }
-
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::countDiffs(string oligo, string seq){
- try {
-
- int length = oligo.length();
- int countDiffs = 0;
-
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
- }
-
- }
-
- return countDiffs;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "countDiffs");
- exit(1);
- }
-
-}
-
//***************************************************************************************************************
GroupMap* groupMap;
struct linePair {
- unsigned long int start;
- unsigned long int end;
- linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
- int stripBarcode(Sequence&, QualityScores&, int&);
- int stripForward(Sequence&, QualityScores&, int&);
- bool stripReverse(Sequence&, QualityScores&);
-
bool keepFirstTrim(Sequence&, QualityScores&);
bool removeLastTrim(Sequence&, QualityScores&);
-
bool cullLength(Sequence&);
bool cullHomoP(Sequence&);
bool cullAmbigs(Sequence&);
- bool compareDNASeq(string, string);
- int countDiffs(string, string);
bool abort, createGroup;
string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair*, linePair*);
int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
- int setLines(string, string, vector<unsigned long int>&, vector<unsigned long int>&);
+ int setLines(string, string, vector<unsigned long long>&, vector<unsigned long long>&);
};
#endif