A70056E6156A93D000924A2D /* getotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056E5156A93D000924A2D /* getotulabelscommand.cpp */; };
A70056EB156AB6E500924A2D /* removeotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056EA156AB6E500924A2D /* removeotulabelscommand.cpp */; };
A70332B712D3A13400761E33 /* makefile in Sources */ = {isa = PBXBuildFile; fileRef = A70332B512D3A13400761E33 /* makefile */; };
+ A7128B1D16B7002A00723BE4 /* getdistscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7128B1C16B7002600723BE4 /* getdistscommand.cpp */; };
A713EBAC12DC7613000092AC /* readphylipvector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A713EBAB12DC7613000092AC /* readphylipvector.cpp */; };
A713EBED12DC7C5E000092AC /* nmdscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A713EBEC12DC7C5E000092AC /* nmdscommand.cpp */; };
A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; };
A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; };
A7496D2E167B531B00CC7D7C /* kruskalwalliscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7496D2C167B531B00CC7D7C /* kruskalwalliscommand.cpp */; };
A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; };
+ A74C06E916A9C0A9008390A3 /* primerdesigncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74C06E816A9C0A8008390A3 /* primerdesigncommand.cpp */; };
A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; };
A74D59A4159A1E2000043046 /* counttable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D59A3159A1E2000043046 /* counttable.cpp */; };
A754149714840CF7005850D1 /* summaryqualcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A754149614840CF7005850D1 /* summaryqualcommand.cpp */; };
A7A32DAA14DC43B00001D2E5 /* sortseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A32DA914DC43B00001D2E5 /* sortseqscommand.cpp */; };
A7A3C8C914D041AD00B1BFBE /* otuassociationcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A3C8C714D041AD00B1BFBE /* otuassociationcommand.cpp */; };
A7A61F2D130062E000E05B6B /* amovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A61F2C130062E000E05B6B /* amovacommand.cpp */; };
+ A7B0231516B8244C006BA09E /* removedistscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0231416B8244B006BA09E /* removedistscommand.cpp */; };
A7BF221414587886000AD524 /* myPerseus.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF221214587886000AD524 /* myPerseus.cpp */; };
A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; };
A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; };
A70056E9156AB6D400924A2D /* removeotulabelscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removeotulabelscommand.h; sourceTree = "<group>"; };
A70056EA156AB6E500924A2D /* removeotulabelscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removeotulabelscommand.cpp; sourceTree = "<group>"; };
A70332B512D3A13400761E33 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
+ A7128B1A16B7001200723BE4 /* getdistscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getdistscommand.h; sourceTree = "<group>"; };
+ A7128B1C16B7002600723BE4 /* getdistscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getdistscommand.cpp; sourceTree = "<group>"; };
A713EBAA12DC7613000092AC /* readphylipvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readphylipvector.h; sourceTree = "<group>"; };
A713EBAB12DC7613000092AC /* readphylipvector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readphylipvector.cpp; sourceTree = "<group>"; };
A713EBEB12DC7C5E000092AC /* nmdscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = nmdscommand.h; sourceTree = "<group>"; };
A7496D2D167B531B00CC7D7C /* kruskalwalliscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kruskalwalliscommand.h; sourceTree = "<group>"; };
A74A9A9D148E881E00AB5E3E /* spline.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = spline.h; sourceTree = "<group>"; };
A74A9A9E148E881E00AB5E3E /* spline.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = spline.cpp; sourceTree = "<group>"; };
+ A74C06E616A9C097008390A3 /* primerdesigncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = primerdesigncommand.h; sourceTree = "<group>"; };
+ A74C06E816A9C0A8008390A3 /* primerdesigncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = primerdesigncommand.cpp; sourceTree = "<group>"; };
A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerauchimecommand.h; sourceTree = "<group>"; };
A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerauchimecommand.cpp; sourceTree = "<group>"; };
A74D59A3159A1E2000043046 /* counttable.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = counttable.cpp; sourceTree = "<group>"; };
A7A61F2B130062E000E05B6B /* amovacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = amovacommand.h; sourceTree = "<group>"; };
A7A61F2C130062E000E05B6B /* amovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = amovacommand.cpp; sourceTree = "<group>"; };
A7AACFBA132FE008003D6C4D /* currentfile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = currentfile.h; sourceTree = "<group>"; };
+ A7B0231416B8244B006BA09E /* removedistscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removedistscommand.cpp; sourceTree = "<group>"; };
+ A7B0231716B8245D006BA09E /* removedistscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removedistscommand.h; sourceTree = "<group>"; };
A7BF221214587886000AD524 /* myPerseus.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = myPerseus.cpp; sourceTree = "<group>"; };
A7BF221314587886000AD524 /* myPerseus.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myPerseus.h; sourceTree = "<group>"; };
A7BF2230145879B2000AD524 /* chimeraperseuscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraperseuscommand.h; sourceTree = "<group>"; };
219C1DE31559BCCD004209F9 /* getcoremicrobiomecommand.cpp */,
A7FE7C3E1330EA1000F7B327 /* getcurrentcommand.h */,
A7FE7C3F1330EA1000F7B327 /* getcurrentcommand.cpp */,
+ A7128B1A16B7001200723BE4 /* getdistscommand.h */,
+ A7128B1C16B7002600723BE4 /* getdistscommand.cpp */,
A7E9B6F312D37EC400DA6239 /* getgroupcommand.h */,
A7E9B6F212D37EC400DA6239 /* getgroupcommand.cpp */,
A7E9B6F512D37EC400DA6239 /* getgroupscommand.h */,
A7E9B79512D37EC400DA6239 /* pipelinepdscommand.cpp */,
A7E9B79812D37EC400DA6239 /* preclustercommand.h */,
A7E9B79712D37EC400DA6239 /* preclustercommand.cpp */,
+ A74C06E616A9C097008390A3 /* primerdesigncommand.h */,
+ A74C06E816A9C0A8008390A3 /* primerdesigncommand.cpp */,
A7E9B7A212D37EC400DA6239 /* quitcommand.h */,
A7E9B7A112D37EC400DA6239 /* quitcommand.cpp */,
A7E9B7AC12D37EC400DA6239 /* rarefactcommand.h */,
A7E9B7AB12D37EC400DA6239 /* rarefactcommand.cpp */,
A7E9B7AF12D37EC400DA6239 /* rarefactsharedcommand.h */,
A7E9B7AE12D37EC400DA6239 /* rarefactsharedcommand.cpp */,
+ A7B0231716B8245D006BA09E /* removedistscommand.h */,
+ A7B0231416B8244B006BA09E /* removedistscommand.cpp */,
A7E9B7C412D37EC400DA6239 /* removegroupscommand.h */,
A7E9B7C312D37EC400DA6239 /* removegroupscommand.cpp */,
A7E9B7C612D37EC400DA6239 /* removelineagecommand.h */,
834D9D5C1656DEC800E7FAB9 /* regularizeddecisiontree.cpp in Sources */,
A7496D2E167B531B00CC7D7C /* kruskalwalliscommand.cpp in Sources */,
A79EEF8616971D4A0006DEC1 /* filtersharedcommand.cpp in Sources */,
+ A74C06E916A9C0A9008390A3 /* primerdesigncommand.cpp in Sources */,
+ A7128B1D16B7002A00723BE4 /* getdistscommand.cpp in Sources */,
+ A7B0231516B8244C006BA09E /* removedistscommand.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
num += pDataArray[i]->count;
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
}
- pDataArray->count = pDataArray->end;
-
AlignmentDB* templateDB = new AlignmentDB(pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->threadID);
//moved this into driver to avoid deep copies in windows paralellized version
alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);
}
- int count = 0;
+ pDataArray->count = 0;
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
if (pDataArray->m->control_pressed) { break; }
delete nast;
if (needToDeleteCopy) { delete copy; }
- count++;
+ pDataArray->count++;
}
delete candidateSeq;
//report progress
- if((count) % 100 == 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
}
//report progress
- if((count) % 100 != 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
delete alignment;
delete templateDB;
//bootstrap - to set confidenceScore
int numToSelect = queryKmers.size() / 8;
+ if (m->debug) { m->mothurOut(seq->getName() + "\t"); }
+
tax = bootstrapResults(queryKmers, index, numToSelect);
+
+ if (m->debug) { m->mothurOut("\n"); }
return tax;
}
int seqTaxIndex = tax;
TaxNode seqTax = phyloTree->get(tax);
+
while (seqTax.level != 0) { //while you are not at the root
itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
confidence = itBoot2->second;
}
+ if (m->debug) { m->mothurOut(seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");"); }
+
if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
confidenceTax = seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");" + confidenceTax;
simpleTax = seqTax.name + ";" + simpleTax;
}
-
+
seqTaxIndex = seqTax.parent;
seqTax = phyloTree->get(seqTax.parent);
}
outputTypes["models"] = tempOutNames;
outputTypes["bubble"] = tempOutNames;
outputTypes["summary"] = tempOutNames;
+ outputTypes["sabund"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "CatchAllCommand", "CatchAllCommand");
outputTypes["models"] = tempOutNames;
outputTypes["bubble"] = tempOutNames;
outputTypes["summary"] = tempOutNames;
+ outputTypes["sabund"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
catchAllTest = m->getFullPathName(catchAllTest);
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- catchAllCommandExe += "mono " + catchAllTest + " ";
+ catchAllCommandExe += "mono \"" + catchAllTest + "\" ";
#else
catchAllCommandExe += "\"" + catchAllTest + "\" ";
#endif
//create system command
string catchAllCommand = "";
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- catchAllCommand += catchAllCommandExe + filename + " " + outputPath + " 1";
+ catchAllCommand += catchAllCommandExe + "\"" + filename + "\" \"" + outputPath + "\" 1"; //filename and outputPath are each wrapped in double quotes
#else
//removes extra '\\' catchall doesnt like that
vector<string> tempNames;
//create system command
string catchAllCommand = "";
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- catchAllCommand += catchAllCommandExe + filename + " " + outputPath + " 1";
+ catchAllCommand += catchAllCommandExe + "\"" + filename + "\" \"" + outputPath + "\" 1"; //filename and outputPath are each wrapped in double quotes
#else
//removes extra '\\' catchall doesnt like that
vector<string> tempNames;
//create system command
string catchAllCommand = "";
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- catchAllCommand += catchAllCommandExe + filename + " " + outputPath + " 1";
+ catchAllCommand += catchAllCommandExe + "\"" + filename + "\" \"" + outputPath + "\" 1"; //filename and outputPath are each wrapped in double quotes
#else
//removes extra '\\' catchall doesnt like that
vector<string> tempNames;
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
num += pDataArray[i]->count;
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
num += pDataArray[i]->count;
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
}
- pDataArray->count = pDataArray->end;
-
if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
if (chimera->getUnaligned()) {
if (pDataArray->start == 0) { chimera->printHeader(out); }
- int count = 0;
+ pDataArray->count = 0;
for(int i = 0; i < pDataArray->end; i++){
if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
}
- count++;
+ pDataArray->count++;
}
delete candidateSeq;
//report progress
- if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
}
//report progress
- if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
pDataArray->numNoParents = chimera->getNumNoParents();
- if (pDataArray->numNoParents == count) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
+ if (pDataArray->numNoParents == pDataArray->count) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
out.close();
out2.close();
CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter pstrand("strand", "String", "", "", "", "", "","",false,false); parameters.push_back(pstrand);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
CommandParameter pabskew("abskew", "Number", "", "1.9", "", "", "","",false,false); parameters.push_back(pabskew);
string helpString = "";
helpString += "The chimera.uchime command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
helpString += "This command is a wrapper for uchime written by Robert C. Edgar.\n";
- helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dereplicate, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n";
+ helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dereplicate, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl, strand and queryfact.\n";
helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
helpString += "The count parameter allows you to provide a count file, if you are using template=self. \n";
maxp = validParameter.validFile(parameters, "maxp", false); if (maxp == "not found") { useMaxp = false; maxp = "2"; } else{ useMaxp = true; }
minlen = validParameter.validFile(parameters, "minlen", false); if (minlen == "not found") { useMinlen = false; minlen = "10"; } else{ useMinlen = true; }
maxlen = validParameter.validFile(parameters, "maxlen", false); if (maxlen == "not found") { useMaxlen = false; maxlen = "10000"; } else{ useMaxlen = true; }
+
+ strand = validParameter.validFile(parameters, "strand", false); if (strand == "not found") { strand = ""; }
temp = validParameter.validFile(parameters, "ucl", false); if (temp == "not found") { temp = "f"; }
ucl = m->isTrue(temp);
*tempa = '\0'; strncat(tempa, alns.c_str(), alns.length());
cPara.push_back(tempa);
}
+
+ if (strand != "") {
+ char* tempA = new char[9];
+ *tempA = '\0'; strncat(tempA, "--strand", 8);
+ cPara.push_back(tempA);
+ char* tempa = new char[strand.length()+1];
+ *tempa = '\0'; strncat(tempa, strand.c_str(), strand.length());
+ cPara.push_back(tempa);
+ }
if (useAbskew) {
char* tempskew = new char[9];
uchimeData* tempUchime = new uchimeData(outputFileName+extension, uchimeLocation, templatefile, files[i], "", "", "", accnos+extension, alns+extension, dummy, m, 0, 0, i);
tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount);
- tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
+ tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, strand);
pDataArray.push_back(tempUchime);
processIDS.push_back(i);
uchimeData* tempUchime = new uchimeData(outputFName+extension, uchimeLocation, templatefile, filename+extension, fastaFile, nameFile, groupFile, accnos+extension, alns+extension, groups, m, lines[i].start, lines[i].end, i);
tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount);
- tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
+ tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, strand);
pDataArray.push_back(tempUchime);
processIDS.push_back(i);
int createProcesses(string, string, string, string, int&);
bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount, hasName, dups;
- string fastafile, groupfile, templatefile, outputDir, namefile, countfile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation;
+ string fastafile, groupfile, templatefile, outputDir, namefile, countfile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation, strand;
int processors;
SequenceParser* sparser;
int threadID, count, numChimeras;
vector<string> groups;
bool useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount;
- string abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract;
+ string abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, strand;
uchimeData(){}
uchimeData(string o, string uloc, string t, string file, string f, string n, string g, string ac, string al, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
hasCount = hc;
}
- void setVariables(string abske, string min, string mindi, string x, string d, string xa2, string chunk, string minchun, string idsmoothwindo, string minsmoothi, string max, string minle, string maxle, string queryfrac) {
+ void setVariables(string abske, string min, string mindi, string x, string d, string xa2, string chunk, string minchun, string idsmoothwindo, string minsmoothi, string max, string minle, string maxle, string queryfrac, string stra) {
abskew = abske;
minh = min;
mindiv = mindi;
+ strand = stra;
xn = x;
dn = d;
xa = xa2;
cPara.push_back(tempa);
}
+ if (pDataArray->strand != "") {
+ char* tempA = new char[9];
+ *tempA = '\0'; strncat(tempA, "--strand", 8);
+ cPara.push_back(tempA);
+ char* tempa = new char[pDataArray->strand.length()+1];
+ *tempa = '\0'; strncat(tempa, pDataArray->strand.c_str(), pDataArray->strand.length());
+ cPara.push_back(tempa);
+ }
+
if (pDataArray->useAbskew) {
char* tempskew = new char[9];
*tempskew = '\0'; strncat(tempskew, "--abskew", 8);
ofstream out23;
pDataArray->m->openOutputFile(outputFileName, out23);
+ int fcount = 0;
while (!in23.eof()) {
if (pDataArray->m->control_pressed) { break; }
Sequence seq(in23); pDataArray->m->gobble(in23);
- if (seq.getName() != "") { seq.printSequence(out23); }
+ if (seq.getName() != "") { seq.printSequence(out23); fcount++; }
}
in23.close();
out23.close();
cPara.push_back(tempa);
}
+ if (pDataArray->strand != "") {
+ char* tempA = new char[9];
+ *tempA = '\0'; strncat(tempA, "--strand", 8);
+ cPara.push_back(tempA);
+ char* tempa = new char[pDataArray->strand.length()+1];
+ *tempa = '\0'; strncat(tempa, pDataArray->strand.c_str(), pDataArray->strand.length());
+ cPara.push_back(tempa);
+ }
+
if (pDataArray->useAbskew) {
char* tempskew = new char[9];
*tempskew = '\0'; strncat(tempskew, "--abskew", 8);
in.close();
out.close();
+ if (fcount != totalSeqs) { pDataArray->m->mothurOut("[ERROR]: process " + toString(pDataArray->threadID) + " only processed " + toString(pDataArray->count) + " of " + toString(pDataArray->end) + " sequences assigned to it, quitting. \n"); pDataArray->m->control_pressed = true; }
+
if (pDataArray->m->control_pressed) { return 0; }
pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences."); pDataArray->m->mothurOutEndLine();
pDataArray->count = totalSeqs;
pDataArray->numChimeras = numChimeras;
+
return totalSeqs;
}
for(int i=0; i < pDataArray.size(); i++){
if (pDataArray[i]->wroteAccnos) { wroteAccnos = pDataArray[i]->wroteAccnos; nonBlankAccnosFiles.push_back(outAccnos + toString(processIDS[i]) + ".temp"); }
else { m->mothurRemove((outAccnos + toString(processIDS[i]) + ".temp")); }
+ //check to make sure the process finished
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
string outFasta, outAccnos, keep;
unsigned long long start;
unsigned long long end;
- int numbases;
+ int numbases, count;
bool countGaps, Short, wroteAccnos;
MothurOut* m;
string namefile;
bool done = false;
bool wroteAccnos = false;
- int count = 0;
+ pDataArray->count = 0;
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
outAcc << seq.getName() << endl;
pDataArray->wroteAccnos = true;
}
- count++;
+ pDataArray->count++;
}
//report progress
- if((count) % 1000 == 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 1000 == 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
}
//report progress
- if((count) % 1000 != 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
in.close();
try {
if (abort == true) { if (calledHelp) { return 0; } return 2; }
- string outputMethodTag = method + ".";
+ string outputMethodTag = method;
if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); }
else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); }
else if(method == "zap"){
}
#endif
- if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine();
+ if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur reversed some your sequences for a better classification. If you would like to take a closer look, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine();
outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile);
}else { m->mothurRemove(newaccnosFile); }
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
}
- pDataArray->count = pDataArray->end;
-
//make classify
Classify* myclassify;
string outputMethodTag = pDataArray->method + ".";
if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
- int count = 0;
+ pDataArray->count = 0;
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; }
- count++;
+ pDataArray->count++;
}
delete candidateSeq;
//report progress
- if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
}
//report progress
- if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
delete myclassify;
inFASTA.close();
vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
delete split;
+ if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
+
//output a merged distance file
if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
estart = time(NULL);
-
+
if (!runCluster) {
-#ifdef USE_MPI
- }
-#endif
+
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
return 0;
}
-
+
//****************** break up files between processes and cluster each file set ******************************//
#ifdef USE_MPI
////you are process 0 from above////
#include "sffmultiplecommand.h"
#include "classifysharedcommand.h"
#include "filtersharedcommand.h"
+#include "primerdesigncommand.h"
+#include "getdistscommand.h"
+#include "removedistscommand.h"
/*******************************************************/
commands["quit"] = "MPIEnabled";
commands["classify.shared"] = "classify.shared";
commands["filter.shared"] = "filter.shared";
+ commands["primer.design"] = "primer.design";
+ commands["get.dists"] = "get.dists";
+ commands["remove.dists"] = "remove.dists";
}
else if(commandName == "sff.multiple") { command = new SffMultipleCommand(optionString); }
else if(commandName == "classify.shared") { command = new ClassifySharedCommand(optionString); }
else if(commandName == "filter.shared") { command = new FilterSharedCommand(optionString); }
+ else if(commandName == "primer.design") { command = new PrimerDesignCommand(optionString); }
+ else if(commandName == "get.dists") { command = new GetDistsCommand(optionString); }
+ else if(commandName == "remove.dists") { command = new RemoveDistsCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
else if(commandName == "sff.multiple") { pipecommand = new SffMultipleCommand(optionString); }
else if(commandName == "classify.shared") { pipecommand = new ClassifySharedCommand(optionString); }
else if(commandName == "filter.shared") { pipecommand = new FilterSharedCommand(optionString); }
+ else if(commandName == "primer.design") { pipecommand = new PrimerDesignCommand(optionString); }
+ else if(commandName == "get.dists") { pipecommand = new GetDistsCommand(optionString); }
+ else if(commandName == "remove.dists") { pipecommand = new RemoveDistsCommand(optionString); }
else { pipecommand = new NoCommand(optionString); }
return pipecommand;
else if(commandName == "sff.multiple") { shellcommand = new SffMultipleCommand(); }
else if(commandName == "classify.shared") { shellcommand = new ClassifySharedCommand(); }
else if(commandName == "filter.shared") { shellcommand = new FilterSharedCommand(); }
+ else if(commandName == "primer.design") { shellcommand = new PrimerDesignCommand(); }
+ else if(commandName == "get.dists") { shellcommand = new GetDistsCommand(); }
+ else if(commandName == "remove.dists") { shellcommand = new RemoveDistsCommand(); }
else { shellcommand = new NoCommand(); }
return shellcommand;
}
}
/***********************************************************************/
+//Writes one line per command category to out, in the form
+//"<category> commands include: <cmd1>, <cmd2>, ...". Commands whose category is "Hidden" are skipped.
+//Any exception is reported through m->errorOut and the program exits.
+void CommandFactory::printCommandsCategories(ostream& out) {
+    try {
+        map<string, string> commands = getListCommands();
+        map<string, string>::iterator it;
+
+        //category name -> comma separated list of the command names in that category
+        map<string, string> categories;
+        map<string, string>::iterator itCat;
+
+        //loop through each command collecting its name under its category
+        for (it = commands.begin(); it != commands.end(); it++) {
+
+            Command* thisCommand = getCommand(it->first);
+            string category = thisCommand->getCommandCategory(); //fetch once, used for the test and the lookup
+
+            //don't add hidden commands
+            if (category != "Hidden") {
+                itCat = categories.find(category);
+                if (itCat == categories.end()) {
+                    categories[category] = thisCommand->getCommandName();
+                }else {
+                    itCat->second += ", " + thisCommand->getCommandName();
+                }
+            }
+        }
+
+        for (itCat = categories.begin(); itCat != categories.end(); itCat++) {
+            out << itCat->first << " commands include: " << itCat->second << endl;
+        }
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CommandFactory", "printCommandsCategories");
+        exit(1);
+    }
+}
+
+/***********************************************************************/
bool isValidCommand(string);\r
bool isValidCommand(string, string);\r
void printCommands(ostream&);\r
+ void printCommandsCategories(ostream&);\r
void setOutputDirectory(string o) { outputDir = o; m->setOutputDir(o); }\r
void setInputDirectory(string i) { inputDir = i; }\r
void setLogfileName(string n, bool a) { logFileName = n; append = a; }\r
if (abort == true) { if (calledHelp) { return 0; } return 2; }
+ int start = time(NULL);
+
readFasta();
if (m->control_pressed) { return 0; }
delete input;
}
+ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the consensus sequences.");
+
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
//**********************************************************************************************************************
vector<string> CountGroupsCommand::setParameters(){
try {
- CommandParameter pshared("shared", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","",false,false,true); parameters.push_back(pshared);
- CommandParameter pgroup("group", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","",false,false,true); parameters.push_back(pgroup);
- CommandParameter pcount("count", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","",false,false,true); parameters.push_back(pcount);
+ CommandParameter pshared("shared", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","summary",false,false,true); parameters.push_back(pshared);
+ CommandParameter pgroup("group", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","summary",false,false,true); parameters.push_back(pgroup);
+ CommandParameter pcount("count", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","summary",false,false,true); parameters.push_back(pcount);
CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos);
CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
}
}
//**********************************************************************************************************************
+//Returns the output filename pattern for the requested output type.
+//Only "summary" is defined; any other type logs an error, sets m->control_pressed
+//and returns an empty pattern.
+string CountGroupsCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
+
+        if (type == "summary") { pattern = "[filename],count.summary"; }
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
+
+        return pattern;
+    }
+    catch(exception& e) {
+        //report this class as the origin of the exception
+        m->errorOut(e, "CountGroupsCommand", "getOutputPattern");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
string CountGroupsCommand::getHelpString(){
try {
string helpString = "";
try {
abort = true; calledHelp = true;
setParameters();
+ vector<string> tempOutNames;
+ outputTypes["summary"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "CountGroupsCommand", "CountGroupsCommand");
}
}
+ vector<string> tempOutNames;
+ outputTypes["summary"] = tempOutNames;
//check for required parameters
accnosfile = validParameter.validFile(parameters, "accnos", true);
if (accnosfile != "") { m->readAccnos(accnosfile, Groups); m->setGroups(Groups); }
if (groupfile != "") {
+ map<string, string> variables;
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
+ string outputFileName = getOutputFileName("summary", variables);
+ outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
+ ofstream out;
+ m->openOutputFile(outputFileName, out);
+
GroupMap groupMap(groupfile);
groupMap.readMap();
int num = groupMap.getNumSeqs(Groups[i]);
total += num;
m->mothurOut(Groups[i] + " contains " + toString(num) + "."); m->mothurOutEndLine();
+ out << Groups[i] << '\t' << num << endl;
}
-
+ out.close();
m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine();
}
if (m->control_pressed) { return 0; }
if (countfile != "") {
+ map<string, string> variables;
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(countfile); }
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(countfile));
+ string outputFileName = getOutputFileName("summary", variables);
+ outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
+ ofstream out;
+ m->openOutputFile(outputFileName, out);
+
CountTable ct;
ct.readTable(countfile);
int num = ct.getGroupCount(Groups[i]);
total += num;
m->mothurOut(Groups[i] + " contains " + toString(num) + "."); m->mothurOutEndLine();
+ out << Groups[i] << '\t' << num << endl;
}
+ out.close();
m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine();
}
InputData input(sharedfile, "sharedfile");
vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+            //name the summary file after the shared file, which is the input this branch reads
+            map<string, string> variables;
+            string thisOutputDir = outputDir;
+            if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); }
+            variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile));
+            string outputFileName = getOutputFileName("summary", variables);
+            outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
+            ofstream out;
+            m->openOutputFile(outputFileName, out);
+
             int total = 0;
             for (int i = 0; i < lookup.size(); i++) {
                 int num = lookup[i]->getNumSeqs();
                 total += num;
                 m->mothurOut(lookup[i]->getGroup() + " contains " + toString(num) + "."); m->mothurOutEndLine();
+                //write the group's row before lookup[i] is deleted
+                out << lookup[i]->getGroup() << '\t' << num << endl;
                 delete lookup[i];
             }
+            out.close();
m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine();
}
-
+
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
+
return 0;
}
string getCommandName() { return "count.groups"; }
string getCommandCategory() { return "Sequence Processing"; }
string getHelpString();
- string getOutputPattern(string){ return ""; }
+ string getOutputPattern(string);
string getCitation() { return "http://www.mothur.org/wiki/Count.groups"; }
string getDescription() { return "counts the number of sequences in each group"; }
string sharedfile, groupfile, countfile, outputDir, groups, accnosfile;
bool abort;
vector<string> Groups;
+ vector<string> outputNames;
};
#endif
string firstCol, secondCol;
in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
-
+ //cout << firstCol << '\t' << secondCol << endl;
+ m->checkName(firstCol);
+ m->checkName(secondCol);
+ //cout << firstCol << '\t' << secondCol << endl;
+
vector<string> names;
m->splitAtChar(secondCol, names, ',');
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ m->checkName(firstCol);
+ m->checkName(secondCol);
//parse names into vector
vector<string> theseNames;
m->splitAtComma(secondCol, theseNames);
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ m->checkName(firstCol);
+ m->checkName(secondCol);
//parse names into vector
vector<string> theseNames;
m->splitAtComma(secondCol, theseNames);
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ m->checkName(firstCol);
it = groupIndex.find(secondCol);
if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
groupIndex[secondCol] = count;
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ m->checkName(firstCol);
it = groupIndex.find(secondCol);
if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
groupIndex[secondCol] = count;
string firstCol, secondCol;
in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
+ m->checkName(firstCol);
+ m->checkName(secondCol);
+
vector<string> names;
m->splitAtChar(secondCol, names, ',');
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != (pDataArray[i]->endLine-pDataArray[i]->startLine)) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->endLine-pDataArray[i]->startLine) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
vector<string> Estimators;
MothurOut* m;
string output;
- int numNewFasta;
+ int numNewFasta, count;
string countends;
distanceData(){}
ofstream outFile(pDataArray->dFileName.c_str(), ios::trunc);
outFile.setf(ios::fixed, ios::showpoint);
outFile << setprecision(4);
-
+ pDataArray->count = 0;
if (pDataArray->output != "square") {
if((pDataArray->output == "lt") && (pDataArray->startLine == 0)){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
if(i % 100 == 0){
pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
}
-
+ pDataArray->count++;
}
- pDataArray->m->mothurOut(toString(pDataArray->endLine-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
+ pDataArray->m->mothurOut(toString(pDataArray->count) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
}else{
if(pDataArray->startLine == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
if(i % 100 == 0){
pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
}
-
+ pDataArray->count++;
}
- pDataArray->m->mothurOut(toString(pDataArray->endLine-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
+ pDataArray->m->mothurOut(toString(pDataArray->count) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
}
outFile.close();
if (pid == 0) {
#endif
-
+
+ if (mout->changedSeqNames) { mout->mothurOut("[WARNING]: your sequence names contained ':'. I changed them to '_' to avoid problems in your downstream analysis.\n"); }
+
mout->mothurOutEndLine();
input = getCommand();
//cout << pid << " is in execute " << commandName << endl;
#endif
//executes valid command
+ mout->changedSeqNames = false;
mout->runParse = true;
mout->clearGroups();
mout->clearAllGroups();
if (input[0] != '#') {
-
+ if (mout->changedSeqNames) { mout->mothurOut("[WARNING]: your sequence names contained ':'. I changed them to '_' to avoid problems in your downstream analysis.\n"); }
mout->mothurOutEndLine();
mout->mothurOut("mothur > " + input);
mout->mothurOutEndLine();
if ((cFactory->MPIEnabled(commandName)) || (pid == 0)) {
#endif
//executes valid command
+ mout->changedSeqNames = false;
mout->runParse = true;
mout->clearGroups();
mout->clearAllGroups();
input = getNextCommand(listOfCommands);
if (input == "") { input = "quit()"; }
+
+ if (mout->changedSeqNames) { mout->mothurOut("[WARNING]: your sequence names contained ':'. I changed them to '_' to avoid problems in your downstream analysis.\n"); }
if (mout->gui) {
if ((input.find("quit") != string::npos) || (input.find("set.logfile") != string::npos)) {}
//cout << pid << " is in execute" << endl;
#endif
//executes valid command
+ mout->changedSeqNames = false;
mout->runParse = true;
mout->clearGroups();
mout->clearAllGroups();
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
F.mergeFilter(pDataArray[i]->F.getFilter());
for (int k = 0; k < alignmentLength; k++) { F.a[k] += pDataArray[i]->F.a[k]; }
in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); \r
}\r
\r
- pDataArray->count = pDataArray->end;\r
+ pDataArray->count = 0;\r
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process\r
\r
if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; }\r
if(pDataArray->trump != '*') { pDataArray->F.doTrump(current); }\r
if(pDataArray->m->isTrue(pDataArray->vertical) || pDataArray->soft != 0) { pDataArray->F.getFreqs(current); }\r
}\r
- \r
+ pDataArray->count++;\r
//report progress\r
if((i) % 100 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }\r
}\r
in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); \r
}\r
\r
- pDataArray->count = pDataArray->end;\r
+ pDataArray->count = 0;\r
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process\r
\r
if (pDataArray->m->control_pressed) { in.close(); out.close(); pDataArray->count = 1; return 1; }\r
\r
out << '>' << seq.getName() << endl << filterSeq << endl;\r
}\r
- \r
+ pDataArray->count++;\r
//report progress\r
if((i) % 100 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }\r
}\r
for (int j = 0; j < rareCounts.size(); j++) { //add "rare" OTU to the filtered lookup
filteredLookup[j]->push_back(rareCounts[j], thislookup[j]->getGroup());
}
-
- //create new label
- string oldLastLabel = saveBinLabels[saveBinLabels.size()-1];
- string tag = "";
- string otuNumber = "";
- for (int i = 0;i < oldLastLabel.length(); i++){
- //add numbers
- if( oldLastLabel[i]>47 && oldLastLabel[i]<58){ otuNumber += oldLastLabel[i]; }
- else { tag += oldLastLabel[i]; }
- }
-
- int oldLastBin;
- m->mothurConvert(otuNumber, oldLastBin);
- oldLastBin++;
- string newLabel = tag + toString(oldLastBin);
- filteredLabels.push_back(newLabel);
+ //create label for rare OTUs
+ filteredLabels.push_back("rareOTUs");
}
}
bool FlowData::getNext(ifstream& flowFile){
try {
- flowFile >> seqName >> endFlow;
- if (seqName.length() != 0) {
- //cout << "in Flowdata " + seqName << endl;
+ seqName = getSequenceName(flowFile);
+ flowFile >> endFlow;
+ if (!m->control_pressed) {
for(int i=0;i<numFlows;i++) { flowFile >> flowData[i]; }
- //cout << "in Flowdata read " << seqName + " done" << endl;
updateEndFlow();
translateFlow();
m->gobble(flowFile);
- }else{ m->mothurOut("Error in reading your flowfile, at position " + toString(flowFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
+ }
if(flowFile){ return 1; }
else { return 0; }
}
}
+//********************************************************************************************************************
+//Reads the next whitespace-delimited token from flowFile and returns it as the sequence name.
+//Every ':' in the name is replaced with '_' and m->changedSeqNames is set when that happens.
+//A blank name logs an error and sets m->control_pressed.
+string FlowData::getSequenceName(ifstream& flowFile) {
+    try {
+        string seqLabel = "";
+        flowFile >> seqLabel;
+
+        //nothing was read: report where the file stopped making sense
+        if (seqLabel.empty()) {
+            m->mothurOut("Error in reading your flowfile, at position " + toString(flowFile.tellg()) + ". Blank name.");
+            m->mothurOutEndLine();
+            m->control_pressed = true;
+            return seqLabel;
+        }
+
+        //swap out each ':' and remember that a name was altered
+        string::size_type colonPos = seqLabel.find(':');
+        while (colonPos != string::npos) {
+            seqLabel[colonPos] = '_';
+            m->changedSeqNames = true;
+            colonPos = seqLabel.find(':', colonPos + 1);
+        }
+
+        return seqLabel;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "FlowData", "getSequenceName");
+        exit(1);
+    }
+}
//**********************************************************************************************************************
string seqName, locationString, sequence, baseFlow;
int numFlows, maxFlows, endFlow;
vector<float> flowData;
+ string getSequenceName(ifstream&);
};
#endif
--- /dev/null
+//
+// getdistscommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 1/28/13.
+// Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#include "getdistscommand.h"
+
+//**********************************************************************************************************************
+//Registers the parameters accepted by get.dists (phylip, column, accnos, inputdir, outputdir)
+//in the parameters member and returns their names in registration order.
+vector<string> GetDistsCommand::setParameters(){
+    try {
+        //distance matrix inputs
+        parameters.push_back(CommandParameter("phylip", "InputTypes", "", "", "none", "PhylipColumn", "none","phylip",false,false,true));
+        parameters.push_back(CommandParameter("column", "InputTypes", "", "", "none", "PhylipColumn", "none","column",false,false,true));
+        //list of names to select
+        parameters.push_back(CommandParameter("accnos", "InputTypes", "", "", "none", "none", "none","",false,true,true));
+        //directory overrides
+        parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "","",false,false));
+        parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "","",false,false));
+
+        vector<string> paramNames;
+        for (int p = 0; p < parameters.size(); p++) {
+            paramNames.push_back(parameters[p].name);
+        }
+        return paramNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetDistsCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Builds and returns the multi-line help text shown for the get.dists command.
+string GetDistsCommand::getHelpString(){
+    try {
+        string usage("The get.dists command selects distances from a phylip or column file related to groups or sequences listed in an accnos file.\n");
+        usage.append("The get.dists command parameters are accnos, phylip and column.\n");
+        usage.append("The get.dists command should be in the following format: get.dists(accnos=yourAccnos, phylip=yourPhylip).\n");
+        usage.append("Example get.dists(accnos=final.accnos, phylip=final.an.thetayc.0.03.lt.ave.dist).\n");
+        usage.append("Note: No spaces between parameter labels (i.e. accnos), '=' and parameters (i.e.final.accnos).\n");
+        return usage;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetDistsCommand", "getHelpString");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Returns the output filename pattern for the requested output type.
+//"phylip" and "column" share one pattern; any other type logs an error,
+//sets m->control_pressed and yields an empty pattern.
+string GetDistsCommand::getOutputPattern(string type) {
+    try {
+        const bool knownType = (type == "phylip") || (type == "column");
+
+        if (!knownType) {
+            m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n");
+            m->control_pressed = true;
+            return "";
+        }
+
+        return "[filename],pick,[extension]";
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetDistsCommand", "getOutputPattern");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Default constructor: marks the command as aborted with help called, registers its parameters
+//and creates empty "phylip" and "column" entries in outputTypes.
+GetDistsCommand::GetDistsCommand(){
+    try {
+        abort = true;
+        calledHelp = true;
+
+        setParameters();
+
+        //both output types start with no files
+        const vector<string> noFiles;
+        outputTypes["column"] = noFiles;
+        outputTypes["phylip"] = noFiles;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetDistsCommand", "GetDistsCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Builds the command from the user's option string.
+//"help" and "citation" print their text and set abort so execute() does no work.
+//Otherwise the options are parsed and validated, inputdir is prepended to path-less file names,
+//and the accnos, phylip and column files are resolved. accnos is required (the current accnos
+//file is used when none is given); when neither phylip nor column is given the current column
+//file, then the current phylip file, is used. Any failure sets abort = true.
+GetDistsCommand::GetDistsCommand(string option) {
+ try {
+ abort = false; calledHelp = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; calledHelp = true; }
+ else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+ else {
+ vector<string> myArray = setParameters();
+
+ OptionParser parser(option);
+ map<string,string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+ map<string,string>::iterator it;
+
+ //check to make sure all parameters are valid for command
+ for (it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //initialize outputTypes
+ vector<string> tempOutNames;
+ outputTypes["column"] = tempOutNames;
+ outputTypes["phylip"] = tempOutNames;
+
+ //if the user changes the output directory command factory will send this info to us in the outputdir parameter
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
+ //if the user changes the input directory command factory will send this info to us in the inputdir parameter
+ string inputDir = validParameter.validFile(parameters, "inputdir", false);
+ if (inputDir == "not found"){ inputDir = ""; }
+ else {
+ string path;
+ it = parameters.find("phylip");
+ //user has given a phylip file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["phylip"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("column");
+ //user has given a column file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["column"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("accnos");
+ //user has given an accnos file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["accnos"] = inputDir + it->second; }
+ }
+ }
+
+
+ //check for required parameters
+ accnosfile = validParameter.validFile(parameters, "accnos", true);
+ if (accnosfile == "not open") { abort = true; }
+ else if (accnosfile == "not found") {
+ accnosfile = m->getAccnosFile();
+ if (accnosfile != "") { m->mothurOut("Using " + accnosfile + " as input file for the accnos parameter."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("You have no valid accnos file and accnos is required."); m->mothurOutEndLine();
+ abort = true;
+ }
+ }else { m->setAccnosFile(accnosfile); }
+
+ phylipfile = validParameter.validFile(parameters, "phylip", true);
+ if (phylipfile == "not open") { phylipfile = ""; abort = true; }
+ else if (phylipfile == "not found") { phylipfile = ""; }
+ else { m->setPhylipFile(phylipfile); }
+
+ columnfile = validParameter.validFile(parameters, "column", true);
+ if (columnfile == "not open") { columnfile = ""; abort = true; }
+ else if (columnfile == "not found") { columnfile = ""; }
+ else { m->setColumnFile(columnfile); }
+
+ if ((phylipfile == "") && (columnfile == "")) {
+ //is there a current file available for either of these?
+ //give priority to column, then phylip
+ columnfile = m->getColumnFile();
+ if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
+ else {
+ phylipfile = m->getPhylipFile();
+ if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("No valid current files. You must provide a phylip or column file."); m->mothurOutEndLine();
+ abort = true;
+ }
+ }
+ }
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "GetDistsCommand", "GetDistsCommand");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+//Runs get.dists: reads the names from the accnos file, filters the phylip and/or column file
+//down to those names, lists the files written and makes them the current phylip/column files.
+//Returns 0 normally (including after help or an interrupt) and 2 when construction aborted
+//without help being called. Files already written are removed if the run is interrupted.
+int GetDistsCommand::execute(){
+    try {
+
+        if (abort == true) { return (calledHelp ? 0 : 2); }
+
+        //get names you want to keep
+        names = m->readAccnos(accnosfile);
+
+        if (m->control_pressed) { return 0; }
+
+        //read through the correct file and output lines you want to keep
+        if (phylipfile != "") { readPhylip(); }
+        if (columnfile != "") { readColumn(); }
+
+        //interrupted: clean up whatever was written so far
+        if (m->control_pressed) {
+            for (int f = 0; f < outputNames.size(); f++) { m->mothurRemove(outputNames[f]); }
+            return 0;
+        }
+
+        //nothing written, nothing to report
+        if (outputNames.size() == 0) { return 0; }
+
+        m->mothurOutEndLine();
+        m->mothurOut("Output File names: "); m->mothurOutEndLine();
+        for (int f = 0; f < outputNames.size(); f++) {
+            m->mothurOut(outputNames[f]);
+            m->mothurOutEndLine();
+        }
+        m->mothurOutEndLine();
+
+        //set the first phylip output as the new current phylip file
+        itTypes = outputTypes.find("phylip");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            m->setPhylipFile((itTypes->second)[0]);
+        }
+
+        //set the first column output as the new current column file
+        itTypes = outputTypes.find("column");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            m->setColumnFile((itTypes->second)[0]);
+        }
+
+        return 0;
+    }
+
+    catch(exception& e) {
+        m->errorOut(e, "GetDistsCommand", "execute");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//Writes a copy of phylipfile that keeps only the rows and columns whose names are in names.
+//The input is read twice: the first pass detects whether the matrix is square or lower triangle
+//and records the row index of every wanted name; the second pass writes the wanted rows,
+//keeping only the wanted columns. The output header is first written as names.size() and is
+//rewritten with the real count when fewer names were found in the file.
+//Returns 0, also when interrupted through m->control_pressed.
+int GetDistsCommand::readPhylip(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(phylipfile); }
+        map<string, string> variables;
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(phylipfile));
+        variables["[extension]"] = m->getExtension(phylipfile);
+        string outputFileName = getOutputFileName("phylip", variables);
+
+        ifstream in;
+        m->openInputFile(phylipfile, in);
+
+        float distance;
+        int square = 0; //stays 0 (lower triangle handling) if the file ends before the shape is detected
+        int nseqs = 0;
+        string name;
+        unsigned int row;
+        set<unsigned int> rows; //converts names in names to a index
+        row = 0;
+
+        string numTest;
+        in >> numTest >> name;
+
+        if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+        else { convert(numTest, nseqs); }
+
+        if (names.count(name) != 0) { rows.insert(row); }
+        row++;
+
+        //is the matrix square?
+        //d is an int, not a char, so that every byte value stays distinguishable from EOF
+        int d;
+        while((d=in.get()) != EOF){
+
+            if(isalnum(d)){
+                //a value follows the first name on the same line, so the first row has distances
+                square = 1;
+                in.putback(static_cast<char>(d));
+                for(int i=0;i<nseqs;i++){
+                    in >> distance;
+                }
+                break;
+            }
+            if(d == '\n'){
+                square = 0;
+                break;
+            }
+        }
+
+        //map name to row/column
+        if(square == 0){
+            for(int i=1;i<nseqs;i++){
+                in >> name;
+                if (names.count(name) != 0) { rows.insert(row); }
+                row++;
+
+                for(int j=0;j<i;j++){
+                    if (m->control_pressed) { in.close(); return 0; }
+                    in >> distance;
+                }
+            }
+        }
+        else{
+            for(int i=1;i<nseqs;i++){
+                in >> name;
+                if (names.count(name) != 0) { rows.insert(row); }
+                row++;
+                for(int j=0;j<nseqs;j++){
+                    if (m->control_pressed) { in.close(); return 0; }
+                    in >> distance;
+                }
+            }
+        }
+        in.close();
+
+        if (m->control_pressed) { return 0; }
+
+        //read through file only printing rows and columns of seqs in names
+        ifstream inPhylip;
+        m->openInputFile(phylipfile, inPhylip);
+
+        inPhylip >> numTest;
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+        outputTypes["phylip"].push_back(outputFileName); outputNames.push_back(outputFileName);
+        out << names.size() << endl;
+
+        unsigned int count = 0;
+        if(square == 0){
+            for(int i=0;i<nseqs;i++){
+                inPhylip >> name;
+                bool ignoreRow = false;
+
+                if (names.count(name) == 0) { ignoreRow = true; }
+                else{ out << name << '\t'; count++; }
+
+                for(int j=0;j<i;j++){
+                    if (m->control_pressed) { inPhylip.close(); out.close(); return 0; }
+                    inPhylip >> distance;
+                    if (!ignoreRow) {
+                        //is this a column we want
+                        if(rows.count(j) != 0) { out << distance << '\t'; }
+                    }
+                }
+                if (!ignoreRow) { out << endl; }
+            }
+        }
+        else{
+            for(int i=0;i<nseqs;i++){
+                inPhylip >> name;
+
+                bool ignoreRow = false;
+
+                if (names.count(name) == 0) { ignoreRow = true; }
+                else{ out << name << '\t'; count++; }
+
+                for(int j=0;j<nseqs;j++){
+                    if (m->control_pressed) { inPhylip.close(); out.close(); return 0; }
+                    inPhylip >> distance;
+                    if (!ignoreRow) {
+                        //is this a column we want
+                        if(rows.count(j) != 0) { out << distance << '\t'; }
+                    }
+                }
+                if (!ignoreRow) { out << endl; }
+            }
+        }
+        inPhylip.close();
+        out.close();
+
+        if (count == 0) { m->mothurOut("Your file does NOT contain distances related to groups or sequences listed in the accnos file."); m->mothurOutEndLine(); }
+        else if (count != names.size()) {
+            m->mothurOut("[WARNING]: Your accnos file contains " + toString(names.size()) + " groups or sequences, but I only found " + toString(count) + " of them in the phylip file."); m->mothurOutEndLine();
+            //rewrite with new number
+            m->renameFile(outputFileName, outputFileName+".temp");
+            ofstream out2;
+            m->openOutputFile(outputFileName, out2);
+            out2 << count << endl;
+
+            //skip the old header, then copy the remainder of the temp file unchanged
+            ifstream in3;
+            m->openInputFile(outputFileName+".temp", in3);
+            in3 >> nseqs; m->gobble(in3);
+            char buffer[4096];
+            while (!in3.eof()) {
+                in3.read(buffer, 4096);
+                out2.write(buffer, in3.gcount());
+            }
+            in3.close();
+            out2.close();
+            m->mothurRemove(outputFileName+".temp");
+        }
+
+        m->mothurOut("Selected " + toString(count) + " groups or sequences from your phylip file."); m->mothurOutEndLine();
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetDistsCommand", "readPhylip");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Copies from the column-formatted distance file (columnfile) every record whose two
+// names are both present in the member set `names`, writing the kept records to the
+// "column" output file built from the output pattern.
+// Each record is expected to be "name1 name2 distance" separated by whitespace.
+// Reports how many of the requested names were actually seen in kept records.
+// Returns 0 in all cases; a malformed record raises m->control_pressed (the same
+// error signalling the command's other [ERROR] paths use) after an error message.
+int GetDistsCommand::readColumn(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(columnfile); }
+        map<string, string> variables;
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(columnfile));
+        variables["[extension]"] = m->getExtension(columnfile);
+        string outputFileName = getOutputFileName("column", variables);
+        outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(columnfile, in);
+
+        set<string> foundNames;
+        string firstName, secondName;
+        float distance = 0.0;
+        while (!in.eof()) {
+
+            if (m->control_pressed) { out.close(); in.close(); return 0; }
+
+            //cleared so that, after a failed read, a non-empty firstName means a record was started
+            firstName = "";
+            in >> firstName >> secondName >> distance;
+
+            //a failed extraction never sets eofbit on its own for bad text, so looping on eof()
+            //alone would spin forever on a malformed record - stop here instead
+            if (in.fail()) {
+                if (firstName != "") {
+                    m->mothurOut("[ERROR]: Unable to read a complete 'name1 name2 distance' record from your column file near '" + firstName + "', quitting."); m->mothurOutEndLine();
+                    m->control_pressed = true;
+                    out.close(); in.close();
+                    return 0;
+                }
+                break; //nothing left but whitespace (or an empty file)
+            }
+
+            m->gobble(in);
+
+            //are both names in the accnos file
+            if ((names.count(firstName) != 0) && (names.count(secondName) != 0)) {
+                out << firstName << '\t' << secondName << '\t' << distance << endl;
+                foundNames.insert(firstName);
+                foundNames.insert(secondName);
+            }
+        }
+        in.close();
+        out.close();
+
+        if (foundNames.size() == 0) { m->mothurOut("Your file does NOT contain distances related to groups or sequences listed in the accnos file."); m->mothurOutEndLine(); }
+        else if (foundNames.size() != names.size()) {
+            m->mothurOut("[WARNING]: Your accnos file contains " + toString(names.size()) + " groups or sequences, but I only found " + toString(foundNames.size()) + " of them in the column file."); m->mothurOutEndLine();
+        }
+
+        m->mothurOut("Selected " + toString(foundNames.size()) + " groups or sequences from your column file."); m->mothurOutEndLine();
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetDistsCommand", "readColumn");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
+
--- /dev/null
+//
+// getdistscommand.h
+// Mothur
+//
+// Created by Sarah Westcott on 1/28/13.
+// Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#ifndef Mothur_getdistscommand_h
+#define Mothur_getdistscommand_h
+
+#include "command.hpp"
+
+//Command object for "get.dists": gets distances from a phylip or column file that are
+//related to the groups or sequences listed in an accnos file.
+class GetDistsCommand : public Command {
+
+public:
+
+    //construction and destruction
+    GetDistsCommand(string);
+    GetDistsCommand();
+    ~GetDistsCommand(){}
+
+    //identification of the command
+    vector<string> setParameters();
+    string getCommandName()
+    {
+        return "get.dists";
+    }
+    string getCommandCategory()
+    {
+        return "General";
+    }
+
+    //user facing documentation
+    string getHelpString();
+    string getOutputPattern(string);
+    string getCitation()
+    {
+        return "http://www.mothur.org/wiki/Get.dists";
+    }
+    string getDescription()
+    {
+        return "gets distances from a phylip or column file related to groups or sequences listed in an accnos file";
+    }
+
+    //running the command
+    int execute();
+    void help()
+    {
+        m->mothurOut(getHelpString());
+    }
+
+
+private:
+    set<string> names;                                    //names tested against each distance record
+    string accnosfile, phylipfile, columnfile, outputDir; //input file names and output directory
+    bool abort;
+    vector<string> outputNames;                           //every output file created by this command
+
+    //one reader per supported distance file format
+    int readPhylip();
+    int readColumn();
+
+};
+
+
+#endif
try {
CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,true, true); parameters.push_back(paccnos);
CommandParameter pconstaxonomy("constaxonomy", "InputTypes", "", "", "none", "FNGLT", "none","constaxonomy",false,false, true); parameters.push_back(pconstaxonomy);
+ CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none","list",false,false, true); parameters.push_back(plist);
+ CommandParameter pshared("shared", "InputTypes", "", "", "none", "FNGLT", "none","shared",false,false, true); parameters.push_back(pshared);
CommandParameter potucorr("otucorr", "InputTypes", "", "", "none", "FNGLT", "none","otucorr",false,false, true); parameters.push_back(potucorr);
CommandParameter pcorraxes("corraxes", "InputTypes", "", "", "none", "FNGLT", "none","corraxes",false,false, true); parameters.push_back(pcorraxes);
+ CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
string GetOtuLabelsCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The get.otulabels command can be used to select specific otus with the output from classify.otu, otu.association, or corr.axes.\n";
- helpString += "The get.otulabels parameters are: constaxonomy, otucorr, corraxes, and accnos.\n";
+ helpString += "The get.otulabels command can be used to select specific otus with the output from classify.otu, otu.association, or corr.axes commands. It can also be used to select a set of otus from a shared or list file.\n";
+ helpString += "The get.otulabels parameters are: constaxonomy, otucorr, corraxes, shared, list, label and accnos.\n";
helpString += "The constaxonomy parameter is used to input the results of the classify.otu command.\n";
helpString += "The otucorr parameter is used to input the results of the otu.association command.\n";
helpString += "The corraxes parameter is used to input the results of the corr.axes command.\n";
+ helpString += "The label parameter is used to analyze specific labels in your input. \n";
helpString += "The get.otulabels commmand should be in the following format: \n";
helpString += "get.otulabels(accnos=yourListOfOTULabels, corraxes=yourCorrAxesFile)\n";
return helpString;
try {
string pattern = "";
- if (type == "constaxonomy") { pattern = "[filename],pick,[extension]"; }
- else if (type == "otucorr") { pattern = "[filename],pick,[extension]"; }
+ if (type == "constaxonomy") { pattern = "[filename],pick,[extension]"; }
+ else if (type == "otucorr") { pattern = "[filename],pick,[extension]"; }
else if (type == "corraxes") { pattern = "[filename],pick,[extension]"; }
+ else if (type == "list") { pattern = "[filename],[distance],pick,[extension]"; }
+ else if (type == "shared") { pattern = "[filename],[distance],pick,[extension]"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
outputTypes["constaxonomy"] = tempOutNames;
outputTypes["otucorr"] = tempOutNames;
outputTypes["corraxes"] = tempOutNames;
+ outputTypes["shared"] = tempOutNames;
+ outputTypes["list"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "GetOtuLabelsCommand", "GetOtuLabelsCommand");
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["otucorr"] = inputDir + it->second; }
}
+
+ it = parameters.find("list");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["list"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("shared");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["shared"] = inputDir + it->second; }
+ }
}
vector<string> tempOutNames;
outputTypes["constaxonomy"] = tempOutNames;
outputTypes["otucorr"] = tempOutNames;
outputTypes["corraxes"] = tempOutNames;
+ outputTypes["shared"] = tempOutNames;
+ outputTypes["list"] = tempOutNames;
//check for parameters
accnosfile = validParameter.validFile(parameters, "accnos", true);
otucorrfile = validParameter.validFile(parameters, "otucorr", true);
if (otucorrfile == "not open") { otucorrfile = ""; abort = true; }
else if (otucorrfile == "not found") { otucorrfile = ""; }
-
+
+ listfile = validParameter.validFile(parameters, "list", true);
+ if (listfile == "not open") { listfile = ""; abort = true; }
+ else if (listfile == "not found") { listfile = ""; }
+ else { m->setListFile(listfile); }
+
+ sharedfile = validParameter.validFile(parameters, "shared", true);
+ if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+ else if (sharedfile == "not found") { sharedfile = ""; }
+ else { m->setSharedFile(sharedfile); }
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
- if ((constaxonomyfile == "") && (corraxesfile == "") && (otucorrfile == "")) { m->mothurOut("You must provide one of the following: constaxonomy, corraxes or otucorr."); m->mothurOutEndLine(); abort = true; }
+ if ((constaxonomyfile == "") && (corraxesfile == "") && (otucorrfile == "") && (sharedfile == "") && (listfile == "")) { m->mothurOut("You must provide one of the following: constaxonomy, corraxes, otucorr, shared or list."); m->mothurOutEndLine(); abort = true; }
+
+ if ((sharedfile != "") || (listfile != "")) {
+ label = validParameter.validFile(parameters, "label", false);
+ if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your inputfile."); m->mothurOutEndLine(); label=""; }
+ }
}
}
if (constaxonomyfile != "") { readClassifyOtu(); }
if (corraxesfile != "") { readCorrAxes(); }
if (otucorrfile != "") { readOtuAssociation(); }
+ if (listfile != "") { readList(); }
+ if (sharedfile != "") { readShared(); }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
m->mothurOutEndLine();
+ string current = "";
+ itTypes = outputTypes.find("list");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
+ }
+
+ itTypes = outputTypes.find("shared");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); }
+ }
+
return 0;
}
catch(exception& e) {
}
}
//**********************************************************************************************************************
+// Writes a copy of the shared file that keeps only the OTUs whose bin label is in the
+// member set `labels`. getShared() fills the member `lookup` first; this function frees
+// it. m->currentBinLabels is replaced by the labels of the kept OTUs before the headers
+// are printed. Always returns 0.
+int GetOtuLabelsCommand::readShared(){
+    try {
+
+        getShared();
+
+        if (m->control_pressed) {
+            for (int g = 0; g < lookup.size(); g++) { delete lookup[g]; }
+            return 0;
+        }
+
+        //one empty vector per group, carrying over that group's label and name
+        vector<SharedRAbundVector*> picked;
+        for (int g = 0; g < lookup.size(); g++) {
+            SharedRAbundVector* fresh = new SharedRAbundVector();
+            fresh->setLabel(lookup[g]->getLabel());
+            fresh->setGroup(lookup[g]->getGroup());
+            picked.push_back(fresh);
+        }
+
+        vector<string> keptLabels;
+        int numSelected = 0;
+        for (int bin = 0; bin < lookup[0]->getNumBins(); bin++) {
+
+            if (m->control_pressed) {
+                for (int g = 0; g < picked.size(); g++) { delete picked[g]; }
+                for (int g = 0; g < lookup.size(); g++) { delete lookup[g]; }
+                return 0;
+            }
+
+            //skip otus that are not on the list
+            if (labels.count(m->currentBinLabels[bin]) == 0) { continue; }
+
+            numSelected++;
+            keptLabels.push_back(m->currentBinLabels[bin]);
+
+            //copy this otu's abundance for every group
+            for (int g = 0; g < picked.size(); g++) {
+                picked[g]->push_back(lookup[g]->getAbundance(bin), lookup[g]->getGroup());
+            }
+        }
+
+        //build the output file name from the shared file and the label used
+        map<string, string> variables;
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); }
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile));
+        variables["[extension]"] = m->getExtension(sharedfile);
+        variables["[distance]"] = lookup[0]->getLabel();
+        string outputFileName = getOutputFileName("shared", variables);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+        outputTypes["shared"].push_back(outputFileName);
+        outputNames.push_back(outputFileName);
+
+        //the unfiltered data is no longer needed
+        for (int g = 0; g < lookup.size(); g++) { delete lookup[g]; }
+
+        //must be set before the headers are printed
+        m->currentBinLabels = keptLabels;
+
+        picked[0]->printHeaders(out);
+
+        for (int g = 0; g < picked.size(); g++) {
+            out << picked[g]->getLabel() << '\t' << picked[g]->getGroup() << '\t';
+            picked[g]->print(out);
+        }
+        out.close();
+
+        for (int g = 0; g < picked.size(); g++) { delete picked[g]; }
+
+        if (numSelected == 0) {
+            m->mothurOut("Your file does not contain any OTUs from the .accnos file."); m->mothurOutEndLine();
+        }
+
+        m->mothurOut("Selected " + toString(numSelected) + " OTUs from your shared file."); m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetOtuLabelsCommand", "readShared");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Writes a copy of the list file that keeps only the bins whose generated label
+// ("Otu" + bin number, zero padded to the width of the bin count) is in the member set
+// `labels`. getListVector() fills the member `list` first; this function frees it.
+// Always returns 0.
+int GetOtuLabelsCommand::readList(){
+    try {
+        getListVector();
+
+        if (m->control_pressed) { delete list; return 0; }
+
+        ListVector picked;
+        picked.setLabel(list->getLabel());
+        int numSelected = 0;
+        string totalBins = toString(list->getNumBins());
+
+        for (int bin = 0; bin < list->getNumBins(); bin++) {
+
+            if (m->control_pressed) { delete list; return 0; }
+
+            //build the label for this otu, left padding the number with zeros
+            string binNumber = toString(bin+1);
+            string otuLabel = "Otu";
+            if (binNumber.length() < totalBins.length()) {
+                otuLabel += string(totalBins.length() - binNumber.length(), '0');
+            }
+            otuLabel += binNumber;
+
+            //skip otus that are not on the list
+            if (labels.count(otuLabel) == 0) { continue; }
+
+            numSelected++;
+            picked.push_back(list->get(bin));
+        }
+
+        //build the output file name from the list file and the label used
+        map<string, string> variables;
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(listfile); }
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(listfile));
+        variables["[extension]"] = m->getExtension(listfile);
+        variables["[distance]"] = list->getLabel();
+        string outputFileName = getOutputFileName("list", variables);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        delete list;
+
+        //only print the new listvector when it holds at least one bin
+        bool wroteSomething = (picked.getNumBins() != 0);
+        if (wroteSomething) { picked.print(out); }
+        out.close();
+
+        if (!wroteSomething) {
+            m->mothurOut("Your file does not contain any OTUs from the .accnos file."); m->mothurOutEndLine();
+        }
+        outputNames.push_back(outputFileName);
+        outputTypes["list"].push_back(outputFileName);
+
+        m->mothurOut("Selected " + toString(numSelected) + " OTUs from your list file."); m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetOtuLabelsCommand", "readList");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Loads into the member `list` the list vector that readList() will filter.
+// With no label requested, the first vector in the list file is used and `label` is
+// set to its label. Otherwise the file is scanned for the requested label; when the
+// file does not contain it, an earlier label is used instead (see comments below).
+// Always returns 0. `list` is owned by the caller afterwards and may be NULL.
+int GetOtuLabelsCommand::getListVector(){
+    try {
+        InputData input(listfile, "list");
+        list = input.getListVector();
+
+        //the read can come back NULL (the loop below tests for it), so do not dereference blindly
+        if (list == NULL) {
+            m->mothurOut("[ERROR]: Unable to read a list vector from " + listfile + "."); m->mothurOutEndLine();
+            m->control_pressed = true;
+            return 0;
+        }
+
+        string lastLabel = list->getLabel();
+
+        if (label == "") { label = lastLabel; return 0; }
+
+        //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+        //named wantedLabels so it does not shadow the member `labels` (the otu labels to select)
+        set<string> wantedLabels; wantedLabels.insert(label);
+        set<string> processedLabels;
+        set<string> userLabels = wantedLabels;
+
+        //as long as you are not at the end of the file or done with the lines you want
+        while((list != NULL) && (userLabels.size() != 0)) {
+            if (m->control_pressed) { return 0; }
+
+            if(wantedLabels.count(list->getLabel()) == 1){
+                processedLabels.insert(list->getLabel());
+                userLabels.erase(list->getLabel());
+                break;
+            }
+
+            if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                delete list;
+                list = input.getListVector(lastLabel);
+
+                processedLabels.insert(list->getLabel());
+                userLabels.erase(list->getLabel());
+
+                //this vector is the one handed to readList(), so it must keep its own label;
+                //relabelling it with the label of the line we had moved on to would put the
+                //wrong distance in the output file name and in the written list
+                break;
+            }
+
+            lastLabel = list->getLabel();
+
+            //get next line to process
+            //prevent memory leak
+            delete list;
+            list = input.getListVector();
+        }
+
+
+        if (m->control_pressed) { return 0; }
+
+        //output error messages about any remaining user labels
+        set<string>::iterator it;
+        bool needToRun = false;
+        for (it = userLabels.begin(); it != userLabels.end(); it++) {
+            m->mothurOut("Your file does not include the label " + *it);
+            if (processedLabels.count(lastLabel) != 1) {
+                m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+                needToRun = true;
+            }else {
+                m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+            }
+        }
+
+        //run last label if you need to
+        if (needToRun == true) {
+            delete list;
+            list = input.getListVector(lastLabel);
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetOtuLabelsCommand", "getListVector");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Loads into the member `lookup` the per-group abundance vectors that readShared()
+// will filter. With no label requested, the first set in the shared file is used and
+// `label` is set to its label. Otherwise the file is scanned for the requested label;
+// when the file does not contain it, an earlier label is used instead (see comments
+// below). Always returns 0. The vectors in `lookup` are owned by the caller afterwards.
+int GetOtuLabelsCommand::getShared(){
+    try {
+        InputData input(sharedfile, "sharedfile");
+        lookup = input.getSharedRAbundVectors();
+
+        //the read can come back without usable data (the loop below tests lookup[0] for NULL)
+        if (lookup.empty() || (lookup[0] == NULL)) {
+            m->mothurOut("[ERROR]: Unable to read any data from " + sharedfile + "."); m->mothurOutEndLine();
+            m->control_pressed = true;
+            return 0;
+        }
+
+        string lastLabel = lookup[0]->getLabel();
+
+        if (label == "") { label = lastLabel; return 0; }
+
+        //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+        //named wantedLabels so it does not shadow the member `labels` (the otu labels to select)
+        set<string> wantedLabels; wantedLabels.insert(label);
+        set<string> processedLabels;
+        set<string> userLabels = wantedLabels;
+
+        //as long as you are not at the end of the file or done with the lines you want
+        while((lookup[0] != NULL) && (userLabels.size() != 0)) {
+            if (m->control_pressed) { return 0; }
+
+            if(wantedLabels.count(lookup[0]->getLabel()) == 1){
+                processedLabels.insert(lookup[0]->getLabel());
+                userLabels.erase(lookup[0]->getLabel());
+                break;
+            }
+
+            if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+                lookup = input.getSharedRAbundVectors(lastLabel);
+
+                processedLabels.insert(lookup[0]->getLabel());
+                userLabels.erase(lookup[0]->getLabel());
+
+                //these vectors are the ones handed to readShared(), so they must keep their own
+                //label; relabelling only lookup[0] with the label of the line we had moved on to
+                //would give the groups in the written shared file different labels
+                break;
+            }
+
+            lastLabel = lookup[0]->getLabel();
+
+            //get next line to process
+            //prevent memory leak
+            for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+            lookup = input.getSharedRAbundVectors();
+        }
+
+
+        if (m->control_pressed) { return 0; }
+
+        //output error messages about any remaining user labels
+        set<string>::iterator it;
+        bool needToRun = false;
+        for (it = userLabels.begin(); it != userLabels.end(); it++) {
+            m->mothurOut("Your file does not include the label " + *it);
+            if (processedLabels.count(lastLabel) != 1) {
+                m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+                needToRun = true;
+            }else {
+                m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+            }
+        }
+
+        //run last label if you need to
+        if (needToRun == true) {
+            for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
+            lookup = input.getSharedRAbundVectors(lastLabel);
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetOtuLabelsCommand", "getShared");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
#include "command.hpp"
+#include "inputdata.h"
+#include "listvector.hpp"
+#include "sharedrabundvector.h"
/**************************************************************************************************/
private:
bool abort;
- string outputDir, accnosfile, constaxonomyfile, otucorrfile, corraxesfile;
+ string outputDir, accnosfile, constaxonomyfile, otucorrfile, corraxesfile, listfile, sharedfile, label;
vector<string> outputNames;
set<string> labels;
+ ListVector* list;
+ vector<SharedRAbundVector*> lookup;
int readClassifyOtu();
int readOtuAssociation();
int readCorrAxes();
+ int readList();
+ int readShared();
+ int getListVector();
+ int getShared();
};
/**************************************************************************************************/
/************************************************************/
GroupMap::~GroupMap(){}
-
/************************************************************/
int GroupMap::readMap() {
try {
setNamesOfGroups(seqGroup);
if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
+ m->checkName(seqName);
it = groupmap.find(seqName);
if (it != groupmap.end()) { error = 1; m->mothurOut("Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
setNamesOfGroups(seqGroup);
if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
-
+ m->checkName(seqName);
it = groupmap.find(seqName);
if (it != groupmap.end()) { error = 1; m->mothurOut("Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
setNamesOfGroups(seqGroup);
if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
-
+ m->checkName(seqName);
it = groupmap.find(seqName);
if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
setNamesOfGroups(seqGroup);
if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
-
+ m->checkName(seqName);
it = groupmap.find(seqName);
if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
setNamesOfGroups(seqGroup);
if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
-
+ m->checkName(seqName);
it = groupmap.find(seqName);
if (it != groupmap.end()) { error = 1; m->mothurOut("Your group file contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
setNamesOfGroups(seqGroup);
if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
-
+ m->checkName(seqName);
it = groupmap.find(seqName);
if (it != groupmap.end()) { error = 1; m->mothurOut("Your group file contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
setNamesOfGroups(seqGroup);
if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
-
+ m->checkName(seqName);
it = groupmap.find(seqName);
if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
setNamesOfGroups(seqGroup);
if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
-
+ m->checkName(seqName);
it = groupmap.find(seqName);
if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
void GroupMap::setGroup(string sequenceName, string groupN) {
setNamesOfGroups(groupN);
-
+ m->checkName(sequenceName);
it = groupmap.find(sequenceName);
if (it != groupmap.end()) { m->mothurOut("Your groupfile contains more than 1 sequence named " + sequenceName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); }
try {
CommandParameter pshared("shared", "InputTypes", "", "", "SharedRel", "SharedRel", "none","otulabels",false,false,true); parameters.push_back(pshared);
CommandParameter prelabund("relabund", "InputTypes", "", "", "SharedRel", "SharedRel", "none","otulabels",false,false); parameters.push_back(prelabund);
+ CommandParameter plist("list", "InputTypes", "", "", "SharedRel", "SharedRel", "none","otulabels",false,false); parameters.push_back(plist);
CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
//every command must have inputdir and outputdir. This allows mothur users to redirect input and output files.
string ListOtuLabelsCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The list.otulabels lists otu labels from shared or relabund file. The results can be used by the get.otulabels to select specific otus with the output from classify.otu, otu.association, or corr.axes.\n";
+ helpString += "The list.otulabels lists otu labels from shared, relabund or list file. The results can be used by the get.otulabels to select specific otus with the output from classify.otu, otu.association, or corr.axes.\n";
helpString += "The list.otulabels parameters are: shared, relabund, label and groups.\n";
helpString += "The label parameter is used to analyze specific labels in your input.\n";
helpString += "The groups parameter allows you to specify which of the groups you would like analyzed.\n";
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["shared"] = inputDir + it->second; }
}
+
+ it = parameters.find("list");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["list"] = inputDir + it->second; }
+ }
}
vector<string> tempOutNames;
else if (relabundfile == "not found") { relabundfile = ""; }
else { inputFileName = relabundfile; format = "relabund"; m->setRelAbundFile(relabundfile); }
- if ((relabundfile == "") && (sharedfile == "")) {
+ listfile = validParameter.validFile(parameters, "list", true);
+ if (listfile == "not open") { abort = true; }
+ else if (listfile == "not found") { listfile = ""; }
+ else { inputFileName = listfile; format = "list"; m->setListFile(listfile); }
+
+
+ if ((relabundfile == "") && (sharedfile == "") && (listfile== "")) {
//is there are current file available for either of these?
//give priority to shared, then relabund
sharedfile = m->getSharedFile();
relabundfile = m->getRelAbundFile();
if (relabundfile != "") { inputFileName = relabundfile; format="relabund"; m->mothurOut("Using " + relabundfile + " as input file for the relabund parameter."); m->mothurOutEndLine(); }
else {
- m->mothurOut("No valid current files. You must provide a shared or relabund."); m->mothurOutEndLine();
- abort = true;
+ listfile = m->getListFile();
+ if (listfile != "") { inputFileName = listfile; format="list"; m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
+ else {
+ m->mothurOut("No valid current files. You must provide a shared, list or relabund."); m->mothurOutEndLine();
+ abort = true;
+ }
}
}
}
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
}
- }else {
+ }else if (format == "sharedfile") {
vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
string lastLabel = lookup[0]->getLabel();
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
}
+ }else {
+ ListVector* list = input.getListVector();
+ string lastLabel = list->getLabel();
+
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ set<string> processedLabels;
+ set<string> userLabels = labels;
+
+ //as long as you are not at the end of the file or done wih the lines you want
+ while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+
+ if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ if(allLines == 1 || labels.count(list->getLabel()) == 1){
+
+ m->mothurOut(list->getLabel()); m->mothurOutEndLine();
+
+ createList(list);
+
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
+ }
+
+ if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = list->getLabel();
+
+ delete list;
+ list = input.getListVector(lastLabel);
+ m->mothurOut(list->getLabel()); m->mothurOutEndLine();
+
+ createList(list);
+
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
+
+ //restore real lastlabel to save below
+ list->setLabel(saveLabel);
+ }
+
+ lastLabel = list->getLabel();
+ //prevent memory leak
+ delete list; list = NULL;
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ //get next line to process
+ list = input.getListVector();
+ }
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) {
+ m->mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(lastLabel) != 1) {
+ m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+ needToRun = true;
+ }else {
+ m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+ }
+ }
+
+ //run last label if you need to
+ if (needToRun == true) {
+ delete list;
+ list = input.getListVector(lastLabel);
+
+ m->mothurOut(list->getLabel()); m->mothurOutEndLine();
+
+ createList(list);
+
+ delete list;
+ }
}
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
return 0;
}
catch(exception& e) {
- m->errorOut(e, "ListOtuLabelsCommand", "createTable");
+ m->errorOut(e, "ListOtuLabelsCommand", "createList");
exit(1);
}
}
return 0;
}
catch(exception& e) {
- m->errorOut(e, "ListOtuLabelsCommand", "createTable");
+ m->errorOut(e, "ListOtuLabelsCommand", "createList");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+// Writes one generated otu label per bin of the given list vector to a new output file.
+// Labels are "Otu" followed by the 1-based bin number, zero padded to the width of the
+// bin count. The file is recorded in outputNames and under the "accnos" output type.
+// Always returns 0.
+int ListOtuLabelsCommand::createList(ListVector*& list){
+    try {
+        //build the output file name from the input file and this vector's label
+        map<string, string> variables;
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inputFileName));
+        variables["[distance]"] = list->getLabel();
+        string outputFileName = getOutputFileName("otulabels",variables);
+        outputNames.push_back(outputFileName);
+        outputTypes["accnos"].push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        string totalBins = toString(list->getNumBins());
+        for (int bin = 0; bin < list->getNumBins(); bin++) {
+            if (m->control_pressed) { break; }
+
+            //left pad the bin number with zeros so every label has the same width
+            string binNumber = toString(bin+1);
+            string otuLabel = "Otu";
+            if (binNumber.length() < totalBins.length()) {
+                otuLabel += string(totalBins.length() - binNumber.length(), '0');
+            }
+            otuLabel += binNumber;
+
+            out << otuLabel << endl;
+        }
+
+        out.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ListOtuLabelsCommand", "createList");
exit(1);
}
}
#include "command.hpp"
#include "sharedrabundvector.h"
+#include "listvector.hpp"
/**************************************************************************************************/
private:
bool abort, allLines;
- string outputDir, sharedfile, relabundfile, label, inputFileName, format;
+ string outputDir, sharedfile, relabundfile, label, inputFileName, format, listfile;
vector<string> outputNames;
vector<string> Groups;
set<string> labels;
int createList(vector<SharedRAbundFloatVector*>&);
int createList(vector<SharedRAbundVector*>&);
+ int createList(ListVector*&);
};
CommandParameter prfastq("rfastq", "InputTypes", "", "", "none", "none", "fastqGroup","fasta-qfile",false,false,true); parameters.push_back(prfastq);
CommandParameter pfasta("ffasta", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "fastaGroup","fasta",false,false,true); parameters.push_back(pfasta);
CommandParameter prfasta("rfasta", "InputTypes", "", "", "none", "none", "none","fastaGroup",false,false,true); parameters.push_back(prfasta);
- CommandParameter pfqual("fqfile", "InputTypes", "", "", "none", "none", "qfileGroup","qfile",false,false,true); parameters.push_back(pfqual);
- CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","qfile",false,false,true); parameters.push_back(prqual);
+ CommandParameter pfqual("fqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(pfqual);
+ CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(prqual);
CommandParameter pfile("file", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "none","fasta-qfile",false,false,true); parameters.push_back(pfile);
CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch);
CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen);
CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend);
- CommandParameter pthreshold("threshold", "Number", "", "40", "", "", "","",false,false); parameters.push_back(pthreshold);
+ CommandParameter pthreshold("insert", "Number", "", "25", "", "", "","",false,false); parameters.push_back(pthreshold);
+ CommandParameter pdeltaq("deltaq", "Number", "", "6", "", "", "","",false,false); parameters.push_back(pdeltaq);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "illumina1.8+", "", "", "","",false,false,true); parameters.push_back(pformat);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
string helpString = "";
helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. It will also provide new quality files if the fastq or file parameter is used.\n";
helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n";
- helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n";
+ helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n";
helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n";
- helpString += "The file parameter is 2 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column. Mothur will process each pair and create a combined fasta and qual file with all the sequences.\n";
+ helpString += "The file parameter is 2 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n";
helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n";
helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n";
helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n";
- helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n";
+ helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n";
+ helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n";
helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
- helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
- helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
+ //helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
+ //helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n";
+ helpString += "The deltaq parameter allows you to specify the delta allowed between quality scores of a mismatched base. For example in the overlap, if deltaq=5 and in the alignment seqA, pos 200 has a quality score of 30 and the same position in seqB has a quality score of 20, you take the base from seqA (30-20 >= 5). If the quality score in seqB is 28 then the base in the consensus will be an N (30-28<5) The default is 6.\n";
helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n";
- helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=40.\n";
+ helpString += "The insert parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=25.\n";
helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
helpString += "The make.contigs command should be in the following format: \n";
string pattern = "";
if (type == "fasta") { pattern = "[filename],[tag],contigs.fasta"; }
- else if (type == "qfile") { pattern = "[filename],[tag],contigs.qual"; }
else if (type == "group") { pattern = "[filename],[tag],contigs.groups"; }
- else if (type == "mismatch") { pattern = "[filename],[tag],contigs.mismatch"; }
+ else if (type == "report") { pattern = "[filename],[tag],contigs.report"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
setParameters();
vector<string> tempOutNames;
outputTypes["fasta"] = tempOutNames;
- outputTypes["qfile"] = tempOutNames;
outputTypes["group"] = tempOutNames;
- outputTypes["mismatch"] = tempOutNames;
+ outputTypes["report"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "MakeContigsCommand", "MakeContigsCommand");
//initialize outputTypes
vector<string> tempOutNames;
outputTypes["fasta"] = tempOutNames;
- outputTypes["qfile"] = tempOutNames;
- outputTypes["mismatch"] = tempOutNames;
+ outputTypes["report"] = tempOutNames;
outputTypes["group"] = tempOutNames;
m->mothurConvert(temp, gapExtend);
if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
- temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "40"; }
- m->mothurConvert(temp, threshold);
- if ((threshold < 0) || (threshold > 40)) { m->mothurOut("[ERROR]: threshold must be between 0 and 40.\n"); abort=true; }
+ temp = validParameter.validFile(parameters, "insert", false); if (temp == "not found"){ temp = "25"; }
+ m->mothurConvert(temp, insert);
+ if ((insert < 0) || (insert > 40)) { m->mothurOut("[ERROR]: insert must be between 0 and 40.\n"); abort=true; }
+ temp = validParameter.validFile(parameters, "deltaq", false); if (temp == "not found"){ temp = "6"; }
+ m->mothurConvert(temp, deltaq);
+
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
m->mothurConvert(temp, processors);
align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; }
if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; }
+
+ format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "illumina1.8+"; }
+
+ if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) {
+ m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine();
+ abort=true;
+ }
+
+ //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference.
+ for (int i = -64; i < 65; i++) {
+ char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499));
+ convertTable.push_back(temp);
+ }
}
}
string compositeGroupFile = getOutputFileName("group",cvars);
cvars["[tag]"] = "trim";
string compositeFastaFile = getOutputFileName("fasta",cvars);
- string compositeQualFile = getOutputFileName("qfile",cvars);
cvars["[tag]"] = "scrap";
string compositeScrapFastaFile = getOutputFileName("fasta",cvars);
- string compositeScrapQualFile = getOutputFileName("qfile",cvars);
cvars["[tag]"] = "";
- string compositeMisMatchFile = getOutputFileName("mismatch",cvars);
+ string compositeMisMatchFile = getOutputFileName("report",cvars);
if (filesToProcess.size() > 1) { //clear files for append below
ofstream outCTFasta, outCTQual, outCSFasta, outCSQual, outCMisMatch;
m->openOutputFile(compositeFastaFile, outCTFasta); outCTFasta.close();
m->openOutputFile(compositeScrapFastaFile, outCSFasta); outCSFasta.close();
m->openOutputFile(compositeMisMatchFile, outCMisMatch); outCMisMatch.close();
- m->openOutputFile(compositeQualFile, outCTQual); outCTQual.close();
- m->openOutputFile(compositeScrapQualFile, outCSQual); outCSQual.close();
outputNames.push_back(compositeFastaFile); outputTypes["fasta"].push_back(compositeFastaFile);
- outputNames.push_back(compositeQualFile); outputTypes["qfile"].push_back(compositeQualFile);
- outputNames.push_back(compositeMisMatchFile); outputTypes["mismatch"].push_back(compositeMisMatchFile);
+ outputNames.push_back(compositeMisMatchFile); outputTypes["report"].push_back(compositeMisMatchFile);
outputNames.push_back(compositeScrapFastaFile); outputTypes["fasta"].push_back(compositeScrapFastaFile);
- outputNames.push_back(compositeScrapQualFile); outputTypes["qfile"].push_back(compositeScrapQualFile);
}
for (int l = 0; l < filesToProcess.size(); l++) {
m->mothurOut("\n>>>>>\tProcessing " + filesToProcess[l][0][0] + " (file " + toString(l+1) + " of " + toString(filesToProcess.size()) + ")\t<<<<<\n");
vector<vector<string> > fastaFileNames;
- vector<vector<string> > qualFileNames;
createGroup = false;
string outputGroupFileName;
map<string, string> variables;
variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(filesToProcess[l][0][0]));
variables["[tag]"] = "";
if(oligosfile != ""){
- createGroup = getOligos(fastaFileNames, qualFileNames, variables["[filename]"]);
+ createGroup = getOligos(fastaFileNames, variables["[filename]"]);
if (createGroup) {
outputGroupFileName = getOutputFileName("group",variables);
outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
variables["[tag]"] = "trim";
string outFastaFile = getOutputFileName("fasta",variables);
- string outQualFile = getOutputFileName("qfile",variables);
variables["[tag]"] = "scrap";
string outScrapFastaFile = getOutputFileName("fasta",variables);
- string outScrapQualFile = getOutputFileName("qfile",variables);
variables["[tag]"] = "";
- string outMisMatchFile = getOutputFileName("mismatch",variables);
+ string outMisMatchFile = getOutputFileName("report",variables);
outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile);
- if (filesToProcess[l][0][1] != "") {
- outputNames.push_back(outQualFile); outputTypes["qfile"].push_back(outQualFile);
- outputNames.push_back(outScrapQualFile); outputTypes["qfile"].push_back(outScrapQualFile);
- }
- outputNames.push_back(outMisMatchFile); outputTypes["mismatch"].push_back(outMisMatchFile);
+ outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile);
m->mothurOut("Making contigs...\n");
- createProcesses(filesToProcess[l], outFastaFile, outQualFile, outScrapFastaFile, outScrapQualFile, outMisMatchFile, fastaFileNames, qualFileNames);
+ createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames);
m->mothurOut("Done.\n");
//remove temp fasta and qual files
if(m->isBlank(fastaFileNames[i][j])){
m->mothurRemove(fastaFileNames[i][j]);
namesToRemove.insert(fastaFileNames[i][j]);
-
- if (filesToProcess[l][0][1] != "") {
- m->mothurRemove(qualFileNames[i][j]);
- namesToRemove.insert(qualFileNames[i][j]);
- }
}else{
it = uniqueFastaNames.find(fastaFileNames[i][j]);
if (it == uniqueFastaNames.end()) {
}
m->appendFiles(outMisMatchFile, compositeMisMatchFile);
m->appendFiles(outFastaFile, compositeFastaFile);
- m->appendFiles(outQualFile, compositeQualFile);
m->appendFiles(outScrapFastaFile, compositeScrapFastaFile);
- m->appendFiles(outScrapQualFile, compositeScrapQualFile);
}
}
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numReads) + " sequences.\n");
if ((itTypes->second).size() != 0) { currentFasta = (itTypes->second)[0]; m->setFastaFile(currentFasta); }
}
- string currentQual = "";
- itTypes = outputTypes.find("qfile");
- if (itTypes != outputTypes.end()) {
- if ((itTypes->second).size() != 0) { currentQual = (itTypes->second)[0]; m->setQualFile(currentQual); }
- }
-
string currentGroup = "";
itTypes = outputTypes.find("group");
if (itTypes != outputTypes.end()) {
}
}
//**********************************************************************************************************************
-int MakeContigsCommand::createProcesses(vector< vector<string> > files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
+int MakeContigsCommand::createProcesses(vector< vector<string> > files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector<vector<string> > fastaFileNames) {
try {
int num = 0;
vector<int> processIDS;
process++;
}else if (pid == 0){
vector<vector<string> > tempFASTAFileNames = fastaFileNames;
- vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
if(allFiles){
ofstream temp;
if (tempFASTAFileNames[i][j] != "") {
tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
-
- if (files[processors-1][1] != "") {
- tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
- m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
- }
}
}
}
num = driver(files[process],
outputFasta + toString(getpid()) + ".temp",
- outputQual + toString(getpid()) + ".temp",
outputScrapFasta + toString(getpid()) + ".temp",
- outputScrapQual + toString(getpid()) + ".temp",
outputMisMatches + toString(getpid()) + ".temp",
- tempFASTAFileNames,
- tempPrimerQualFileNames);
+ tempFASTAFileNames, process);
//pass groupCounts to parent
ofstream out;
ofstream temp;
m->openOutputFile(outputFasta, temp); temp.close();
m->openOutputFile(outputScrapFasta, temp); temp.close();
- if (files[processors-1][1] != "") {
- m->openOutputFile(outputScrapQual, temp); temp.close();
- m->openOutputFile(outputQual, temp); temp.close();
- }
-
+
//do my part
- num = driver(files[processors-1], outputFasta, outputQual, outputScrapFasta, outputScrapQual, outputMisMatches, fastaFileNames, qualFileNames);
+ num = driver(files[processors-1], outputFasta, outputScrapFasta, outputMisMatches, fastaFileNames, processors-1);
//force parent to wait until all the processes are done
for (int i=0;i<processIDS.size();i++) {
string extension = "";
if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
vector<vector<string> > tempFASTAFileNames = fastaFileNames;
- vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
-
+
if(allFiles){
ofstream temp;
if (tempFASTAFileNames[i][j] != "") {
tempFASTAFileNames[i][j] += extension;
m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
-
- if (files[processors-1][1] != "") {
- tempPrimerQualFileNames[i][j] += extension;
- m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
- }
}
}
}
}
- contigsData* tempcontig = new contigsData(files[h], (outputFasta + extension), (outputQual + extension), (outputScrapFasta + extension), (outputScrapQual + extension),(outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, threshold, barcodes, primers, tempFASTAFileNames, tempPrimerQualFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createGroup, allFiles, h);
+ contigsData* tempcontig = new contigsData(files[h], (outputFasta + extension), (outputScrapFasta + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, insert, deltaq, barcodes, primers, tempFASTAFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createGroup, allFiles, h);
pDataArray.push_back(tempcontig);
hThreadArray[h] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
}
vector<vector<string> > tempFASTAFileNames = fastaFileNames;
- vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
if(allFiles){
ofstream temp;
if (tempFASTAFileNames[i][j] != "") {
tempFASTAFileNames[i][j] += extension;
m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
-
- if (files[processors-1][1] != "") {
- tempPrimerQualFileNames[i][j] += extension;
- m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
- }
}
}
}
ofstream temp;
m->openOutputFile(outputFasta, temp); temp.close();
m->openOutputFile(outputScrapFasta, temp); temp.close();
- if (files[processors-1][1] != "") {
- m->openOutputFile(outputScrapQual, temp); temp.close();
- m->openOutputFile(outputQual, temp); temp.close();
- }
//do my part
processIDS.push_back(processors-1);
- num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputQual+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputScrapQual+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames);
+ num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, processors-1);
//Wait until all threads have terminated.
WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ if (!pDataArray[i]->done) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
map<string, int>::iterator it2 = groupCounts.find(it->first);
if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
m->appendFiles((outputScrapFasta + toString(processIDS[i]) + ".temp"), outputScrapFasta);
m->mothurRemove((outputScrapFasta + toString(processIDS[i]) + ".temp"));
-
- if (files[processors-1][1] != "") {
- m->appendFiles((outputScrapQual + toString(processIDS[i]) + ".temp"), outputScrapQual);
- m->mothurRemove((outputScrapQual + toString(processIDS[i]) + ".temp"));
-
- m->appendFiles((outputQual + toString(processIDS[i]) + ".temp"), outputQual);
- m->mothurRemove((outputQual + toString(processIDS[i]) + ".temp"));
- }
m->appendFiles((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches);
m->mothurRemove((outputMisMatches + toString(processIDS[i]) + ".temp"));
if (fastaFileNames[j][k] != "") {
m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
-
- if (files[processors-1][1] != "") {
- m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
- m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
- }
}
}
}
}
}
//**********************************************************************************************************************
-int MakeContigsCommand::driver(vector<string> files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames){
+int MakeContigsCommand::driver(vector<string> files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector<vector<string> > fastaFileNames, int process){
try {
Alignment* alignment;
if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
ifstream inFFasta, inRFasta, inFQual, inRQual;
- ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual;
+ ofstream outFasta, outMisMatch, outScrapFasta;
m->openInputFile(thisffastafile, inFFasta);
m->openInputFile(thisrfastafile, inRFasta);
if (thisfqualfile != "") {
m->openInputFile(thisfqualfile, inFQual);
m->openInputFile(thisrqualfile, inRQual);
- m->openOutputFile(outputScrapQual, outScrapQual);
- m->openOutputFile(outputQual, outQual);
}
m->openOutputFile(outputFasta, outFasta);
m->openOutputFile(outputScrapFasta, outScrapFasta);
m->openOutputFile(outputMisMatches, outMisMatch);
- outMisMatch << "Name\tLength\tMisMatches\n";
+ if (process == 0) { outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; }
TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, primers, barcodes);
//traverse alignments merging into one contiguous seq
string contig = "";
- vector<int> contigScores;
int numMismatches = 0;
string seq1 = fSeq.getAligned();
string seq2 = rSeq.getAligned();
//bigger of the 2 starting positions is the location of the overlapping start
if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
overlapStart = seq2Start;
- for (int i = 0; i < overlapStart; i++) {
- contig += seq1[i];
- if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); }
- }
+ for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; }
}else { //seq1 starts later so take from 0 to overlapStart from seq2
- for (int i = 0; i < overlapStart; i++) {
- contig += seq2[i];
- if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); }
- }
+ for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; }
}
int seq1End = fSeq.getEndPos();
int overlapEnd = seq1End;
if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends
+ int oStart = contig.length();
for (int i = overlapStart; i < overlapEnd; i++) {
if (seq1[i] == seq2[i]) { //match, add base and choose highest score
contig += seq1[i];
- if (thisfqualfile != "") {
- contigScores.push_back(scores1[ABaseMap[i]]);
- if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
- }
- }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
+ }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base
if (thisfqualfile != "") {
- if (scores2[BBaseMap[i]] < threshold) { } //
- else {
- contig += seq2[i];
- contigScores.push_back(scores2[BBaseMap[i]]);
- }
+ if (scores2[BBaseMap[i]] < insert) { } //
+ else { contig += seq2[i]; }
}else { contig += seq2[i]; } //with no quality info, then we keep it?
- }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
+ }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base
if (thisfqualfile != "") {
- if (scores1[ABaseMap[i]] < threshold) { } //
- else {
- contig += seq1[i];
- contigScores.push_back(scores1[ABaseMap[i]]);
- }
+ if (scores1[ABaseMap[i]] < insert) { } //
+ else { contig += seq1[i]; }
}else { contig += seq1[i]; } //with no quality info, then we keep it?
}else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
if (thisfqualfile != "") {
- char c = seq1[i];
- contigScores.push_back(scores1[ABaseMap[i]]);
- if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
- contig += c;
+ if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score
+ char c = seq1[i];
+ if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; }
+ contig += c;
+ }else { //if no, base becomes n
+ contig += 'N';
+ }
numMismatches++;
}else { numMismatches++; } //cant decide, so eliminate and mark as mismatch
}else { //should never get here
m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
}
}
-
+ int oend = contig.length();
if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
- for (int i = overlapEnd; i < length; i++) {
- contig += seq2[i];
- if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); }
- }
+ for (int i = overlapEnd; i < length; i++) { contig += seq2[i]; }
}else { //seq2 ends before seq1 so take from overlap to length from seq1
- for (int i = overlapEnd; i < length; i++) {
- contig += seq1[i];
- if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); }
- }
-
+ for (int i = overlapEnd; i < length; i++) { contig += seq1[i]; }
}
if(trashCode.length() == 0){
m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
output << ">" << fSeq.getName() << endl << contig << endl;
output.close();
-
- if (thisfqualfile != "") {
- m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
- output << ">" << fSeq.getName() << endl;
- for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; }
- output << endl;
- output.close();
- }
}
//output
outFasta << ">" << fSeq.getName() << endl << contig << endl;
- if (thisfqualfile != "") {
- outQual << ">" << fSeq.getName() << endl;
- for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
- outQual << endl;
- }
- outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
+ int numNs = 0;
+ for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; } }
+ outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl;
}else {
//output
outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
- if (thisfqualfile != "") {
- outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl;
- for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; }
- outScrapQual << endl;
- }
}
num++;
if (thisfqualfile != "") {
inFQual.close();
inRQual.close();
- outQual.close();
- outScrapQual.close();
}
delete alignment;
- if (m->control_pressed) { m->mothurRemove(outputFasta); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches); if (thisfqualfile != "") { m->mothurRemove(outputQual); m->mothurRemove(outputScrapQual); } }
+ if (m->control_pressed) { m->mothurRemove(outputFasta); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches); }
return num;
}
if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); }
- if (checkReads(fread, rread, ffastq, rfastq)) {
+ //if (checkReads(fread, rread, ffastq, rfastq)) {
if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; }
//if the reads are okay write to output files
//report progress
if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
- }
+ //}
}
}
//report progress
if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); }
- if (checkReads(fread, rread, ffasta, rfasta)) {
+ // if (checkReads(fread, rread, ffasta, rfasta)) {
if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inReverseFasta.close(); inForwardFasta.close(); if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); } return files; }
//if the reads are okay write to output files
//report progress
if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
- }
+ //}
}
}
//report progress
if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } }
if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; }
- vector<int> qualScores;
- int controlChar = int('!');
- for (int i = 0; i < quality.length(); i++) {
- int temp = int(quality[i]);
- temp -= controlChar;
-
- qualScores.push_back(temp);
- }
-
+ vector<int> qualScores = convertQual(quality);
+
read.name = name;
read.sequence = sequence;
read.scores = qualScores;
exit(1);
}
}
-//**********************************************************************************************************************
+/**********************************************************************************************************************
bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, string ffile, string rfile){
try {
bool good = true;
m->errorOut(e, "MakeContigsCommand", "checkReads");
exit(1);
}
-}
+}*/
//***************************************************************************************************************
vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
try {
//BARCODE atgcatgc atgcatgc groupName
//PRIMER atgcatgc atgcatgc groupName
//PRIMER atgcatgc atgcatgc
-bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, string rootname){
+bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, string rootname){
try {
ifstream in;
m->openInputFile(oligosfile, in);
while(!in.eof()){
in >> type;
- cout << type << endl;
+
if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
if(type[0] == '#'){
barcodes[indexBarcode]=newPair; indexBarcode++;
barcodeNameVector.push_back(group);
- cout << group << endl;
}else if(type == "LINKER"){
linker.push_back(foligo);
m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n");
for(int i=0;i<fastaFileNames.size();i++){
fastaFileNames[i].assign(primerNameVector.size(), "");
}
- qualFileNames = fastaFileNames;
if(allFiles){
set<string> uniqueNames; //used to cleanup outputFileNames
fastaFileNames[itBar->first][itPrimer->first] = fastaFileName;
m->openOutputFile(fastaFileName, temp); temp.close();
-
- if ((fqualfile != "") || (ffastqfile != "") || (file != "")) {
- qualFileName = rootname + ".qual";
- if (uniqueNames.count(qualFileName) == 0) {
- outputNames.push_back(qualFileName);
- outputTypes["qfile"].push_back(qualFileName);
- }
-
- qualFileNames[itBar->first][itPrimer->first] = qualFileName;
- m->openOutputFile(qualFileName, temp); temp.close();
- }
}
}
}
}
}
//**********************************************************************************************************************
+// Decode a FASTQ quality string into one integer score per character.
+//
+// The member 'format' selects the decoding:
+//   "illumina" - character code minus 64 ('@')
+//   "solexa"   - character code is first mapped through convertTable (to the sanger
+//                encoding), then '!' is subtracted
+//   otherwise  - character code minus '!' (this covers "illumina1.8+" and the default)
+//
+// If any decoded score is below -5, an error is printed and m->control_pressed is set,
+// since that suggests the wrong format was selected. The scores are still returned.
+vector<int> MakeContigsCommand::convertQual(string qual) {
+    try {
+        const int illuminaOffset = 64;        //char '@'
+        const int sangerOffset   = int('!');  //char '!'
+
+        vector<int> scores;
+        bool sawBadScore = false;
+
+        for (string::size_type pos = 0; pos < qual.length(); pos++) {
+
+            int value = int(qual[pos]);
+
+            if (format == "illumina") {
+                value -= illuminaOffset;
+            }else {
+                //solexa characters are translated to their sanger equivalent before the offset is removed
+                if (format == "solexa") { value = int(convertTable[value]); }
+                value -= sangerOffset;
+            }
+
+            sawBadScore = sawBadScore || (value < -5);
+            scores.push_back(value);
+        }
+
+        if (sawBadScore) {
+            m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n");
+            m->control_pressed = true;
+        }
+
+        return scores;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeContigsCommand", "convertQual");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
private:
bool abort, allFiles, createGroup;
- string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file;
+ string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file, format;
float match, misMatch, gapOpen, gapExtend;
- int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
+ int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq;
vector<string> outputNames;
map<int, oligosPair> barcodes;
vector<string> linker;
vector<string> spacer;
vector<string> primerNameVector;
- vector<string> barcodeNameVector;
+ vector<string> barcodeNameVector;
+ vector<char> convertTable;
map<string, int> groupCounts;
map<string, string> groupMap;
+ vector<int> convertQual(string);
fastqRead readFastq(ifstream&, bool&);
vector< vector< vector<string> > > preProcessData(unsigned long int&);
vector< vector<string> > readFileNames(string);
vector< vector<string> > readFastqFiles(unsigned long int&, string, string);
vector< vector<string> > readFastaFiles(unsigned long int&, string, string);
- bool checkReads(fastqRead&, fastqRead&, string, string);
- int createProcesses(vector< vector<string> >, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
- int driver(vector<string>, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
- bool getOligos(vector<vector<string> >&, vector< vector<string> >&, string);
+ //bool checkReads(fastqRead&, fastqRead&, string, string);
+ int createProcesses(vector< vector<string> >, string, string, string, vector<vector<string> >);
+ int driver(vector<string>, string, string, string, vector<vector<string> >, int);
+ bool getOligos(vector<vector<string> >&, string);
string reverseOligo(string);
vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
};
// that can be passed using a single void pointer (LPVOID).
struct contigsData {
string outputFasta;
- string outputQual;
string outputScrapFasta;
- string outputScrapQual;
string outputMisMatches;
string align;
vector<string> files;
vector<vector<string> > fastaFileNames;
- vector<vector<string> > qualFileNames;
MothurOut* m;
float match, misMatch, gapOpen, gapExtend;
- int count, threshold, threadID, pdiffs, bdiffs, tdiffs;
- bool allFiles, createGroup;
+ int count, insert, threadID, pdiffs, bdiffs, tdiffs, deltaq;
+ bool allFiles, createGroup, done;
map<string, int> groupCounts;
map<string, string> groupMap;
vector<string> primerNameVector;
map<int, oligosPair> primers;
contigsData(){}
- contigsData(vector<string> f, string of, string oq, string osf, string osq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) {
+ contigsData(vector<string> f, string of, string osf, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int delt, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) {
files = f;
outputFasta = of;
- outputQual = oq;
outputMisMatches = om;
m = mout;
match = ma;
misMatch = misMa;
gapOpen = gapO;
gapExtend = gapE;
- threshold = thr;
+ insert = thr;
align = al;
count = 0;
outputScrapFasta = osf;
- outputScrapQual = osq;
fastaFileNames = ffn;
- qualFileNames = qfn;
barcodes = br;
primers = pr;
barcodeNameVector = bnv;
allFiles = all;
createGroup = cg;
threadID = tid;
+ deltaq = delt;
+ done=false;
}
};
if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase); }
else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase); }
- int num = 0;
+ pDataArray->count = 0;
string thisffastafile = pDataArray->files[0];
string thisfqualfile = pDataArray->files[1];
string thisrfastafile = pDataArray->files[2];
if (pDataArray->fastaFileNames[i][j] != "") {
ofstream temp;
pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
- if (thisfqualfile != "") { pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close(); }
}
}
}
}
ifstream inFFasta, inRFasta, inFQual, inRQual;
- ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual;
+ ofstream outFasta, outMisMatch, outScrapFasta;
pDataArray->m->openInputFile(thisffastafile, inFFasta);
pDataArray->m->openInputFile(thisrfastafile, inRFasta);
if (thisfqualfile != "") {
pDataArray->m->openInputFile(thisfqualfile, inFQual);
pDataArray->m->openInputFile(thisrqualfile, inRQual);
- pDataArray->m->openOutputFile(pDataArray->outputQual, outQual);
- pDataArray->m->openOutputFile(pDataArray->outputScrapQual, outScrapQual);
}
pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
- outMisMatch << "Name\tLength\tMisMatches\n";
+ if (pDataArray->threadID == 0) { outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; }
TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes);
//traverse alignments merging into one contiguous seq
string contig = "";
- vector<int> contigScores;
int numMismatches = 0;
string seq1 = fSeq.getAligned();
string seq2 = rSeq.getAligned();
//bigger of the 2 starting positions is the location of the overlapping start
if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
overlapStart = seq2Start;
- for (int i = 0; i < overlapStart; i++) {
- contig += seq1[i];
- if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); }
- }
+ for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; }
}else { //seq1 starts later so take from 0 to overlapStart from seq2
- for (int i = 0; i < overlapStart; i++) {
- contig += seq2[i];
- if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); }
- }
+ for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; }
}
int seq1End = fSeq.getEndPos();
int overlapEnd = seq1End;
if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends
+ int oStart = contig.length();
for (int i = overlapStart; i < overlapEnd; i++) {
if (seq1[i] == seq2[i]) { //match, add base and choose highest score
contig += seq1[i];
- if (thisfqualfile != "") {
- contigScores.push_back(scores1[ABaseMap[i]]);
- if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
- }
- }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
+ }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base
if (thisfqualfile != "") {
- if (scores2[BBaseMap[i]] < pDataArray->threshold) { } //
- else {
- contig += seq2[i];
- contigScores.push_back(scores2[BBaseMap[i]]);
- }
- }else { contig += seq2[i]; }
- }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
+ if (scores2[BBaseMap[i]] < pDataArray->insert) { } //
+ else { contig += seq2[i]; }
+ }else { contig += seq2[i]; } //with no quality info, then we keep it?
+ }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base
if (thisfqualfile != "") {
- if (scores1[ABaseMap[i]] < pDataArray->threshold) { } //
- else {
- contig += seq1[i];
- contigScores.push_back(scores1[ABaseMap[i]]);
- }
- }else { contig += seq1[i]; }
+ if (scores1[ABaseMap[i]] < pDataArray->insert) { } //
+ else { contig += seq1[i]; }
+ }else { contig += seq1[i]; } //with no quality info, then we keep it?
}else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
if (thisfqualfile != "") {
- char c = seq1[i];
- contigScores.push_back(scores1[ABaseMap[i]]);
- if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
- contig += c;
+ if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= pDataArray->deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score
+ char c = seq1[i];
+ if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; }
+ contig += c;
+ }else { //if no, base becomes n
+ contig += 'N';
+ }
numMismatches++;
- }else { numMismatches++; }
+ }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch
}else { //should never get here
pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
}
}
+ int oend = contig.length();
if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
- for (int i = overlapEnd; i < length; i++) {
- contig += seq2[i];
- if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); }
- }
+ for (int i = overlapEnd; i < length; i++) { contig += seq2[i]; }
}else { //seq2 ends before seq1 so take from overlap to length from seq1
- for (int i = overlapEnd; i < length; i++) {
- contig += seq1[i];
- if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); }
- }
-
+ for (int i = overlapEnd; i < length; i++) { contig += seq1[i]; }
}
if(trashCode.length() == 0){
pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
output << ">" << fSeq.getName() << endl << contig << endl;
output.close();
-
- if (thisfqualfile != "") {
- pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
- output << ">" << fSeq.getName() << endl;
- for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; }
- output << endl;
- output.close();
- }
}
//output
outFasta << ">" << fSeq.getName() << endl << contig << endl;
- if (thisfqualfile != "") {
- outQual << ">" << fSeq.getName() << endl;
- for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
- outQual << endl;
- }
- outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
+ int numNs = 0;
+ for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; } }
+ outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl;
}else {
//output
outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
- if (thisfqualfile != "") {
- outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl;
- for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; }
- outScrapQual << endl;
- }
}
- num++;
+ pDataArray->count++;
//report progress
- if((num) % 1000 == 0){ pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 1000 == 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
}
//report progress
- if((num) % 1000 != 0){ pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
inFFasta.close();
inRFasta.close();
if (thisfqualfile != "") {
inFQual.close();
inRQual.close();
- outQual.close();
- outScrapQual.close();
}
delete alignment;
- if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta); if (thisfqualfile != "") { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputScrapQual); } }
+ pDataArray->done = true;
+ if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta); }
return 0;
try {
CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fastq",false,true,true); parameters.push_back(pfasta);
CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","fastq",false,true,true); parameters.push_back(pqfile);
- CommandParameter pformat("format", "Multiple", "sanger-illumina", "sanger", "", "", "","",false,false); parameters.push_back(pformat);
+ CommandParameter pformat("format", "Multiple", "sanger-illumina-illumina1.8+", "sanger", "", "", "","",false,false); parameters.push_back(pformat);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
string helpString = "";
helpString += "The make.fastq command reads a fasta and quality file and creates a fastq file.\n";
helpString += "The make.fastq command parameters are fasta, qfile and format. fasta and qfile are required.\n";
- helpString += "The format parameter is used to indicate whether your sequences are sanger or illumina, default=sanger.\n";
+ helpString += "The format parameter is used to indicate whether your sequences are sanger, illumina1.8+ or illumina, default=sanger.\n";
helpString += "The make.fastq command should be in the following format: make.fastq(qfile=yourQualityFile, fasta=yourFasta).\n";
helpString += "Example make.fastq(fasta=amazon.fasta, qfile=amazon.qual).\n";
helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "sanger"; }
- if ((format != "sanger") && (format != "illumina") && (format != "solexa")) {
- m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine();
+ if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+")) {
+ m->mothurOut(format + " is not a valid format. Your format choices are sanger, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine();
abort=true;
}
CYGWIN_BUILD ?= no
USECOMPRESSION ?= no
MOTHUR_FILES="\"Enter_your_default_path_here\""
-RELEASE_DATE = "\"11/2/2012\""
-VERSION = "\"1.28.0\""
+RELEASE_DATE = "\"1/23/2013\""
+VERSION = "\"1.29.1\""
FORTAN_COMPILER = gfortran
FORTRAN_FLAGS =
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " groups assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) { delete pDataArray[i]->thisLookup[j]; }
for (int k = 0; k < calcDists.size(); k++) {
if (iters != 0) {
//we need to find the average distance and standard deviation for each groups distance
+ vector< vector<seqDist> > calcAverages = m->getAverages(calcDistsTotals, mode);
- vector< vector<seqDist> > calcAverages; calcAverages.resize(matrixCalculators.size());
- for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
- calcAverages[i].resize(calcDistsTotals[0][i].size());
-
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].seq1 = calcDistsTotals[0][i][j].seq1;
- calcAverages[i][j].seq2 = calcDistsTotals[0][i][j].seq2;
- calcAverages[i][j].dist = 0.0;
- }
- }
- if (mode == "average") {
- for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
- for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
- if (m->debug) { m->mothurOut("[DEBUG]: Totaling for average calc: iter = " + toString(thisIter) + ", " + thisLookup[calcDistsTotals[thisIter][i][j].seq1]->getGroup() + " - " + thisLookup[calcDistsTotals[thisIter][i][j].seq2]->getGroup() + " distance = " + toString(calcDistsTotals[thisIter][i][j].dist) + ". New total = " + toString(calcAverages[i][j].dist) + ".\n"); }
- }
- }
- }
-
- for (int i = 0; i < calcAverages.size(); i++) { //finds average.
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].dist /= (float) iters;
- }
- }
- }else { //find median
- for (int i = 0; i < calcAverages.size(); i++) { //for each calc
- for (int j = 0; j < calcAverages[i].size(); j++) { //for each comparison
- vector<double> dists;
- for (int thisIter = 0; thisIter < iters; thisIter++) { //for each subsample
- dists.push_back(calcDistsTotals[thisIter][i][j].dist);
- }
- sort(dists.begin(), dists.end());
- calcAverages[i][j].dist = dists[(iters/2)];
- }
- }
- }
//find standard deviation
- vector< vector<seqDist> > stdDev; stdDev.resize(matrixCalculators.size());
- for (int i = 0; i < stdDev.size(); i++) { //initialize sums to zero.
- stdDev[i].resize(calcDistsTotals[0][i].size());
-
- for (int j = 0; j < stdDev[i].size(); j++) {
- stdDev[i][j].seq1 = calcDistsTotals[0][i][j].seq1;
- stdDev[i][j].seq2 = calcDistsTotals[0][i][j].seq2;
- stdDev[i][j].dist = 0.0;
- }
- }
-
- for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
- for (int i = 0; i < stdDev.size(); i++) {
- for (int j = 0; j < stdDev[i].size(); j++) {
- stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist));
- }
- }
- }
-
- for (int i = 0; i < stdDev.size(); i++) { //finds average.
- for (int j = 0; j < stdDev[i].size(); j++) {
- stdDev[i][j].dist /= (float) iters;
- stdDev[i][j].dist = sqrt(stdDev[i][j].dist);
- }
- }
+ vector< vector<seqDist> > stdDev = m->getStandardDeviation(calcDistsTotals, calcAverages);
//print results
for (int i = 0; i < calcDists.size(); i++) {
unsigned long long start;
unsigned long long end;
MothurOut* m;
+ int count;
distSharedData(){}
distSharedData(MothurOut* mout, unsigned long long st, unsigned long long en, vector<string> est, vector<SharedRAbundVector*> lu) {
end = en;
Estimators = est;
thisLookup = lu;
+ count = 0;
}
};
/**************************************************************************************************/
vector<SharedRAbundVector*> subset;
for (int k = pDataArray->start; k < pDataArray->end; k++) { // pass cdd each set of groups to compare
-
+ pDataArray->count++;
for (int l = 0; l < k; l++) {
if (k != l) { //we dont need to similiarity of a groups to itself
try {
string pattern = "";
- if (type == "metastats") { pattern = "[filename],[distance],[groups],metastats"; }
+ if (type == "metastats") { pattern = "[filename],[distance],[group],metastats"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != (pDataArray[i]->num)) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->num) + " groups assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (int j = 0; j < pDataArray[i]->thisLookUp.size(); j++) { delete pDataArray[i]->thisLookUp[j]; }
for (int j = 0; j < pDataArray[i]->outputNames.size(); j++) {
outputNames.push_back(pDataArray[i]->outputNames[j]);
vector<string> designMapGroups;
vector<string> outputNames;
int start;
- int num, iters;
+ int num, iters, count;
float threshold;
MothurOut* m;
string sharedfile;
designMapGroups = dg;
iters = i;
threshold = thr;
+ count=0;
}
};
/**************************************************************************************************/
//for each combo
for (int c = pDataArray->start; c < (pDataArray->start+pDataArray->num); c++) {
-
+ pDataArray->count++;
//get set names
string setA = pDataArray->namesOfGroupCombos[c][0];
string setB = pDataArray->namesOfGroupCombos[c][1];
//double vm, rss;
//mem_usage(vm, rss);
- mothurOut("[ERROR]: ");
- mothurOut(toString(e.what()));
- mothurOut(" has occurred in the " + object + " class function " + function + ". Please contact Pat Schloss at mothur.bugs@gmail.com, and be sure to include the mothur.logFile with your inquiry.");
- mothurOutEndLine();
+ string errorType = toString(e.what());
+
+ int pos = errorType.find("bad_alloc");
+ mothurOut("[ERROR]: ");
+ mothurOut(errorType);
+
+ if (pos == string::npos) { //not bad_alloc
+ mothurOut(" has occurred in the " + object + " class function " + function + ". Please contact Pat Schloss at mothur.bugs@gmail.com, and be sure to include the mothur.logFile with your inquiry.");
+ mothurOutEndLine();
+ }else { //bad alloc
+ if (object == "cluster"){
+ mothurOut(" has occurred in the " + object + " class function " + function + ". This error indicates your computer is running out of memory. There are two common causes for this, file size and format.\n\nFile Size:\nThe cluster command loads your distance matrix into RAM, and your distance file is most likely too large to fit in RAM. There are two options to help with this. The first is to use a cutoff. By using a cutoff mothur will only load distances that are below the cutoff. If that is still not enough, there is a command called cluster.split, http://www.mothur.org/wiki/cluster.split which divides the distance matrix, and clusters the smaller pieces separately. You may also be able to reduce the size of the original distance matrix by using the commands outlined in the Schloss SOP, http://www.mothur.org/wiki/Schloss_SOP. \n\nWrong Format:\nThis error can be caused by trying to read a column formatted distance matrix using the phylip parameter. By default, the dist.seqs command generates a column formatted distance matrix. To make a phylip formatted matrix set the dist.seqs command parameter output to lt. \n\nIf you are uable to resolve the issue, please contact Pat Schloss at mothur.bugs@gmail.com, and be sure to include the mothur.logFile with your inquiry.");
+ }else {
+ mothurOut(" has occurred in the " + object + " class function " + function + ". This error indicates your computer is running out of memory. This is most commonly caused by trying to process a dataset too large, or a file format issue. If you are running our 32bit version, your memory usage is limited to 4G. If you have more than 4G of RAM and are running a 64bit OS, using our 64bit version may resolve your issue. Also, you may be able to reduce the size of your dataset by using the commands outlined in the Schloss SOP, http://www.mothur.org/wiki/Schloss_SOP. If you are uable to resolve the issue, please contact Pat Schloss at mothur.bugs@gmail.com, and be sure to include the mothur.logFile with your inquiry.");
+ }
+ }
}
/*********************************************************************************************/
//The following was originally from http://stackoverflow.com/questions/669438/how-to-get-memory-usage-at-run-time-in-c
}
}
/**************************************************************************************************/
+
+// Split a file into up to 'proc' pieces whose boundaries fall at the start of a line.
+//
+// filename - file to divide; it is expanded with getFullPathName() before use.
+// proc     - in: requested number of pieces (values below 1 are treated as 1).
+//            out: number of pieces actually described by the returned vector.
+//
+// Returns the byte offsets of the piece boundaries: the first entry is 0, the last is the
+// file size, and there are proc+1 entries in total. If the file cannot be opened or is
+// empty, the result is {0, 0} with proc set to 1.
+//
+// On non-unix builds the file is not divided: an error is printed and a single piece
+// covering the whole file is returned.
+vector<unsigned long long> MothurOut::divideFilePerLine(string filename, int& proc) {
+    try{
+        vector<unsigned long long> filePos;
+        filePos.push_back(0);
+
+        //stays 0 if the file cannot be opened or measured, so it is never read uninitialized
+        unsigned long long size = 0;
+
+        filename = getFullPathName(filename);
+
+        //get num bytes in file
+        FILE * pFile = fopen (filename.c_str(),"rb");
+        if (pFile==NULL) { perror ("Error opening file"); }
+        else{
+            if (fseek (pFile, 0, SEEK_END) == 0) {
+                long numBytes = ftell (pFile);
+                if (numBytes > 0) { size = numBytes; } //ftell returns -1 on failure
+            }
+            fclose (pFile);
+        }
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
+        //guard the division below
+        if (proc < 1) { proc = 1; }
+
+        //estimate file breaks
+        unsigned long long chunkSize = size / proc;
+
+        //file too small to divide by processors
+        if (chunkSize == 0) { proc = 1; filePos.push_back(size); return filePos; }
+
+        //for each process seekg to the estimated file break and search for the next line break. make the start of the following line the filebreak
+        for (int i = 0; i < proc; i++) {
+            unsigned long long spot = (i+1) * chunkSize;
+
+            ifstream in;
+            openInputFile(filename, in);
+            in.seekg(spot);
+
+            //look for next line break
+            unsigned long long newSpot = spot;
+            while (true) {
+                //read into an int and test the stream state, so a 0xFF data byte is not mistaken for end of file
+                int c = in.get();
+                if (!in.good()) { break; }
+
+                //gobble presumably skips trailing whitespace so newSpot lands on the next line's first character
+                if ((c == '\n') || (c == '\r') || (c == '\f')) { gobble(in); newSpot = in.tellg(); break; }
+            }
+
+            //there was not another line before the end of the file
+            long long sanityPos = in.tellg();
+
+            if (sanityPos < 0) { break; }
+            else { filePos.push_back(newSpot); }
+
+            in.close();
+        }
+
+        //save end pos
+        filePos.push_back(size);
+
+        //sanity check filePos - drop any boundary that does not move forward
+        vector<unsigned long long>::size_type index = 0;
+        while ((index + 1) < filePos.size()) {
+            if (filePos[(index+1)] <= filePos[index]) { filePos.erase(filePos.begin()+(index+1)); }
+            else { index++; }
+        }
+
+        proc = (filePos.size() - 1);
+#else
+        mothurOut("[ERROR]: Windows version should not be calling the divideFilePerLine function."); mothurOutEndLine();
+        proc=1;
+        filePos.push_back(size);
+#endif
+        return filePos;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "divideFilePerLine");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
int MothurOut::divideFile(string filename, int& proc, vector<string>& files) {
try{
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
//are there confidence scores, if so remove them
if (secondCol.find_first_of('(') != -1) { removeConfidences(secondCol); }
map<string, string>::iterator itTax = taxMap.find(firstCol);
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
//are there confidence scores, if so remove them
if (secondCol.find_first_of('(') != -1) { removeConfidences(secondCol); }
map<string, string>::iterator itTax = taxMap.find(firstCol);
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
+
//parse names into vector
vector<string> theseNames;
splitAtComma(secondCol, theseNames);
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
+
//parse names into vector
vector<string> theseNames;
splitAtComma(secondCol, theseNames);
- for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = firstCol; }
+ for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = firstCol; }
pairDone = false;
}
}
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
nameMap[secondCol] = firstCol;
pairDone = false;
}
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
nameMap[secondCol] = firstCol;
pairDone = false;
}
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
//parse names into vector
vector<string> theseNames;
splitAtComma(secondCol, theseNames);
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
//parse names into vector
vector<string> theseNames;
splitAtComma(secondCol, theseNames);
if (columnOne) { firstCol = pieces[i]; columnOne=false; }
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
- if (pairDone) { nameMap[firstCol] = secondCol; pairDone = false; }
+ if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
+ nameMap[firstCol] = secondCol; pairDone = false; }
}
}
in.close();
if (columnOne) { firstCol = pieces[i]; columnOne=false; }
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
- if (pairDone) { nameMap[firstCol] = secondCol; pairDone = false; }
+ if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
+ nameMap[firstCol] = secondCol; pairDone = false; }
}
}
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
vector<string> temp;
splitAtComma(secondCol, temp);
nameMap[firstCol] = temp;
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
vector<string> temp;
splitAtComma(secondCol, temp);
nameMap[firstCol] = temp;
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
+ int num = getNumNames(secondCol);
+ nameMap[firstCol] = num;
+ pairDone = false;
+ }
+ }
+ }
+ in.close();
+
+ if (rest != "") {
+ vector<string> pieces = splitWhiteSpace(rest);
+ for (int i = 0; i < pieces.size(); i++) {
+ if (columnOne) { firstCol = pieces[i]; columnOne=false; }
+ else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+
+ if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
+ int num = getNumNames(secondCol);
+ nameMap[firstCol] = num;
+ pairDone = false;
+ }
+ }
+ }
+
+ return nameMap;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "readNames");
+ exit(1);
+ }
+}
+/**********************************************************************************************************************/
+map<string, int> MothurOut::readNames(string namefile, unsigned long int& numSeqs) {
+ try {
+ map<string, int> nameMap;
+ numSeqs = 0;
+
+ //open input file
+ ifstream in;
+ openInputFile(namefile, in);
+
+ string rest = "";
+ char buffer[4096];
+ bool pairDone = false;
+ bool columnOne = true;
+ string firstCol, secondCol;
+
+ while (!in.eof()) {
+ if (control_pressed) { break; }
+
+ in.read(buffer, 4096);
+ vector<string> pieces = splitWhiteSpace(rest, buffer, in.gcount());
+
+ for (int i = 0; i < pieces.size(); i++) {
+ if (columnOne) { firstCol = pieces[i]; columnOne=false; }
+ else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+
+ if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
int num = getNumNames(secondCol);
nameMap[firstCol] = num;
pairDone = false;
+ numSeqs += num;
}
}
}
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
int num = getNumNames(secondCol);
nameMap[firstCol] = num;
pairDone = false;
+ numSeqs += num;
}
}
}
exit(1);
}
}
+/************************************************************/
+//Sanitizes a sequence name in place: every ':' in name is overwritten with '_'.
+//changedSeqNames is set to true as soon as at least one character is replaced.
+//Always returns 0.
+int MothurOut::checkName(string& name) {
+    try {
+        string::size_type colonPos = name.find(':');
+        while (colonPos != string::npos) {
+            name[colonPos] = '_';
+            changedSeqNames = true;
+            colonPos = name.find(':', colonPos + 1); //continue the scan just past the character we replaced
+        }
+        return 0;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "checkName");
+        exit(1);
+    }
+}
/**********************************************************************************************************************/
int MothurOut::readNames(string namefile, vector<seqPriorityNode>& nameVector, map<string, string>& fastamap) {
try {
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
int num = getNumNames(secondCol);
map<string, string>::iterator it = fastamap.find(firstCol);
else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
if (pairDone) {
+ checkName(firstCol);
+ checkName(secondCol);
int num = getNumNames(secondCol);
map<string, string>::iterator it = fastamap.find(firstCol);
in.read(buffer, 4096);
vector<string> pieces = splitWhiteSpace(rest, buffer, in.gcount());
- for (int i = 0; i < pieces.size(); i++) { names.insert(pieces[i]); }
+ for (int i = 0; i < pieces.size(); i++) { checkName(pieces[i]); names.insert(pieces[i]); }
}
in.close();
if (rest != "") {
vector<string> pieces = splitWhiteSpace(rest);
- for (int i = 0; i < pieces.size(); i++) { names.insert(pieces[i]); }
+ for (int i = 0; i < pieces.size(); i++) { checkName(pieces[i]); names.insert(pieces[i]); }
}
return names;
}
in.read(buffer, 4096);
vector<string> pieces = splitWhiteSpace(rest, buffer, in.gcount());
- for (int i = 0; i < pieces.size(); i++) { names.push_back(pieces[i]); }
+ for (int i = 0; i < pieces.size(); i++) { checkName(pieces[i]); names.push_back(pieces[i]); }
}
in.close();
if (rest != "") {
vector<string> pieces = splitWhiteSpace(rest);
- for (int i = 0; i < pieces.size(); i++) { names.push_back(pieces[i]); }
+ for (int i = 0; i < pieces.size(); i++) { checkName(pieces[i]); names.push_back(pieces[i]); }
}
return 0;
string individual = "";
int estimLength = estim.size();
bool prevEscape = false;
- for(int i=0;i<estimLength;i++){
+ /*for(int i=0;i<estimLength;i++){
if(prevEscape){
individual += estim[i];
prevEscape = false;
prevEscape = false;
}
}
- }
+ }*/
+
+
+ for(int i=0;i<estimLength;i++){
+ if(estim[i] == '-'){
+ if (prevEscape) { individual += estim[i]; prevEscape = false; } //add in dash because it was escaped.
+ else {
+ container.push_back(individual);
+ individual = "";
+ }
+ }else if(estim[i] == '\\'){
+ if (i < estimLength-1) {
+ if (estim[i+1] == '-') { prevEscape=true; } //are you a backslash before a dash, if yes ignore
+ else { individual += estim[i]; prevEscape = false; } //if no, add in
+ }else { individual += estim[i]; }
+ }else {
+ individual += estim[i];
+ }
+ }
+
+
+
container.push_back(individual);
}
catch(exception& e) {
exit(1);
}
}
+
/***********************************************************************/
string MothurOut::makeList(vector<string>& names) {
try {
string space = " ";
while(suffix.at(0) == ' ')
suffix = suffix.substr(1, suffix.length());
- }
+ }else { suffix = ""; }
- }
+ }
catch(exception& e) {
- errorOut(e, "MothurOut", "splitAtComma");
+ errorOut(e, "MothurOut", "splitAtChar");
exit(1);
}
}
string space = " ";
while(suffix.at(0) == ' ')
suffix = suffix.substr(1, suffix.length());
- }
+ }else { suffix = ""; }
}
catch(exception& e) {
exit(1);
}
}
+/**************************************************************************************************/
+//Column-wise arithmetic mean of a table of values.
+//  dists   - one row per iteration; dists[r][c] is added to the running sum of column c.
+//            The first row fixes the number of columns. A shorter row contributes nothing to
+//            the columns it lacks. Rows are assumed to be no longer than the first row,
+//            a longer row would index past the end of the result - TODO confirm with callers.
+//  returns - one value per column of the first row: the column sum divided by the number of rows.
+//            An empty vector is returned when dists has no rows.
+vector<double> MothurOut::getAverages(vector< vector<double> >& dists) {
+    try{
+        vector<double> averages;
+
+        //nothing to average, and dists[0] below would be undefined behaviour
+        if (dists.empty()) { return averages; }
+
+        averages.resize(dists[0].size(), 0.0);
+
+        //sum each column over all rows
+        for (int thisIter = 0; thisIter < dists.size(); thisIter++) {
+            for (int i = 0; i < dists[thisIter].size(); i++) {
+                averages[i] += dists[thisIter][i];
+            }
+        }
+
+        //finds average.
+        for (int i = 0; i < averages.size(); i++) { averages[i] /= (double) dists.size(); }
+
+        return averages;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "getAverages");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Column-wise standard deviation of a table of values, computing the column means itself.
+//  dists   - one row per iteration; see getAverages(vector< vector<double> >&) for the layout.
+//  returns - one standard deviation per column, as produced by the two-argument overload
+//            (squared deviations from the mean, divided by the number of rows, then sqrt).
+//            An empty vector is returned when dists has no rows.
+vector<double> MothurOut::getStandardDeviation(vector< vector<double> >& dists) {
+    try{
+        //guard: both helpers below index dists[0]
+        if (dists.empty()) { return vector<double>(); }
+
+        vector<double> averages = getAverages(dists);
+
+        //same computation as before, shared with the overload that takes precomputed averages
+        return getStandardDeviation(dists, averages);
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "getStandardDeviation");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Column-wise standard deviation of a table of values, given precomputed column means.
+//  dists    - one row per iteration; dists[r][c] belongs to column c. The first row fixes the number of columns.
+//  averages - mean of each column; must have at least as many entries as the longest row of dists
+//             (it is indexed by column without a bounds check).
+//  returns  - for each column, sqrt( sum((value - mean)^2) / number of rows ), i.e. the divisor is
+//             the row count, not row count - 1. An empty vector is returned when dists has no rows.
+vector<double> MothurOut::getStandardDeviation(vector< vector<double> >& dists, vector<double>& averages) {
+    try{
+        vector<double> stdDev;
+
+        //nothing to measure, and dists[0] below would be undefined behaviour
+        if (dists.empty()) { return stdDev; }
+
+        stdDev.resize(dists[0].size(), 0.0);
+
+        //accumulate the squared difference of each dist from its column mean
+        for (int thisIter = 0; thisIter < dists.size(); thisIter++) {
+            for (int j = 0; j < dists[thisIter].size(); j++) {
+                double diff = dists[thisIter][j] - averages[j];
+                stdDev[j] += (diff * diff);
+            }
+        }
+
+        //mean of the squared differences, then square root
+        for (int i = 0; i < stdDev.size(); i++) {
+            stdDev[i] /= (double) dists.size();
+            stdDev[i] = sqrt(stdDev[i]);
+        }
+
+        return stdDev;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "getStandardDeviation");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Combines the distances of several iterations into one value per comparison.
+//  calcDistsTotals - indexed [iteration][calculator][comparison]. The first iteration fixes the
+//                    shape of the result and supplies the seq1/seq2 labels; every other iteration
+//                    is indexed with the same [calculator][comparison] positions without a bounds check.
+//  mode            - "average" selects the arithmetic mean over iterations; any other value selects
+//                    the element at position (number of iterations / 2) of the sorted distances
+//                    (the upper middle element when the number of iterations is even).
+//  returns         - [calculator][comparison] table holding the combined dist for each comparison.
+//                    An empty table is returned when calcDistsTotals has no iterations.
+vector< vector<seqDist> > MothurOut::getAverages(vector< vector< vector<seqDist> > >& calcDistsTotals, string mode) {
+    try{
+
+        vector< vector<seqDist> > calcAverages;
+
+        //nothing to combine, and calcDistsTotals[0] below would be undefined behaviour
+        if (calcDistsTotals.empty()) { return calcAverages; }
+
+        //copy the labels of the first iteration and initialize sums to zero.
+        for (int i = 0; i < calcDistsTotals[0].size(); i++) {
+            vector<seqDist> temp;
+            for (int j = 0; j < calcDistsTotals[0][i].size(); j++) {
+                seqDist tempDist;
+                tempDist.seq1 = calcDistsTotals[0][i][j].seq1;
+                tempDist.seq2 = calcDistsTotals[0][i][j].seq2;
+                tempDist.dist = 0.0;
+                temp.push_back(tempDist);
+            }
+            calcAverages.push_back(temp);
+        }
+
+        if (mode == "average") {
+            for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //sum all groups dists for each calculator
+                for (int i = 0; i < calcAverages.size(); i++) { //for each calc
+                    for (int j = 0; j < calcAverages[i].size(); j++) { //for each comparison
+                        calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
+                    }
+                }
+            }
+
+            for (int i = 0; i < calcAverages.size(); i++) { //finds average.
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    calcAverages[i][j].dist /= (float) calcDistsTotals.size();
+                }
+            }
+        }else { //find median
+            for (int i = 0; i < calcAverages.size(); i++) { //for each calc
+                for (int j = 0; j < calcAverages[i].size(); j++) { //for each comparison
+                    vector<double> dists;
+                    for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //for each subsample
+                        dists.push_back(calcDistsTotals[thisIter][i][j].dist);
+                    }
+                    sort(dists.begin(), dists.end());
+                    calcAverages[i][j].dist = dists[(calcDistsTotals.size()/2)];
+                }
+            }
+        }
+
+        return calcAverages;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "getAverages");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Arithmetic mean, over iterations, of the distance for every calculator/comparison pair.
+//  calcDistsTotals - indexed [iteration][calculator][comparison]. The first iteration fixes the
+//                    shape of the result and supplies the seq1/seq2 labels; every other iteration
+//                    is indexed with the same [calculator][comparison] positions without a bounds check.
+//  returns         - [calculator][comparison] table whose dist is the sum over iterations divided by
+//                    the number of iterations. An empty table is returned when calcDistsTotals has no iterations.
+vector< vector<seqDist> > MothurOut::getAverages(vector< vector< vector<seqDist> > >& calcDistsTotals) {
+    try{
+
+        vector< vector<seqDist> > calcAverages;
+
+        //nothing to average, and calcDistsTotals[0] below would be undefined behaviour
+        if (calcDistsTotals.empty()) { return calcAverages; }
+
+        //copy the labels of the first iteration and initialize sums to zero.
+        for (int i = 0; i < calcDistsTotals[0].size(); i++) {
+            vector<seqDist> temp;
+            for (int j = 0; j < calcDistsTotals[0][i].size(); j++) {
+                seqDist tempDist;
+                tempDist.seq1 = calcDistsTotals[0][i][j].seq1;
+                tempDist.seq2 = calcDistsTotals[0][i][j].seq2;
+                tempDist.dist = 0.0;
+                temp.push_back(tempDist);
+            }
+            calcAverages.push_back(temp);
+        }
+
+        for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //sum all groups dists for each calculator
+            for (int i = 0; i < calcAverages.size(); i++) { //for each calc
+                for (int j = 0; j < calcAverages[i].size(); j++) { //for each comparison
+                    calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
+                }
+            }
+        }
+
+        for (int i = 0; i < calcAverages.size(); i++) { //finds average.
+            for (int j = 0; j < calcAverages[i].size(); j++) {
+                calcAverages[i][j].dist /= (float) calcDistsTotals.size();
+            }
+        }
+
+        return calcAverages;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "getAverages");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Standard deviation, over iterations, of the distance for every calculator/comparison pair,
+//computing the per-comparison means itself.
+//  calcDistsTotals - indexed [iteration][calculator][comparison]; see getAverages for the layout.
+//  returns         - [calculator][comparison] table as produced by the two-argument overload
+//                    (squared deviations from the mean, divided by the number of iterations, then sqrt).
+//                    An empty table is returned when calcDistsTotals has no iterations.
+vector< vector<seqDist> > MothurOut::getStandardDeviation(vector< vector< vector<seqDist> > >& calcDistsTotals) {
+    try{
+        //guard: both helpers below index calcDistsTotals[0]
+        if (calcDistsTotals.empty()) { return vector< vector<seqDist> >(); }
+
+        vector< vector<seqDist> > calcAverages = getAverages(calcDistsTotals);
+
+        //same computation as before, shared with the overload that takes precomputed averages
+        return getStandardDeviation(calcDistsTotals, calcAverages);
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "getStandardDeviation");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Standard deviation, over iterations, of the distance for every calculator/comparison pair,
+//given precomputed per-comparison means.
+//  calcDistsTotals - indexed [iteration][calculator][comparison]. The first iteration fixes the
+//                    shape of the result and supplies the seq1/seq2 labels.
+//  calcAverages    - mean dist per [calculator][comparison]; indexed with the same positions as the
+//                    first iteration without a bounds check, so it must be at least that large.
+//  returns         - [calculator][comparison] table whose dist is
+//                    sqrt( sum((dist - mean)^2) / number of iterations ), i.e. the divisor is the
+//                    iteration count, not iteration count - 1.
+//                    An empty table is returned when calcDistsTotals has no iterations.
+vector< vector<seqDist> > MothurOut::getStandardDeviation(vector< vector< vector<seqDist> > >& calcDistsTotals, vector< vector<seqDist> >& calcAverages) {
+    try{
+        vector< vector<seqDist> > stdDev;
+
+        //nothing to measure, and calcDistsTotals[0] below would be undefined behaviour
+        if (calcDistsTotals.empty()) { return stdDev; }
+
+        //copy the labels of the first iteration and initialize sums to zero.
+        for (int i = 0; i < calcDistsTotals[0].size(); i++) {
+            vector<seqDist> temp;
+            for (int j = 0; j < calcDistsTotals[0][i].size(); j++) {
+                seqDist tempDist;
+                tempDist.seq1 = calcDistsTotals[0][i][j].seq1;
+                tempDist.seq2 = calcDistsTotals[0][i][j].seq2;
+                tempDist.dist = 0.0;
+                temp.push_back(tempDist);
+            }
+            stdDev.push_back(temp);
+        }
+
+        for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+            for (int i = 0; i < stdDev.size(); i++) {
+                for (int j = 0; j < stdDev[i].size(); j++) {
+                    stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist));
+                }
+            }
+        }
+
+        for (int i = 0; i < stdDev.size(); i++) { //mean of the squared differences, then square root
+            for (int j = 0; j < stdDev[i].size(); j++) {
+                stdDev[i][j].dist /= (float) calcDistsTotals.size();
+                stdDev[i][j].dist = sqrt(stdDev[i][j].dist);
+            }
+        }
+
+        return stdDev;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "getStandardDeviation");
+        exit(1);
+    }
+}
+
/**************************************************************************************************/
bool MothurOut::isContainingOnlyDigits(string input) {
try{
vector<string> binLabelsInFile;
vector<string> currentBinLabels;
string saveNextLabel, argv, sharedHeaderMode, groupMode;
- bool printedHeaders, commandInputsConvertError;
+ bool printedHeaders, commandInputsConvertError, changedSeqNames;
//functions from mothur.h
//file operations
bool dirCheck(string&); //completes path, appends appropriate / or \, makes sure dir is writable.
- vector<unsigned long long> divideFile(string, int&);
+ vector<unsigned long long> divideFile(string, int&); //divides splitting unevenness by sequence
+ vector<unsigned long long> divideFilePerLine(string, int&); //divides splitting unevenness at line breaks
int divideFile(string, int&, vector<string>&);
vector<unsigned long long> setFilePosEachLine(string, int&);
vector<unsigned long long> setFilePosFasta(string, int&);
set<string> readAccnos(string);
int readAccnos(string, vector<string>&);
map<string, int> readNames(string);
+ map<string, int> readNames(string, unsigned long int&);
int readTax(string, map<string, string>&);
int readNames(string, map<string, string>&, map<string, int>&);
int readNames(string, map<string, string>&);
string removeQuotes(string);
string makeList(vector<string>&);
bool isSubset(vector<string>, vector<string>); //bigSet, subset
+ int checkName(string&);
//math operation
int factorial(int num);
unsigned int fromBase36(string);
int getRandomIndex(int); //highest
double getStandardDeviation(vector<int>&);
+ vector<double> getStandardDeviation(vector< vector<double> >&);
+ vector<double> getStandardDeviation(vector< vector<double> >&, vector<double>&);
+ vector<double> getAverages(vector< vector<double> >&);
+ vector< vector<seqDist> > getStandardDeviation(vector< vector< vector<seqDist> > >&);
+ vector< vector<seqDist> > getStandardDeviation(vector< vector< vector<seqDist> > >&, vector< vector<seqDist> >&);
+ vector< vector<seqDist> > getAverages(vector< vector< vector<seqDist> > >&, string);
+ vector< vector<seqDist> > getAverages(vector< vector< vector<seqDist> > >&);
int control_pressed;
bool executing, runParse, jumble, gui, mothurCalling, debug;
debug = false;
sharedHeaderMode = "";
groupMode = "group";
+ changedSeqNames = false;
}
~MothurOut();
///variables for examples below that you will most likely want to put in the header for
//use by the other class functions.
- string phylipfile, columnfile, namefile, fastafile, sharedfile, method;
+ string phylipfile, columnfile, namefile, fastafile, sharedfile, method, countfile;
int processors;
bool useTiming, allLines;
vector<string> Estimators, Groups;
//saved by mothur that is associated with the other files you are using as inputs.
//You can do so by adding the files associated with the namefile to the files vector and then asking parser to check.
//This saves our users headaches over file mismatches because they forgot to include the namefile, :)
- if (namefile == "") {
- vector<string> files; files.push_back(fastafile);
- parser.getNameFile(files);
- }
+ if (countfile == "") {
+ if (namefile == "") {
+ vector<string> files; files.push_back(fastafile);
+ parser.getNameFile(files);
+ }
+ }
+
}
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
outFile.setf(ios::fixed, ios::showpoint);
outFile << setprecision(4);
- pDataArray->count = pDataArray->end;
+ pDataArray->count = 0;
int startTime = time(NULL);
if(pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
for(int i=pDataArray->start;i<pDataArray->end;i++){
+ pDataArray->count++;
string name = pDataArray->alignDB.get(i).getName();
//pad with spaces to make compatible
}
}
- pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
+ pDataArray->m->mothurOut(toString(pDataArray->count) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
outFile.close();
delete alignment;
CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pfastq);
CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "","fasta",false,false); parameters.push_back(pfasta);
CommandParameter pqual("qfile", "Boolean", "", "T", "", "", "","qfile",false,false); parameters.push_back(pqual);
- CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat);
+ CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
helpString += "The fastq.info command reads a fastq file and creates a fasta and quality file.\n";
helpString += "The fastq.info command parameters are fastq, fasta, qfile and format; fastq is required.\n";
helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n";
- helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa or illumina, default=sanger.\n";
+ helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=sanger.\n";
helpString += "The fasta parameter allows you to indicate whether you want a fasta file generated. Default=T.\n";
helpString += "The qfile parameter allows you to indicate whether you want a quality file generated. Default=T.\n";
helpString += "Example fastq.info(fastaq=test.fastaq).\n";
format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "sanger"; }
- if ((format != "sanger") && (format != "illumina") && (format != "solexa")) {
- m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine();
+ if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) {
+ m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine();
abort=true;
}
try {
vector<int> qualScores;
+ bool negativeScores = false;
+
for (int i = 0; i < qual.length(); i++) {
int temp = 0;
temp = int(qual[i]);
if (format == "illumina") {
temp -= 64; //char '@'
+ }else if (format == "illumina1.8+") {
+ temp -= int('!'); //char '!'
}else if (format == "solexa") {
temp = int(convertTable[temp]); //convert to sanger
temp -= int('!'); //char '!'
}else {
temp -= int('!'); //char '!'
}
+ if (temp < -5) { negativeScores = true; }
qualScores.push_back(temp);
}
+ if (negativeScores) { m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n"); m->control_pressed = true; }
+
return qualScores;
}
catch(exception& e) {
~ParseFastaQCommand() {}
vector<string> setParameters();
- string getCommandName() { return "parse.fastq"; }
+ string getCommandName() { return "fastq.info"; }
string getCommandCategory() { return "Sequence Processing"; }
string getHelpString();
}
set<int> lengths;
- pDataArray->count = pDataArray->fend;
+
for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
-
+ pDataArray->count++;
if (pDataArray->m->control_pressed) { break; }
Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->fend) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pdiffs);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter ptopdown("topdown", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(ptopdown);
+
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
string helpString = "";
helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
helpString += "The pre.cluster command outputs a new fasta and name file.\n";
- helpString += "The pre.cluster command parameters are fasta, name, group, count, processors and diffs. The fasta parameter is required. \n";
+ helpString += "The pre.cluster command parameters are fasta, name, group, count, topdown, processors and diffs. The fasta parameter is required. \n";
helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n";
helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
+ helpString += "The topdown parameter allows you to specify whether to cluster from largest abundance to smallest or smallest to largest. Default=T, meanging largest to smallest.\n";
helpString += "The pre.cluster command should be in the following format: \n";
helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
m->setProcessors(temp);
m->mothurConvert(temp, processors);
+ temp = validParameter.validFile(parameters, "topdown", false); if(temp == "not found"){ temp = "T"; }
+ topdown = m->isTrue(temp);
+
if (countfile == "") {
if (namefile == "") {
vector<string> files; files.push_back(fastafile);
// Allocate memory for thread data.
string extension = toString(i) + ".temp";
- preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, i);
+ preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, topdown, i);
pDataArray.push_back(tempPreCluster);
processIDS.push_back(i);
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " groups assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (int j = 0; j < pDataArray[i]->mapFileNames.size(); j++) {
outputNames.push_back(pDataArray[i]->mapFileNames[j]); outputTypes["map"].push_back(pDataArray[i]->mapFileNames[j]);
}
m->openOutputFile(newMapFile, out);
//sort seqs by number of identical seqs
- sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
+ if (topdown) { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityTopDown); }
+ else { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityDownTop); }
int count = 0;
int numSeqs = alignSeqs.size();
~seqPNode() {}
};
/************************************************************/
-inline bool comparePriority(seqPNode first, seqPNode second) {
+inline bool comparePriorityTopDown(seqPNode first, seqPNode second) {
if (first.numIdentical > second.numIdentical) { return true; }
else if (first.numIdentical == second.numIdentical) {
if (first.seq.getName() > second.seq.getName()) { return true; }
}
return false;
}
+/************************************************************/
+//Sort predicate for seqPNode: orders by numIdentical ascending (smallest first).
+//Nodes with equal numIdentical are ordered by sequence name, greater name first.
+inline bool comparePriorityDownTop(seqPNode first, seqPNode second) {
+    if (first.numIdentical != second.numIdentical) {
+        return (first.numIdentical < second.numIdentical);
+    }
+    //same abundance, fall back to the name comparison
+    return (first.seq.getName() > second.seq.getName());
+}
//************************************************************/
class PreClusterCommand : public Command {
CountTable ct;
int diffs, length, processors;
- bool abort, bygroup;
+ bool abort, bygroup, topdown;
string fastafile, namefile, outputDir, groupfile, countfile;
vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
map<string, string> names; //represents the names file first column maps to second column
string newFName, newNName, newMName;
MothurOut* m;
int start;
- int end;
+ int end, count;
int diffs, threadID;
vector<string> groups;
vector<string> mapFileNames;
+ bool topdown;
preClusterData(){}
- preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
+ preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, bool td, int tid) {
fastafile = f;
namefile = n;
groupfile = g;
threadID = tid;
groups = gr;
countfile = c;
+ topdown = td;
+ count=0;
}
};
//precluster each group
for (int k = pDataArray->start; k < pDataArray->end; k++) {
+ pDataArray->count++;
+
int start = time(NULL);
if (pDataArray->m->control_pressed) { delete parser; return 0; }
pDataArray->m->openOutputFile(pDataArray->newMName+pDataArray->groups[k]+".map", out);
pDataArray->mapFileNames.push_back(pDataArray->newMName+pDataArray->groups[k]+".map");
- //sort seqs by number of identical seqs
- sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
-
+ //sort seqs by number of identical seqs
+ if (pDataArray->topdown) { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityTopDown); }
+ else { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityDownTop); }
+
int count = 0;
//think about running through twice...
--- /dev/null
+//
+// primerdesigncommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 1/18/13.
+// Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#include "primerdesigncommand.h"
+
+//**********************************************************************************************************************
+//registers every parameter primer.design accepts and returns the parameter names in registration order
+vector<string> PrimerDesignCommand::setParameters(){
+    try {
+        CommandParameter plabel("label", "String", "", "", "", "", "","",false,false);
+        parameters.push_back(plabel);
+
+        CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","summary-list",false,true,true);
+        parameters.push_back(plist);
+
+        CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","",false,true, true);
+        parameters.push_back(pfasta);
+
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true);
+        parameters.push_back(pname);
+
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true);
+        parameters.push_back(pcount);
+
+        CommandParameter plength("length", "Number", "", "18", "", "", "","",false,false);
+        parameters.push_back(plength);
+
+        CommandParameter pmintm("mintm", "Number", "", "-1", "", "", "","",false,false);
+        parameters.push_back(pmintm);
+
+        CommandParameter pmaxtm("maxtm", "Number", "", "-1", "", "", "","",false,false);
+        parameters.push_back(pmaxtm);
+
+        CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false);
+        parameters.push_back(pprocessors);
+
+        CommandParameter potunumber("otunumber", "Number", "", "-1", "", "", "","",false,true,true);
+        parameters.push_back(potunumber);
+
+        CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true);
+        parameters.push_back(ppdiffs);
+
+        CommandParameter pcutoff("cutoff", "Number", "", "100", "", "", "","",false,false);
+        parameters.push_back(pcutoff);
+
+        CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false);
+        parameters.push_back(pinputdir);
+
+        CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false);
+        parameters.push_back(poutputdir);
+
+        //collect the names of everything registered so far
+        vector<string> paramNames;
+        for (int p = 0; p < parameters.size(); p++) {
+            paramNames.push_back(parameters[p].name);
+        }
+        return paramNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//builds the text shown by help(primer.design).
+//The usage example includes otunumber because the constructor aborts when it is missing.
+string PrimerDesignCommand::getHelpString(){
+    try {
+        string helpString = "";
+        helpString += "The primer.design command allows you to identify sequence fragments that are specific to particular OTUs.\n";
+        helpString += "The primer.design command parameters are: list, fasta, name, count, otunumber, cutoff, length, pdiffs, mintm, maxtm, processors and label.\n";
+        helpString += "The list parameter allows you to provide a list file and is required.\n";
+        helpString += "The fasta parameter allows you to provide a fasta file and is required.\n";
+        helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n";
+        helpString += "The count parameter allows you to provide a count file associated with your fasta file.\n";
+        helpString += "The label parameter is used to indicate the label you want to use from your list file.\n";
+        helpString += "The otunumber parameter is used to indicate the otu you want to use from your list file. It is required.\n";
+        helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
+        helpString += "The length parameter is used to indicate the length of the primer. The default is 18.\n";
+        helpString += "The mintm parameter is used to indicate minimum melting temperature.\n";
+        helpString += "The maxtm parameter is used to indicate maximum melting temperature.\n";
+        helpString += "The processors parameter allows you to indicate the number of processors you want to use. Default=1.\n";
+        helpString += "The cutoff parameter allows you to set a percentage of sequences that support the base. For example: cutoff=97 would only return a sequence that only showed ambiguities for bases that were not supported by at least 97% of sequences.\n";
+        helpString += "The primer.design command should be in the following format: primer.design(list=yourListFile, fasta=yourFastaFile, name=yourNameFile, otunumber=yourOtuNumber)\n";
+        helpString += "primer.design(list=final.an.list, fasta=final.fasta, name=final.names, label=0.03, otunumber=3)\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "getHelpString");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//returns the output file name pattern for the requested output type ("fasta", "summary" or "list").
+//Any other type logs an error, sets control_pressed and yields an empty pattern.
+string PrimerDesignCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
+
+        if (type == "fasta") {
+            pattern = "[filename],[distance],otu.cons.fasta";
+        }
+        else if (type == "summary") {
+            pattern = "[filename],[distance],primer.summary";
+        }
+        else if (type == "list") {
+            pattern = "[filename],pick,[extension]";
+        }
+        else {
+            m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n");
+            m->control_pressed = true;
+        }
+
+        return pattern;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "getOutputPattern");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//default constructor: marks the command as aborted/help-only, registers the parameters
+//and declares the output types with empty file lists
+PrimerDesignCommand::PrimerDesignCommand(){
+    try {
+        abort = true;
+        calledHelp = true;
+        setParameters();
+
+        vector<string> noFiles;
+        outputTypes["summary"] = noFiles;
+        outputTypes["fasta"] = noFiles;
+        outputTypes["list"] = noFiles;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "PrimerDesignCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Parses the option string for primer.design.
+//"help" and "citation" print their text and mark the command as aborted. Otherwise every parameter is
+//validated, file parameters are resolved (prefixing inputdir when the file has no path) and the numeric
+//parameters are converted. abort is set when a file cannot be opened, a required file is missing,
+//both name and count are given, or otunumber is less than 1.
+PrimerDesignCommand::PrimerDesignCommand(string option) {
+ try {
+ abort = false; calledHelp = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; calledHelp = true; }
+ else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+ else {
+ //valid parameters for this command
+ vector<string> myArray = setParameters();
+
+ OptionParser parser(option);
+ map<string,string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+ map<string,string>::iterator it;
+ //check to make sure all parameters are valid for command
+ for (it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //declare the output types this command can produce
+ vector<string> tempOutNames;
+ outputTypes["summary"] = tempOutNames;
+ outputTypes["fasta"] = tempOutNames;
+ outputTypes["list"] = tempOutNames;
+
+ //if the user changes the input directory command factory will send this info to us in the output parameter
+ string inputDir = validParameter.validFile(parameters, "inputdir", false);
+ if (inputDir == "not found"){ inputDir = ""; }
+ else {
+ string path;
+ it = parameters.find("count");
+ //user has given a count file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["count"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("fasta");
+ //user has given a fasta file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["fasta"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("name");
+ //user has given a name file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["name"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("list");
+ //user has given a list file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["list"] = inputDir + it->second; }
+ }
+ }
+
+ //check for parameters
+ //name file is optional; when given it becomes the current name file
+ namefile = validParameter.validFile(parameters, "name", true);
+ if (namefile == "not open") { abort = true; }
+ else if (namefile == "not found") { namefile = ""; }
+ else { m->setNameFile(namefile); }
+
+ //count file is optional; when given it becomes the current count file
+ countfile = validParameter.validFile(parameters, "count", true);
+ if (countfile == "not open") { countfile = ""; abort = true; }
+ else if (countfile == "not found") { countfile = ""; }
+ else { m->setCountTableFile(countfile); }
+
+ //get fastafile - it is required
+ fastafile = validParameter.validFile(parameters, "fasta", true);
+ if (fastafile == "not open") { fastafile = ""; abort=true; }
+ else if (fastafile == "not found") {
+ //fall back to the current fasta file, abort if there is none
+ fastafile = m->getFastaFile();
+ if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
+ }else { m->setFastaFile(fastafile); }
+
+ //get listfile - it is required
+ listfile = validParameter.validFile(parameters, "list", true);
+ if (listfile == "not open") { listfile = ""; abort=true; }
+ else if (listfile == "not found") {
+ //fall back to the current list file, abort if there is none
+ listfile = m->getListFile();
+ if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
+ }else { m->setListFile(listfile); }
+
+
+ //name and count are mutually exclusive
+ if ((namefile != "") && (countfile != "")) {
+ m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+ }
+
+
+ //if the user changes the output directory command factory will send this info to us in the output parameter
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
+ outputDir = m->hasPath(listfile); //if user entered a file with a path then preserve it
+ }
+
+ //numeric parameters: each falls back to its default when not supplied
+ string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "100"; }
+ m->mothurConvert(temp, cutoff);
+
+ temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, pdiffs);
+
+ temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "18"; }
+ m->mothurConvert(temp, length);
+
+ //-1 means the user did not set a melting temperature bound
+ temp = validParameter.validFile(parameters, "mintm", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, minTM);
+
+ temp = validParameter.validFile(parameters, "maxtm", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, maxTM);
+
+ //otunumber is 1-based and required; the -1 default triggers the error below
+ temp = validParameter.validFile(parameters, "otunumber", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, otunumber);
+ if (otunumber < 1) { m->mothurOut("[ERROR]: You must provide an OTU number, aborting.\n"); abort = true; }
+
+ temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
+ m->setProcessors(temp);
+ m->mothurConvert(temp, processors);
+
+ //an empty label means "use the first label in the list file"
+ label = validParameter.validFile(parameters, "label", false);
+ if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your inputfile."); m->mothurOutEndLine(); label=""; }
+
+ //neither count nor name was given: hand the fasta file to the parser's getNameFile
+ //NOTE(review): presumably this looks for a name file matching the fasta file - confirm in OptionParser
+ if (countfile == "") {
+ if (namefile == "") {
+ vector<string> files; files.push_back(fastafile);
+ parser.getNameFile(files);
+ }
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PrimerDesignCommand", "PrimerDesignCommand");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//Runs primer.design:
+//  1. reads the list file and sanity checks it against the name/count file,
+//  2. builds one consensus sequence per OTU and writes them to the cons fasta file,
+//  3. extracts every candidate primer from the consensus of the requested OTU and keeps those
+//     whose melting temperature range satisfies mintm/maxtm,
+//  4. searches the other OTUs for those primers and writes the summary file,
+//  5. writes a new list file without the OTUs in which a primer was found.
+//Returns 0 on success or when help was requested / the run was interrupted, 2 when the command aborted.
+int PrimerDesignCommand::execute(){
+    try {
+
+        if (abort == true) { if (calledHelp) { return 0; } return 2; }
+
+        int start = time(NULL);
+        //////////////////////////////////////////////////////////////////////////////
+        //              get file inputs                                             //
+        //////////////////////////////////////////////////////////////////////////////
+
+        //reads list file and selects the label the users specified or the first label
+        getListVector();
+        //release the list on this early return as well, like every other early return below
+        if (otunumber > list->getNumBins()) { m->mothurOut("[ERROR]: You selected an OTU number larger than the number of OTUs you have in your list file, quitting.\n"); delete list; return 0; }
+
+        map<string, int> nameMap;
+        unsigned long int numSeqs;  //used to sanity check the files. numSeqs = total seqs for namefile and uniques for count.
+                                    //list file should have all seqs if namefile was used to create it and only uniques in count file was used.
+
+        if (namefile != "")         { nameMap = m->readNames(namefile, numSeqs); }
+        else if (countfile != "")   { nameMap = readCount(numSeqs); }
+        else                        { numSeqs = list->getNumSeqs(); }
+
+        //sanity check
+        if (numSeqs != list->getNumSeqs()) {
+            if (namefile != "") { m->mothurOut("[ERROR]: Your list file contains " + toString(list->getNumSeqs()) + " sequences, and your name file contains " + toString(numSeqs) + " sequences, aborting. Do you have the correct files? Perhaps you forgot to include the name file when you clustered? \n"); }
+            else if (countfile != "") {
+                m->mothurOut("[ERROR]: Your list file contains " + toString(list->getNumSeqs()) + " sequences, and your count file contains " + toString(numSeqs) + " unique sequences, aborting. Do you have the correct files? Perhaps you forgot to include the count file when you clustered? \n");
+            }
+            m->control_pressed = true;
+        }
+
+        if (m->control_pressed) { delete list; return 0; }
+
+        //////////////////////////////////////////////////////////////////////////////
+        //              process data                                                //
+        //////////////////////////////////////////////////////////////////////////////
+        m->mothurOut("\nFinding consensus sequences for each otu..."); cout.flush();
+
+        vector<Sequence> conSeqs = createProcessesConSeqs(nameMap, numSeqs);
+
+        //the consensus step may have been interrupted or failed; conSeqs is not trustworthy then
+        if (m->control_pressed) { delete list; return 0; }
+
+        map<string, string> variables;
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
+        variables["[distance]"] = list->getLabel();
+        string consFastaFile = getOutputFileName("fasta", variables);
+        outputNames.push_back(consFastaFile); outputTypes["fasta"].push_back(consFastaFile);
+        ofstream out;
+        m->openOutputFile(consFastaFile, out);
+        for (int i = 0; i < conSeqs.size(); i++) { conSeqs[i].printSequence(out); }
+        out.close();
+
+        m->mothurOut("Done.\n\n");
+
+        //never index past the consensus sequences we actually have
+        if (otunumber > (int)conSeqs.size()) {
+            m->mothurOut("[ERROR]: Unable to find a consensus sequence for OTU " + toString(otunumber) + ", quitting.\n");
+            delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;
+        }
+
+        set<string> primers = getPrimer(conSeqs[otunumber-1]);
+
+        if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+        string consSummaryFile = getOutputFileName("summary", variables);
+        outputNames.push_back(consSummaryFile); outputTypes["summary"].push_back(consSummaryFile);
+        ofstream outSum;
+        m->openOutputFile(consSummaryFile, outSum);
+
+        outSum << "PrimerOtu: " << otunumber << " Members: " << list->get(otunumber-1) << endl << "Primers\tminTm\tmaxTm" << endl;
+
+        //find min and max melting points
+        //minTms/maxTms are filled in the iteration order of the primers that are kept,
+        //so index k in these vectors matches the k-th primer left in the set
+        vector<double> minTms;
+        vector<double> maxTms;
+        for (set<string>::iterator it = primers.begin(); it != primers.end();) {
+
+            double minTm, maxTm;
+            findMeltingPoint(*it, minTm, maxTm);
+
+            bool keep = false;
+            if ((minTM == -1) && (maxTM == -1))             { keep = true; } //user did not set min or max Tm so save this primer
+            else if ((minTM == -1) && (maxTm <= maxTM))     { keep = true; } //user set max and no min, keep if below max
+            else if ((maxTM == -1) && (minTm >= minTM))     { keep = true; } //user set min and no max, keep if above min
+            else if ((maxTm <= maxTM) && (minTm >= minTM))  { keep = true; } //keep if above min and below max
+
+            if (keep) {
+                minTms.push_back(minTm);
+                maxTms.push_back(maxTm);
+                outSum << *it << '\t' << minTm << '\t' << maxTm << endl;
+                it++;
+            }else { primers.erase(it++); } //erase because it didn't qualify; post-increment keeps the iterator valid
+        }
+
+        outSum << "\nOTUNumber\tPrimer\tStart\tEnd\tLength\tMismatches\tminTm\tmaxTm\n";
+        outSum.close();
+
+        //check each otu's conseq for each primer in otunumber
+        set<int> otuToRemove = createProcesses(consSummaryFile, minTms, maxTms, primers, conSeqs);
+
+        if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+        //print new list file
+        map<string, string> mvariables;
+        mvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
+        mvariables["[extension]"] = m->getExtension(listfile);
+        string newListFile = getOutputFileName("list", mvariables);
+        outputNames.push_back(newListFile); outputTypes["list"].push_back(newListFile);
+        ofstream outList;
+        m->openOutputFile(newListFile, outList);
+
+        outList << list->getLabel() << '\t' << (list->getNumBins()-otuToRemove.size()) << '\t';
+        for (int j = 0; j < list->getNumBins(); j++) {
+            if (m->control_pressed) { break; }
+            //good otus
+            if (otuToRemove.count(j) == 0) {
+                string bin = list->get(j);
+                if (bin != "") { outList << bin << '\t'; }
+            }
+        }
+        outList << endl;
+        outList.close();
+
+        if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+        //remember the bin count before releasing the list; it is needed for the timing message
+        int numBins = list->getNumBins();
+        delete list;
+
+        m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numBins) + " OTUs.\n");
+
+
+        //output files created by command
+        m->mothurOutEndLine();
+        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+        for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+        m->mothurOutEndLine();
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "execute");
+        exit(1);
+    }
+}
+//********************************************************************/
+//used http://www.biophp.org/minitools/melting_temperature/ as a reference to substitute degenerate bases
+// in order to find the min and max Tm values.
+//Tm = 64.9°C + 41°C x (number of G’s and C’s in the primer – 16.4)/N
+
+/* A = adenine
+ * C = cytosine
+ * G = guanine
+ * T = thymine
+ * R = G A (purine)
+ * Y = T C (pyrimidine)
+ * K = G T (keto)
+ * M = A C (amino)
+ * S = G C (strong bonds)
+ * W = A T (weak bonds)
+ * B = G T C (all but A)
+ * D = G A T (all but C)
+ * H = A C T (all but G)
+ * V = G C A (all but T)
+ * N = A G C T (any) */
+
+//Computes the lowest and highest melting temperature a (possibly degenerate) primer can have,
+//using Tm = 64.9 + 41 * (numGC - 16.4) / N.
+//For the minimum every ambiguity code that can be read as A or T is read that way, so only G, C and S
+//contribute to the GC count. For the maximum every code that can be read as G or C is counted.
+//W (A or T) never contributes. The results are returned through minTm and maxTm; the return value is always 0.
+int PrimerDesignCommand::findMeltingPoint(string primer, double& minTm, double& maxTm){
+    try {
+        const string gcWhenMinimizing = "GCS";
+        const string gcWhenMaximizing = "GCSYRKMDVHBN";
+
+        int lowGC = 0;
+        int highGC = 0;
+        for (int pos = 0; pos < primer.length(); pos++) {
+            char base = toupper(primer[pos]);
+            if (gcWhenMinimizing.find(base) != string::npos) { lowGC++; }
+            if (gcWhenMaximizing.find(base) != string::npos) { highGC++; }
+        }
+
+        minTm = 64.9 + 41 * (lowGC - 16.4) / (double) primer.length();
+        maxTm = 64.9 + 41 * (highGC - 16.4) / (double) primer.length();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "findMeltingPoint");
+        exit(1);
+    }
+}
+//********************************************************************/
+//search for a primer over the sequence string
+//Slides a window of `length` bases over rawSequence and records every position where the window
+//differs from primer by at most pdiffs bases (as counted by countDiffs).
+//For each hit the start, end (start+length) and mismatch count are appended to the output vectors.
+//Returns true when at least one hit was recorded.
+bool PrimerDesignCommand::findPrimer(string rawSequence, string primer, vector<int>& primerStart, vector<int>& primerEnd, vector<int>& mismatches){
+    try {
+        bool foundAtLeastOne = false;  //innocent til proven guilty
+
+        //the sequence must be long enough to hold the primer
+        if(rawSequence.length() < primer.length()) { return false;  }
+
+        //last valid window start, computed as a signed value so a sequence shorter than
+        //the window gives a negative number instead of wrapping around as unsigned
+        int lastStart = (int)rawSequence.length() - length;
+        if (lastStart < 0) { return false; }
+
+        //search for primer, including the final window that ends on the last base
+        for (int j = 0; j <= lastStart; j++){
+
+            if (m->control_pressed) {  return foundAtLeastOne; }
+
+            string rawChunk = rawSequence.substr(j, length);
+
+            int numDiff = countDiffs(primer, rawChunk);
+
+            if(numDiff <= pdiffs){
+                primerStart.push_back(j);
+                primerEnd.push_back(j+length);
+                mismatches.push_back(numDiff);
+                foundAtLeastOne = true;
+            }
+        }
+
+        return foundAtLeastOne;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "findPrimer");
+        exit(1);
+    }
+}
+//********************************************************************/
+//find all primers for the given sequence
+//Returns the set of all distinct fragments of `length` bases found in the unaligned form of primerSeq.
+//A sequence shorter than `length` yields an empty set.
+set<string> PrimerDesignCommand::getPrimer(Sequence primerSeq){
+    try {
+        set<string> primers;
+
+        string rawSequence = primerSeq.getUnaligned();
+
+        //last valid window start, computed as a signed value so a short sequence
+        //gives a negative number instead of wrapping around as unsigned
+        int lastStart = (int)rawSequence.length() - length;
+
+        //include the final window that ends on the last base
+        for (int j = 0; j <= lastStart; j++){
+            if (m->control_pressed) { break; }
+
+            string primer = rawSequence.substr(j, length);
+            primers.insert(primer);
+        }
+
+        return primers;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "getPrimer");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Splits the consensus sequences (one per OTU) between `processors` workers, runs driver on each slice
+//and returns the union of the OTU indexes the workers flagged.
+//On unix-like systems the extra workers are forked children that report back through temp files;
+//otherwise they are threads that report back through primerDesignData structs.
+//Each extra worker writes its summary lines to newSummaryFile + <id> + ".temp"; these are appended
+//to newSummaryFile and removed at the end.
+//Sets control_pressed when a worker reports fewer OTUs processed than it was assigned.
+set<int> PrimerDesignCommand::createProcesses(string newSummaryFile, vector<double>& minTms, vector<double>& maxTms, set<string>& primers, vector<Sequence>& conSeqs) {
+ try {
+
+ vector<int> processIDS;
+ int process = 1;
+ set<int> otusToRemove;
+ int numBinsProcessed = 0;
+
+ //sanity check - never use more workers than there are OTUs
+ int numBins = conSeqs.size();
+ if (numBins < processors) { processors = numBins; }
+
+ //divide the otus between the processors
+ //every worker gets numBins / processors OTUs, the last one also takes the remainder
+ vector<linePair> lines;
+ int numOtusPerProcessor = numBins / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numOtusPerProcessor;
+ int endIndex = (i+1) * numOtusPerProcessor;
+ if(i == (processors - 1)){ endIndex = numBins; }
+ lines.push_back(linePair(startIndex, endIndex));
+ }
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ //child: process slice lines[process] and report the result through temp files named after the pid
+ //clear old file because we append in driver
+ m->mothurRemove(newSummaryFile + toString(getpid()) + ".temp");
+
+ otusToRemove = driver(newSummaryFile + toString(getpid()) + ".temp", minTms, maxTms, primers, conSeqs, lines[process].start, lines[process].end, numBinsProcessed);
+
+ //temp file layout: number of OTUs processed, number of flagged OTUs, then one flagged OTU index per line
+ string tempFile = toString(getpid()) + ".otus2Remove.temp";
+ ofstream outTemp;
+ m->openOutputFile(tempFile, outTemp);
+
+ outTemp << numBinsProcessed << endl;
+ outTemp << otusToRemove.size() << endl;
+ for (set<int>::iterator it = otusToRemove.begin(); it != otusToRemove.end(); it++) { outTemp << *it << endl; }
+ outTemp.close();
+
+ exit(0);
+ }else {
+ //fork failed: signal the children created so far and quit
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
+
+ //do my part
+ otusToRemove = driver(newSummaryFile, minTms, maxTms, primers, conSeqs, lines[0].start, lines[0].end, numBinsProcessed);
+
+ //force parent to wait until all the processes are done
+ //one wait() call is made per child created
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ //read each child's result file; child i worked on slice lines[i+1]
+ for (int i = 0; i < processIDS.size(); i++) {
+ string tempFile = toString(processIDS[i]) + ".otus2Remove.temp";
+ ifstream intemp;
+ m->openInputFile(tempFile, intemp);
+
+ int num;
+ intemp >> num; m->gobble(intemp);
+ if (num != (lines[i+1].end - lines[i+1].start)) { m->mothurOut("[ERROR]: process " + toString(processIDS[i]) + " did not complete processing all OTUs assigned to it, quitting.\n"); m->control_pressed = true; }
+ intemp >> num; m->gobble(intemp);
+ for (int k = 0; k < num; k++) {
+ int otu;
+ intemp >> otu; m->gobble(intemp);
+ otusToRemove.insert(otu);
+ }
+ intemp.close();
+ m->mothurRemove(tempFile);
+ }
+
+
+ #else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the primerDesignData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<primerDesignData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+ // Allocate memory for thread data.
+ string extension = toString(i) + ".temp";
+ m->mothurRemove(newSummaryFile+extension);
+
+ primerDesignData* tempPrimer = new primerDesignData((newSummaryFile+extension), m, lines[i].start, lines[i].end, minTms, maxTms, primers, conSeqs, pdiffs, otunumber, length, i);
+ pDataArray.push_back(tempPrimer);
+ processIDS.push_back(i);
+
+ //MyPrimerThreadFunction is the thread entry point. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyPrimerThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+ }
+
+
+ //using the main process as a worker saves time and memory
+ otusToRemove = driver(newSummaryFile, minTms, maxTms, primers, conSeqs, lines[0].start, lines[0].end, numBinsProcessed);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ //also merge each thread's flagged OTUs and check that it processed its whole slice
+ for(int i=0; i < pDataArray.size(); i++){
+ for (set<int>::iterator it = pDataArray[i]->otusToRemove.begin(); it != pDataArray[i]->otusToRemove.end(); it++) {
+ otusToRemove.insert(*it);
+ }
+ int num = pDataArray[i]->numBinsProcessed;
+ if (num != (lines[processIDS[i]].end - lines[processIDS[i]].start)) { m->mothurOut("[ERROR]: process " + toString(processIDS[i]) + " did not complete processing all OTUs assigned to it, quitting.\n"); m->control_pressed = true; }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+#endif
+
+ //append output files
+ //processIDS holds child pids (fork version) or thread numbers (thread version), matching the temp file names
+ for(int i=0;i<processIDS.size();i++){
+ m->appendFiles((newSummaryFile + toString(processIDS[i]) + ".temp"), newSummaryFile);
+ m->mothurRemove((newSummaryFile + toString(processIDS[i]) + ".temp"));
+ }
+
+ return otusToRemove;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PrimerDesignCommand", "createProcesses");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//Searches the consensus sequences with indexes [start, end) for every primer and appends one summary
+//line per hit to summaryFileName. The OTU the primers were designed from (otunumber) is not searched.
+//numBinsProcessed is incremented for every OTU visited, searched or not.
+//Returns the indexes of the OTUs in which at least one primer was found.
+set<int> PrimerDesignCommand::driver(string summaryFileName, vector<double>& minTms, vector<double>& maxTms, set<string>& primers, vector<Sequence>& conSeqs, int start, int end, int& numBinsProcessed){
+    try {
+        set<int> flaggedOtus;
+
+        ofstream summary;
+        m->openOutputFileAppend(summaryFileName, summary);
+
+        for (int bin = start; bin < end; bin++) {
+
+            if (m->control_pressed) { break; }
+
+            //the source OTU is counted as processed but never searched
+            if (bin == (otunumber-1)) { numBinsProcessed++; continue; }
+
+            //tmIndex walks minTms/maxTms in step with the primer set
+            int tmIndex = 0;
+            for (set<string>::iterator primerIt = primers.begin(); primerIt != primers.end(); primerIt++) {
+                vector<int> hitStarts;
+                vector<int> hitEnds;
+                vector<int> hitMismatches;
+
+                //if we found it report every hit to the table
+                if (findPrimer(conSeqs[bin].getUnaligned(), (*primerIt), hitStarts, hitEnds, hitMismatches)) {
+                    for (int hit = 0; hit < hitStarts.size(); hit++) {
+                        summary << (bin+1) << '\t' << *primerIt << '\t' << hitStarts[hit] << '\t' << hitEnds[hit] << '\t' << length << '\t' << hitMismatches[hit] << '\t' << minTms[tmIndex] << '\t' << maxTms[tmIndex] << endl;
+                    }
+                    flaggedOtus.insert(bin);
+                }
+                tmIndex++;
+            }
+            numBinsProcessed++;
+        }
+        summary.close();
+
+        return flaggedOtus;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "driver");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+//Reads the sequences of fastafile starting at file position `start` and tallies, per OTU and per
+//alignment column, how often each base occurs.
+//The result is indexed counts[otu][column][base] with base 0=A, 1=T, 2=G, 3=C, 4=anything else.
+//otuCounts[otu] accumulates the number of sequences seen for each OTU, and fastaCount the number of
+//sequences read (weighted by the name file counts when a name file is used).
+//Sets control_pressed and stops when sequences differ in length or a sequence is missing from the
+//name/count/list data.
+vector< vector< vector<unsigned int> > > PrimerDesignCommand::driverGetCounts(map<string, int>& nameMap, unsigned long int& fastaCount, vector<unsigned int>& otuCounts, unsigned long long& start, unsigned long long& end){
+ try {
+ vector< vector< vector<unsigned int> > > counts;
+ map<string, int> seq2Bin;
+ alignedLength = 0;
+
+ ifstream in;
+ m->openInputFile(fastafile, in);
+
+ in.seekg(start);
+
+ bool done = false;
+ fastaCount = 0;
+
+ while (!done) {
+ if (m->control_pressed) { in.close(); return counts; }
+
+ Sequence seq(in); m->gobble(in);
+
+ if (seq.getName() != "") {
+ //the first sequence fixes the alignment length and triggers initializeCounts
+ //NOTE(review): presumably initializeCounts sizes counts/otuCounts and fills seq2Bin from the list - confirm
+ if (fastaCount == 0) { alignedLength = seq.getAligned().length(); initializeCounts(counts, alignedLength, seq2Bin, nameMap, otuCounts); }
+ else if (alignedLength != seq.getAligned().length()) {
+ m->mothurOut("[ERROR]: your sequences are not all the same length. primer.design requires sequences to be aligned."); m->mothurOutEndLine(); m->control_pressed = true; break;
+ }
+
+ //num is the weight of this sequence: its count from nameMap when a name or count file is used, 1 otherwise
+ int num = 1;
+ map<string, int>::iterator itCount;
+ if (namefile != "") {
+ itCount = nameMap.find(seq.getName());
+ if (itCount == nameMap.end()) { m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your name file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; break; }
+ else { num = itCount->second; }
+ //with a name file fastaCount is the total number of sequences represented
+ fastaCount+=num;
+ }else if (countfile != "") {
+ itCount = nameMap.find(seq.getName());
+ if (itCount == nameMap.end()) { m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your count file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; break; }
+ else { num = itCount->second; }
+ //with a count file fastaCount is the number of unique sequences
+ fastaCount++;
+ }else {
+ fastaCount++;
+ }
+
+ //increment counts
+ //itCount->second is the index of the OTU this sequence belongs to
+ itCount = seq2Bin.find(seq.getName());
+ if (itCount == seq2Bin.end()) {
+ if ((namefile != "") || (countfile != "")) {
+ m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your list file, aborting. Perhaps you forgot to include your name or count file while clustering.\n"); m->mothurOutEndLine(); m->control_pressed = true; break;
+ }else{
+ m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your list file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; break;
+ }
+ }else {
+ otuCounts[itCount->second] += num;
+ string aligned = seq.getAligned();
+ for (int i = 0; i < alignedLength; i++) {
+ char base = toupper(aligned[i]);
+ if (base == 'A') { counts[itCount->second][i][0]+=num; }
+ else if (base == 'T') { counts[itCount->second][i][1]+=num; }
+ else if (base == 'G') { counts[itCount->second][i][2]+=num; }
+ else if (base == 'C') { counts[itCount->second][i][3]+=num; }
+ else { counts[itCount->second][i][4]+=num; }
+ }
+ }
+
+ }
+
+ //stop once the read position reaches `end` or tellg() fails; the non-unix build reads to end of file instead
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ unsigned long long pos = in.tellg();
+ if ((pos == -1) || (pos >= end)) { break; }
+#else
+ if (in.eof()) { break; }
+#endif
+ }
+
+ in.close();
+
+ return counts;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PrimerDesignCommand", "driverGetCounts");
+ exit(1);
+ }
+}
+/**************************************************************************************************
+ * Builds one consensus Sequence per OTU (bin) from the per-position base tallies of the fasta file.
+ *
+ * On unix-like builds the fasta file is split into byte ranges with m->divideFile(); the loop
+ * forks until `process` reaches `processors`. Each child runs driverGetCounts() on its own
+ * range, writes its tallies to "<pid>.cons_counts.temp" and exits. The parent handles
+ * lines[0], waits for the children, adds every temp file into its own counts and otuCounts
+ * and removes the temp file. On other builds driverGetCounts() is called once.
+ *
+ * nameMap - forwarded unchanged to driverGetCounts().
+ * numSeqs - expected number of sequences; if the accumulated fastaCount differs,
+ *           m->control_pressed is set (an error is printed only when neither a name file nor
+ *           a count file is in use).
+ *
+ * Returns the consensus sequences, labelled "Otu" followed by the zero-padded bin number, or
+ * an empty vector when m->control_pressed is set.
+ **************************************************************************************************/
+vector<Sequence> PrimerDesignCommand::createProcessesConSeqs(map<string, int>& nameMap, unsigned long int& numSeqs) {
+ try {
+ vector< vector< vector<unsigned int> > > counts; //otu < spot_in_alignment < counts_for_A,T,G,C,Gap > >
+ vector<unsigned int> otuCounts; //one total per otu, later passed to getBase as the divisor
+ vector<int> processIDS;
+ int process = 1; //index of the next range handed to a child; range 0 is kept for the parent
+ unsigned long int fastaCount = 0;
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
+ vector<unsigned long long> positions;
+ vector<fastaLinePair> lines;
+ positions = m->divideFile(fastafile, processors);
+ for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(fastaLinePair(positions[i], positions[(i+1)])); } //consecutive offsets become [start, end) ranges
+
+ //fork one child per remaining range; the parent keeps looping until process == processors
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //remember the child's pid; it names the temp file the parent reads back later
+ process++;
+ }else if (pid == 0){
+ counts = driverGetCounts(nameMap, fastaCount, otuCounts, lines[process].start, lines[process].end);
+
+ string tempFile = toString(getpid()) + ".cons_counts.temp";
+ ofstream outTemp;
+ m->openOutputFile(tempFile, outTemp);
+
+ outTemp << fastaCount << endl;
+ //pass counts: number of otus, then for each otu its length followed by one line of 5 tallies per position
+ outTemp << counts.size() << endl;
+ for (int i = 0; i < counts.size(); i++) {
+ outTemp << counts[i].size() << endl;
+ for (int j = 0; j < counts[i].size(); j++) {
+ for (int k = 0; k < 5; k++) { outTemp << counts[i][j][k] << '\t'; }
+ outTemp << endl;
+ }
+ }
+ //pass otuCounts: number of entries, then the tab separated values
+ outTemp << otuCounts.size() << endl;
+ for (int i = 0; i < otuCounts.size(); i++) { outTemp << otuCounts[i] << '\t'; }
+ outTemp << endl;
+ outTemp.close();
+
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
+
+ //do my part: the parent tallies the first range itself
+ counts = driverGetCounts(nameMap, fastaCount, otuCounts, lines[0].start, lines[0].end);
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ for (int i = 0; i < processIDS.size(); i++) {
+ string tempFile = toString(processIDS[i]) + ".cons_counts.temp";
+ ifstream intemp;
+ m->openInputFile(tempFile, intemp);
+
+ unsigned long int num;
+ intemp >> num; m->gobble(intemp); fastaCount += num; //first value is the child's fastaCount
+ intemp >> num; m->gobble(intemp); //second value is the child's number of otus
+ if (num != counts.size()) { m->mothurOut("[ERROR]: " + tempFile + " was not built correctly by the child process, quitting.\n"); m->control_pressed = true; }
+ else {
+ //read counts and add them to the parent's tallies
+ for (int k = 0; k < num; k++) {
+ int alength;
+ intemp >> alength; m->gobble(intemp);
+ if (alength != alignedLength) { m->mothurOut("[ERROR]: your sequences are not all the same length. primer.design requires sequences to be aligned."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else {
+ for (int j = 0; j < alength; j++) {
+ for (int l = 0; l < 5; l++) { unsigned int numTemp; intemp >> numTemp; m->gobble(intemp); counts[k][j][l] += numTemp; }
+ }
+ }
+ }
+ //read otuCounts and add them to the parent's totals
+ intemp >> num; m->gobble(intemp);
+ for (int k = 0; k < num; k++) {
+ unsigned int numTemp; intemp >> numTemp; m->gobble(intemp);
+ otuCounts[k] += numTemp;
+ }
+ }
+ intemp.close();
+ m->mothurRemove(tempFile);
+ }
+
+
+#else
+ unsigned long long start = 0;
+ unsigned long long end = 1000;
+ counts = driverGetCounts(nameMap, fastaCount, otuCounts, start, end);
+#endif
+
+ //you will have a nameMap error if there is a namefile or countfile, but if those aren't given we want to make sure the fasta and list file match.
+ if (fastaCount != numSeqs) {
+ if ((namefile == "") && (countfile == "")) { m->mothurOut("[ERROR]: Your list file contains " + toString(list->getNumSeqs()) + " sequences, and your fasta file contains " + toString(fastaCount) + " sequences, aborting. Do you have the correct files? Perhaps you forgot to include the name or count file? \n"); }
+ m->control_pressed = true;
+ }
+
+ vector<Sequence> conSeqs;
+
+ if (m->control_pressed) { return conSeqs; }
+
+ //build consensus seqs, one per otu
+ string snumBins = toString(counts.size()); //its length is the width the bin numbers are zero-padded to
+ for (int i = 0; i < counts.size(); i++) {
+ if (m->control_pressed) { break; }
+
+ string otuLabel = "Otu";
+ string sbinNumber = toString(i+1);
+ if (sbinNumber.length() < snumBins.length()) {
+ int diff = snumBins.length() - sbinNumber.length();
+ for (int h = 0; h < diff; h++) { otuLabel += "0"; }
+ }
+ otuLabel += sbinNumber;
+
+ string cons = "";
+ for (int j = 0; j < counts[i].size(); j++) {
+ cons += getBase(counts[i][j], otuCounts[i]); //one consensus character per alignment position
+ }
+ Sequence consSeq(otuLabel, cons);
+ conSeqs.push_back(consSeq);
+ }
+
+ if (m->control_pressed) { conSeqs.clear(); return conSeqs; }
+
+ return conSeqs;
+
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PrimerDesignCommand", "createProcessesConSeqs");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+char PrimerDesignCommand::getBase(vector<unsigned int> counts, int size){ //A,T,G,C,Gap
+    try{
+        /* Picks the consensus character for one alignment position.
+         *
+         * counts - tallies for A, T, G, C and gap, in that order (taken by value, so the
+         *          zeroing below never touches the caller's data).
+         * size   - the number each tally is divided by to obtain its share.
+         *
+         * A tally is kept only when its share is at least (100 - cutoff) / 100. The bases
+         * that remain select an IUPAC code:
+         *   A, C, G, T          single base
+         *   R = G A   Y = T C   K = G T   M = A C   S = G C   W = A T
+         *   B = G T C   D = G A T   H = A C T   V = G C A   N = A G C T
+         * The code is upper case when the gap tally was zeroed and lower case when it
+         * remains. With no base left the result is '-' if only the gap remains and 'N'
+         * if nothing remains. */
+
+        //zero out counts that don't make the cutoff
+        float minShare = (100.0 - cutoff) / 100.0;
+        for (int slot = 0; slot < counts.size(); slot++) {
+            float share = counts[slot] / (float) size;
+            if (share < minShare) { counts[slot] = 0; }
+        }
+
+        //presence mask: bit 0 = A, bit 1 = T, bit 2 = G, bit 3 = C
+        int present = 0;
+        for (int base = 0; base < 4; base++) {
+            if (counts[base] != 0) { present |= (1 << base); }
+        }
+
+        //both tables are indexed by the presence mask; index 0 means no base survived
+        const char* withoutGap = "NATWGRKDCMYHSVBN";
+        const char* withGap    = "-atwgrkdcmyhsvbn";
+
+        char conBase = (counts[4] != 0) ? withGap[present] : withoutGap[present];
+
+        return conBase;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "getBase");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+int PrimerDesignCommand::initializeCounts(vector< vector< vector<unsigned int> > >& counts, int length, map<string, int>& seq2Bin, map<string, int>& nameMap, vector<unsigned int>& otuCounts){
+    try {
+        // Resets the three output containers and sizes them for the current list:
+        //   counts    - otu < spot_in_alignment < counts_for_A,T,G,C,Gap > >, all zero
+        //   otuCounts - one zero per otu
+        //   seq2Bin   - sequence name -> index of the otu that lists it
+        // Always returns 0.
+        counts.clear();
+        otuCounts.clear();
+        seq2Bin.clear();
+
+        //lets be smart and only map the unique names if a name or count file was given to save search time and memory
+        const bool uniquesOnly = (namefile != "") || (countfile != "");
+
+        //zeroed tallies for a single otu, copied once per bin below
+        vector<unsigned int> emptySpot; emptySpot.resize(5, 0); //A,T,G,C,Gap
+        vector< vector<unsigned int> > emptyOtu;
+        for (int spot = 0; spot < length; spot++) {
+            emptyOtu.push_back(emptySpot);
+        }
+
+        for (int bin = 0; bin < list->getNumBins(); bin++) {
+            string binNames = list->get(bin);
+            vector<string> members;
+            m->splitAtComma(binNames, members);
+            otuCounts.push_back(0);
+
+            for (int j = 0; j < members.size(); j++) {
+                //with a name or count file, skip names that are not keys of nameMap
+                if (uniquesOnly && (nameMap.find(members[j]) == nameMap.end())) { continue; }
+                seq2Bin[members[j]] = bin;
+            }
+
+            counts.push_back(emptyOtu);
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "initializeCounts");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+map<string, int> PrimerDesignCommand::readCount(unsigned long int& numSeqs){
+    try {
+        // Reads countfile into a CountTable and returns a map from each name reported by
+        // getNamesOfSeqs() to the value of getNumSeqs(name).
+        // numSeqs is set to the table's getNumUniqueSeqs().
+        CountTable table;
+        table.readTable(countfile);
+        vector<string> seqNames = table.getNamesOfSeqs();
+        numSeqs = table.getNumUniqueSeqs();
+
+        map<string, int> abundances;
+        for (vector<string>::iterator it = seqNames.begin(); it != seqNames.end(); ++it) {
+            //stop early, returning what has been collected so far
+            if (m->control_pressed) { break; }
+
+            abundances[*it] = table.getNumSeqs(*it);
+        }
+
+        return abundances;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "readCount");
+        exit(1);
+    }
+}
+/**********************************************************************************************************************
+ * Loads into the member `list` the ListVector of listfile that corresponds to the member `label`.
+ *
+ * With an empty `label` the first ListVector read is kept and `label` is set to its label.
+ * Otherwise ListVectors are read one after another until one carries the requested label, or
+ * until m->anyLabelsToProcess() returns true, in which case the ListVector of the previously
+ * seen label is re-read and used instead. If the file ends without either happening, a message
+ * is printed and the last label seen is used.
+ *
+ * Always returns 0, also when m->control_pressed interrupts the search.
+ **********************************************************************************************************************/
+int PrimerDesignCommand::getListVector(){
+ try {
+ InputData input(listfile, "list");
+ list = input.getListVector();
+ string lastLabel = list->getLabel();
+
+ if (label == "") { label = lastLabel; return 0; } //no label requested: keep the first one read
+
+ //if the user enters label "0.06" and there is no "0.06" in their file, use the next lowest label.
+ set<string> labels; labels.insert(label);
+ set<string> processedLabels;
+ set<string> userLabels = labels; //labels still to be found; emptied as they are matched
+
+ //as long as you are not at the end of the file or done with the lines you want
+ while((list != NULL) && (userLabels.size() != 0)) {
+ if (m->control_pressed) { return 0; }
+
+ if(labels.count(list->getLabel()) == 1){ //exact match: keep this ListVector
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
+ break;
+ }
+
+ if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { //NOTE(review): presumably true once the requested label has been passed - confirm against MothurOut
+ string saveLabel = list->getLabel();
+
+ delete list;
+ list = input.getListVector(lastLabel); //fall back to the label seen just before this one
+
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
+
+ //restore real lastlabel to save below
+ list->setLabel(saveLabel);
+ break;
+ }
+
+ lastLabel = list->getLabel();
+
+ //get next line to process
+ //prevent memory leak
+ delete list;
+ list = input.getListVector();
+ }
+
+
+ if (m->control_pressed) { return 0; }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) {
+ m->mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(lastLabel) != 1) {
+ m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+ needToRun = true;
+ }else {
+ m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+ }
+ }
+
+ //run last label if you need to
+ if (needToRun == true) {
+ delete list;
+ list = input.getListVector(lastLabel);
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PrimerDesignCommand", "getListVector");
+ exit(1);
+ }
+}
+//********************************************************************/
+/* A = adenine
+ * C = cytosine
+ * G = guanine
+ * T = thymine
+ * R = G A (purine)
+ * Y = T C (pyrimidine)
+ * K = G T (keto)
+ * M = A C (amino)
+ * S = G C (strong bonds)
+ * W = A T (weak bonds)
+ * B = G T C (all but A)
+ * D = G A T (all but C)
+ * H = A C T (all but G)
+ * V = G C A (all but T)
+ * N = A G C T (any) */
+int PrimerDesignCommand::countDiffs(string oligo, string seq){
+    try {
+        // Counts the positions, over the length of oligo, at which oligo and seq are
+        // incompatible. Both are compared in upper case. Identical characters never count.
+        // Otherwise the oligo character decides which seq characters are still accepted.
+        const int numPositions = oligo.length();
+        int mismatches = 0;
+
+        for (int pos = 0; pos < numPositions; pos++) {
+
+            const char primerBase = toupper(oligo[pos]);
+            const char seqBase = toupper(seq[pos]);
+
+            if (primerBase == seqBase) { continue; }
+
+            const char* accepted = NULL; //seq characters that do not count against primerBase
+            bool mismatch = false;
+
+            switch (primerBase) {
+                //plain bases also accept every ambiguity code that contains them
+                case 'A': accepted = "AMRWDHV"; break;
+                case 'C': accepted = "CYMSBHV"; break;
+                case 'G': accepted = "GRKSBDV"; break;
+                case 'T': accepted = "TYKWBDH"; break;
+                //a gap in the oligo that is not matched exactly is always a difference
+                case '.':
+                case '-': mismatch = true; break;
+                //N or I in the oligo only counts against an N in seq
+                case 'N':
+                case 'I': mismatch = (seqBase == 'N'); break;
+                //ambiguity codes accept only the plain bases they stand for
+                case 'R': accepted = "AG"; break;
+                case 'Y': accepted = "CT"; break;
+                case 'M': accepted = "CA"; break;
+                case 'K': accepted = "TG"; break;
+                case 'W': accepted = "TA"; break;
+                case 'S': accepted = "CG"; break;
+                case 'B': accepted = "CTG"; break;
+                case 'D': accepted = "ATG"; break;
+                case 'H': accepted = "ATC"; break;
+                case 'V': accepted = "ACG"; break;
+                //any other oligo character is never counted
+                default: break;
+            }
+
+            if (accepted != NULL) { mismatch = (string(accepted).find(seqBase) == string::npos); }
+            if (mismatch) { mismatches++; }
+        }
+
+        return mismatches;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PrimerDesignCommand", "countDiffs");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
+
+
--- /dev/null
+//
+// primerdesigncommand.h
+// Mothur
+//
+// Created by Sarah Westcott on 1/18/13.
+// Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#ifndef Mothur_primerdesigncommand_h
+#define Mothur_primerdesigncommand_h
+
+#include "command.hpp"
+#include "listvector.hpp"
+#include "inputdata.h"
+#include "sequence.hpp"
+#include "alignment.hpp"
+#include "needlemanoverlap.hpp"
+
+/**************************************************************************************************
+ * Command class behind "primer.design", described by getDescription() as: identify sequence
+ * fragments that are specific to particular OTUs.
+ *
+ * The private helpers build per-OTU consensus sequences from an aligned fasta file and a list
+ * file (createProcessesConSeqs, driverGetCounts, initializeCounts, getBase) and compare
+ * candidate primers against them (countDiffs; see the .cpp file for the remaining helpers).
+ **************************************************************************************************/
+
+class PrimerDesignCommand : public Command {
+public:
+ PrimerDesignCommand(string);
+ PrimerDesignCommand();
+ ~PrimerDesignCommand(){}
+
+ vector<string> setParameters();
+ string getCommandName() { return "primer.design"; }
+ string getCommandCategory() { return "OTU-Based Approaches"; }
+
+ string getOutputPattern(string);
+ string getHelpString();
+ string getCitation() { return "http://www.mothur.org/wiki/Primer.design"; }
+ string getDescription() { return "identify sequence fragments that are specific to particular OTUs"; }
+
+ int execute();
+ void help() { m->mothurOut(getHelpString()); }
+
+private:
+
+ struct linePair { //pair of int bounds
+ int start;
+ int end;
+ linePair(int i, int j) : start(i), end(j) {}
+ };
+ struct fastaLinePair { //pair of file offsets; createProcessesConSeqs fills these from m->divideFile() positions
+ unsigned long long start;
+ unsigned long long end;
+ fastaLinePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
+ };
+
+ bool abort, allLines, large;
+ int cutoff, pdiffs, length, otunumber, processors, alignedLength; //getBase keeps a tally only if its share reaches (100 - cutoff) percent
+ string outputDir, listfile, namefile, countfile, fastafile, label;
+ double minTM, maxTM;
+ ListVector* list; //assigned by getListVector()
+ vector<string> outputNames;
+
+ int initializeCounts(vector< vector< vector<unsigned int> > >& counts, int length, map<string, int>&, map<string, int>&, vector<unsigned int>&); //zeroes counts and otuCounts, fills the name -> otu map
+ map<string, int> readCount(unsigned long int&); //name -> count read from countfile
+ char getBase(vector<unsigned int> counts, int size); //consensus character for one alignment position
+ int getListVector(); //loads the ListVector for label into list
+ int countDiffs(string, string); //number of incompatible positions between an oligo and a sequence
+ set<string> getPrimer(Sequence);
+ bool findPrimer(string, string, vector<int>&, vector<int>&, vector<int>&);
+ int findMeltingPoint(string primer, double&, double&);
+
+ set<int> createProcesses(string, vector<double>&, vector<double>&, set<string>&, vector<Sequence>&);
+ set<int> driver(string, vector<double>&, vector<double>&, set<string>&, vector<Sequence>&, int, int, int&);
+ vector< vector< vector<unsigned int> > > driverGetCounts(map<string, int>&, unsigned long int&, vector<unsigned int>&, unsigned long long&, unsigned long long&); //tallies bases for one byte range of the fasta file
+ vector<Sequence> createProcessesConSeqs(map<string, int>&, unsigned long int&); //builds one consensus sequence per otu
+
+};
+
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct primerDesignData {
+    //inputs read by the thread function
+    string summaryFileName;         //file the thread appends its result rows to
+    MothurOut* m;
+    int start;                      //first consensus sequence index handled (inclusive)
+    int end;                        //last consensus sequence index handled (exclusive)
+    int pdiffs, threadID, otunumber, length;
+    set<string> primers;
+    vector<double> minTms, maxTms;  //indexed in the iteration order of primers
+    //outputs written by the thread function
+    set<int> otusToRemove;
+    vector<Sequence> consSeqs;
+    int numBinsProcessed;
+
+    primerDesignData(){}
+
+    //copies every argument into the matching field and starts numBinsProcessed at 0
+    primerDesignData(string sf, MothurOut* mout, int st, int en, vector<double> min, vector<double> max, set<string> pri, vector<Sequence> seqs, int d, int otun, int l, int tid)
+        : summaryFileName(sf),
+          m(mout),
+          start(st),
+          end(en),
+          pdiffs(d),
+          threadID(tid),
+          otunumber(otun),
+          length(l),
+          primers(pri),
+          minTms(min),
+          maxTms(max),
+          consSeqs(seqs),
+          numBinsProcessed(0) {}
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+// Thread entry point, compiled only when none of the unix-like macros above is defined.
+// lpParam points to a primerDesignData. For every consensus sequence index in [start, end)
+// other than otunumber-1, each primer is slid along the unaligned consensus sequence in
+// windows of `length` characters. A window with at most pdiffs differences is a hit.
+// Hits are appended to summaryFileName as one row each:
+//   otu number, primer, start, end, length, mismatches, minTm, maxTm
+// The index of every sequence with a hit is added to otusToRemove, and numBinsProcessed is
+// incremented for every index visited.
+// findPrimer and countDiffs are member functions, so their logic is repeated inline here.
+// Returns 0.
+static DWORD WINAPI MyPrimerThreadFunction(LPVOID lpParam){
+ primerDesignData* pDataArray;
+ pDataArray = (primerDesignData*)lpParam;
+
+ try {
+ ofstream outSum;
+ pDataArray->m->openOutputFileAppend(pDataArray->summaryFileName, outSum);
+
+ for (int i = pDataArray->start; i < pDataArray->end; i++) {
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ if (i != (pDataArray->otunumber-1)) {
+ int primerIndex = 0; //position of *it within primers, used to index minTms and maxTms
+ for (set<string>::iterator it = pDataArray->primers.begin(); it != pDataArray->primers.end(); it++) {
+ vector<int> primerStarts;
+ vector<int> primerEnds;
+ vector<int> mismatches;
+
+ //bool found = findPrimer(conSeqs[i].getUnaligned(), (*it), primerStarts, primerEnds, mismatches);
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ bool found = false; //innocent til proven guilty
+
+ string rawSequence = pDataArray->consSeqs[i].getUnaligned();
+ string primer = *it;
+
+ //look for exact match
+ if(rawSequence.length() < primer.length()) { found = false; }
+ else {
+ //search for primer
+ for (int j = 0; j < rawSequence.length()-pDataArray->length; j++){
+
+ if (pDataArray->m->control_pressed) { found = false; break; }
+
+ string rawChunk = rawSequence.substr(j, pDataArray->length);
+
+ //int numDiff = countDiffs(primer, rawchuck);
+ ///////////////////////////////////////////////////////////////////////
+ int numDiff = 0;
+ string oligo = primer;
+ string seq = rawChunk;
+
+ for(int k=0;k<pDataArray->length;k++){
+
+ oligo[k] = toupper(oligo[k]);
+ seq[k] = toupper(seq[k]);
+
+ if(oligo[k] != seq[k]){
+
+ if((oligo[k] == 'N' || oligo[k] == 'I') && (seq[k] == 'N')) { numDiff++; }
+ else if(oligo[k] == 'R' && (seq[k] != 'A' && seq[k] != 'G')) { numDiff++; }
+ else if(oligo[k] == 'Y' && (seq[k] != 'C' && seq[k] != 'T')) { numDiff++; }
+ else if(oligo[k] == 'M' && (seq[k] != 'C' && seq[k] != 'A')) { numDiff++; }
+ else if(oligo[k] == 'K' && (seq[k] != 'T' && seq[k] != 'G')) { numDiff++; }
+ else if(oligo[k] == 'W' && (seq[k] != 'T' && seq[k] != 'A')) { numDiff++; }
+ else if(oligo[k] == 'S' && (seq[k] != 'C' && seq[k] != 'G')) { numDiff++; }
+ else if(oligo[k] == 'B' && (seq[k] != 'C' && seq[k] != 'T' && seq[k] != 'G')) { numDiff++; }
+ else if(oligo[k] == 'D' && (seq[k] != 'A' && seq[k] != 'T' && seq[k] != 'G')) { numDiff++; }
+ else if(oligo[k] == 'H' && (seq[k] != 'A' && seq[k] != 'T' && seq[k] != 'C')) { numDiff++; }
+ else if(oligo[k] == 'V' && (seq[k] != 'A' && seq[k] != 'C' && seq[k] != 'G')) { numDiff++; }
+ else if(oligo[k] == 'A' && (seq[k] != 'A' && seq[k] != 'M' && seq[k] != 'R' && seq[k] != 'W' && seq[k] != 'D' && seq[k] != 'H' && seq[k] != 'V')) { numDiff++; }
+ else if(oligo[k] == 'C' && (seq[k] != 'C' && seq[k] != 'Y' && seq[k] != 'M' && seq[k] != 'S' && seq[k] != 'B' && seq[k] != 'H' && seq[k] != 'V')) { numDiff++; }
+ else if(oligo[k] == 'G' && (seq[k] != 'G' && seq[k] != 'R' && seq[k] != 'K' && seq[k] != 'S' && seq[k] != 'B' && seq[k] != 'D' && seq[k] != 'V')) { numDiff++; }
+ else if(oligo[k] == 'T' && (seq[k] != 'T' && seq[k] != 'Y' && seq[k] != 'K' && seq[k] != 'W' && seq[k] != 'B' && seq[k] != 'D' && seq[k] != 'H')) { numDiff++; }
+ else if((oligo[k] == '.' || oligo[k] == '-')) { numDiff++; }
+ }
+ }
+ ///////////////////////////////////////////////////////////////////////
+
+ if(numDiff <= pDataArray->pdiffs){
+ primerStarts.push_back(j);
+ primerEnds.push_back(j+pDataArray->length);
+ mismatches.push_back(numDiff);
+ found = true;
+ }
+ }
+ }
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+ //if we found it report to the table
+ if (found) {
+ for (int j = 0; j < primerStarts.size(); j++) {
+ outSum << (i+1) << '\t' << *it << '\t' << primerStarts[j] << '\t' << primerEnds[j] << '\t' << pDataArray->length << '\t' << mismatches[j] << '\t' << pDataArray->minTms[primerIndex] << '\t' << pDataArray->maxTms[primerIndex] << endl;
+ }
+ pDataArray->otusToRemove.insert(i);
+ }
+ primerIndex++;
+ }
+ }
+ pDataArray->numBinsProcessed++;
+ }
+ outSum.close();
+
+ //the function is declared to return DWORD; flowing off the end without a value is undefined behaviour
+ return 0;
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "PrimerDesignCommand", "MyPrimerThreadFunction");
+ exit(1);
+ }
+}
+#endif
+
+/**************************************************************************************************/
+
+
+
+
+
+#endif
try {
m = MothurOut::getInstance();
-
- seqName = "";
+
int score;
+ seqName = getSequenceName(qFile);
+
+ if (!m->control_pressed) {
+ string qScoreString = m->getline(qFile);
+ //cout << qScoreString << endl;
+ while(qFile.peek() != '>' && qFile.peek() != EOF){
+ if (m->control_pressed) { break; }
+ string temp = m->getline(qFile);
+ //cout << temp << endl;
+ qScoreString += ' ' + temp;
+ }
+ //cout << "done reading " << endl;
+ istringstream qScoreStringStream(qScoreString);
+ int count = 0;
+ while(!qScoreStringStream.eof()){
+ if (m->control_pressed) { break; }
+ string temp;
+ qScoreStringStream >> temp; m->gobble(qScoreStringStream);
+
+ //check temp to make sure its a number
+ if (!m->isContainingOnlyDigits(temp)) { m->mothurOut("[ERROR]: In sequence " + seqName + "'s quality scores, expected a number and got " + temp + ", setting score to 0."); m->mothurOutEndLine(); temp = "0"; }
+ convert(temp, score);
+
+ //cout << count << '\t' << score << endl;
+ qScores.push_back(score);
+ count++;
+ }
+ }
- qFile >> seqName;
- m->getline(qFile);
- //cout << seqName << endl;
- if (seqName == "") {
- m->mothurOut("Error reading quality file, name blank at position, " + toString(qFile.tellg()));
- m->mothurOutEndLine();
- }
- else{
- seqName = seqName.substr(1);
- }
-
- string qScoreString = m->getline(qFile);
- //cout << qScoreString << endl;
- while(qFile.peek() != '>' && qFile.peek() != EOF){
- if (m->control_pressed) { break; }
- string temp = m->getline(qFile);
- //cout << temp << endl;
- qScoreString += ' ' + temp;
- }
- //cout << "done reading " << endl;
- istringstream qScoreStringStream(qScoreString);
- int count = 0;
- while(!qScoreStringStream.eof()){
- if (m->control_pressed) { break; }
- string temp;
- qScoreStringStream >> temp; m->gobble(qScoreStringStream);
-
- //check temp to make sure its a number
- if (!m->isContainingOnlyDigits(temp)) { m->mothurOut("[ERROR]: In sequence " + seqName + "'s quality scores, expected a number and got " + temp + ", setting score to 0."); m->mothurOutEndLine(); temp = "0"; }
- convert(temp, score);
-
- //cout << count << '\t' << score << endl;
- qScores.push_back(score);
- count++;
- }
- //qScores.pop_back();
-
-// string scores = "";
-//
-// while(!qFile.eof()){
-//
-// qFile >> seqName;
-//
-// //get name
-// if (seqName.length() != 0) {
-// seqName = seqName.substr(1);
-// while (!qFile.eof()) {
-// char c = qFile.get();
-// //gobble junk on line
-// if (c == 10 || c == 13){ break; }
-// }
-// m->gobble(qFile);
-// }
-//
-// //get scores
-// while(qFile){
-// char letter=qFile.get();
-// if((letter == '>')){ qFile.putback(letter); break; }
-// else if (isprint(letter)) { scores += letter; }
-// }
-// m->gobble(qFile);
-//
-// break;
-// }
-//
-// //convert scores string to qScores
-// istringstream qScoreStringStream(scores);
-//
-// int score;
-// while(!qScoreStringStream.eof()){
-//
-// if (m->control_pressed) { break; }
-//
-// qScoreStringStream >> score;
-// qScores.push_back(score);
-// }
-//
-// qScores.pop_back();
-
seqLength = qScores.size();
//cout << "seqlength = " << seqLength << '\t' << count << endl;
}
}
-
+//********************************************************************************************************************
+string QualityScores::getSequenceName(ifstream& qFile) {
+    try {
+        // Reads the next whitespace-delimited token from qFile as the sequence header and
+        // discards the rest of that line. Returns the header without its first character
+        // and with every ':' replaced by '_'. m->changedSeqNames is set when a replacement
+        // happens. An empty token is reported, sets m->control_pressed, and yields "".
+        string header = "";
+
+        qFile >> header;
+        m->getline(qFile);
+
+        if (header.length() == 0) {
+            m->mothurOut("Error in reading your qfile, at position " + toString(qFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;
+            return header;
+        }
+
+        header = header.substr(1);
+
+        for (string::size_type colon = header.find(':'); colon != string::npos; colon = header.find(':', colon + 1)) {
+            header[colon] = '_';
+            m->changedSeqNames = true;
+        }
+
+        return header;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "QualityScores", "getSequenceName");
+        exit(1);
+    }
+}
+//********************************************************************************************************************
+void QualityScores::setName(string name) {
+    try {
+        // Stores name in seqName after replacing every ':' with '_'.
+        // m->changedSeqNames is set when at least one replacement happens.
+        string::size_type colon = name.find(':');
+        while (colon != string::npos) {
+            name[colon] = '_';
+            m->changedSeqNames = true;
+            colon = name.find(':', colon + 1);
+        }
+
+        seqName = name;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "QualityScores", "setName");
+        exit(1);
+    }
+}
/**************************************************************************************************/
string QualityScores::getName(){
void updateQScoreErrorMap(map<char, vector<int> >&, string, int, int, int);
void updateForwardMap(vector<vector<int> >&, int, int, int);
void updateReverseMap(vector<vector<int> >&, int, int, int);
- void setName(string n) { seqName = n; }
+ void setName(string n);
void setScores(vector<int> qs) { qScores = qs; seqLength = qScores.size(); }
string seqName;
int seqLength;
+
+ string getSequenceName(ifstream&);
};
/**************************************************************************************************/
referenceSeqs.resize(numRefSeqs);
referenceNames.resize(numRefSeqs);
for(int i=0;i<numRefSeqs;i++){
- referenceSeqs[i] = refs[i].getAligned();
+ if (aligned) { referenceSeqs[i] = refs[i].getAligned(); }
+ else { referenceSeqs[i] = refs[i].getUnaligned(); }
referenceNames[i] = refs[i].getName();
}
for(int i=0;i<numRefSeqs;i++){
double length = 0;
- int diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
+ double diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
if(diffs < bestRefDiffs){
bestRefDiffs = diffs;
bestRefLength = length;
int end = refLength - 1;
int maxRow = 0;
- double maxRowValue = -100000000000;
+ double maxRowValue = -2147483647;
for(int i=0;i<queryLength;i++){
if(alignMatrix[i][end] > maxRowValue){
maxRow = i;
end = queryLength - 1;
int maxColumn = 0;
- double maxColumnValue = -100000000000;
+ double maxColumnValue = -2147483647;
for(int j=0;j<refLength;j++){
if(alignMatrix[end][j] > maxColumnValue){
--- /dev/null
+//
+// removedistscommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 1/29/13.
+// Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#include "removedistscommand.h"
+
+//**********************************************************************************************************************
+// Appends the remove.dists parameter definitions (phylip, column, accnos,
+// inputdir, outputdir) to `parameters` and returns the name of every entry
+// that `parameters` holds afterwards.
+vector<string> RemoveDistsCommand::setParameters(){
+    try {
+        parameters.push_back(CommandParameter("phylip", "InputTypes", "", "", "none", "PhylipColumn", "none","phylip",false,false,true));
+        parameters.push_back(CommandParameter("column", "InputTypes", "", "", "none", "PhylipColumn", "none","column",false,false,true));
+        parameters.push_back(CommandParameter("accnos", "InputTypes", "", "", "none", "none", "none","",false,true,true));
+        parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "","",false,false));
+        parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "","",false,false));
+
+        vector<string> paramNames;
+        for (size_t idx = 0; idx != parameters.size(); ++idx) {
+            paramNames.push_back(parameters[idx].name);
+        }
+        return paramNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveDistsCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Returns the multi-line help text shown for the remove.dists command.
+string RemoveDistsCommand::getHelpString(){
+    try {
+        string helpString = "";
+        helpString += "The remove.dists command removes distances from a phylip or column file related to groups or sequences listed in an accnos file.\n";
+        helpString += "The remove.dists command parameters are accnos, phylip and column.\n";
+        // The usage line must name this command (remove.dists), not get.dists.
+        helpString += "The remove.dists command should be in the following format: remove.dists(accnos=yourAccnos, phylip=yourPhylip).\n";
+        helpString += "Example remove.dists(accnos=final.accnos, phylip=final.an.thetayc.0.03.lt.ave.dist).\n";
+        helpString += "Note: No spaces between parameter labels (i.e. accnos), '=' and parameters (i.e.final.accnos).\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveDistsCommand", "getHelpString");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Returns the output file name pattern for the given output type.
+// "phylip" and "column" share one pattern; any other type logs an error,
+// sets m->control_pressed and yields an empty pattern.
+string RemoveDistsCommand::getOutputPattern(string type) {
+    try {
+        if ((type == "phylip") || (type == "column")) {
+            return "[filename],pick,[extension]";
+        }
+
+        m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n");
+        m->control_pressed = true;
+        return "";
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveDistsCommand", "getOutputPattern");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Default constructor: marks the command as aborted/help-only, registers the
+// parameters and creates empty "phylip" and "column" output type entries.
+RemoveDistsCommand::RemoveDistsCommand(){
+    try {
+        abort = true;
+        calledHelp = true;
+        setParameters();
+
+        const vector<string> noOutputs;
+        outputTypes["column"] = noOutputs;
+        outputTypes["phylip"] = noOutputs;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveDistsCommand", "RemoveDistsCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Builds the command from a user option string.
+//  - "help" / "citation" print the corresponding text and leave the command aborted.
+//  - Otherwise the options are parsed and validated, and accnosfile, phylipfile,
+//    columnfile and outputDir are filled in. `abort` is set whenever a parameter
+//    is invalid, a file cannot be opened, or no usable input file is available.
+RemoveDistsCommand::RemoveDistsCommand(string option) {
+    try {
+        abort = false;
+        calledHelp = false;
+
+        //allow user to run help
+        if (option == "help")     { help();     abort = true; calledHelp = true; return; }
+        if (option == "citation") { citation(); abort = true; calledHelp = true; return; }
+
+        vector<string> validNames = setParameters();
+
+        OptionParser parser(option);
+        map<string,string> userParams = parser.getParameters();
+
+        ValidParameters validParameter;
+        map<string,string>::iterator it;
+
+        //every parameter the user supplied must be one this command knows
+        for (it = userParams.begin(); it != userParams.end(); it++) {
+            if (validParameter.isValidParameter(it->first, validNames, it->second) != true) { abort = true; }
+        }
+
+        //start with no output files recorded for either type
+        const vector<string> noOutputs;
+        outputTypes["column"] = noOutputs;
+        outputTypes["phylip"] = noOutputs;
+
+        //an empty outputDir means "not supplied"
+        outputDir = validParameter.validFile(userParams, "outputdir", false);
+        if (outputDir == "not found") { outputDir = ""; }
+
+        //when an input directory is supplied, prefix it to every input file given without a path
+        string inputDir = validParameter.validFile(userParams, "inputdir", false);
+        if (inputDir == "not found") { inputDir = ""; }
+        else {
+            const char* fileParams[] = { "phylip", "column", "accnos" };
+            for (int p = 0; p < 3; p++) {
+                it = userParams.find(fileParams[p]);
+                if (it == userParams.end()) { continue; }
+                //a file that already has a path is left alone
+                if (m->hasPath(it->second) == "") { it->second = inputDir + it->second; }
+            }
+        }
+
+        //accnos is required: fall back to the current accnos file when none was given
+        accnosfile = validParameter.validFile(userParams, "accnos", true);
+        if (accnosfile == "not open") { abort = true; }
+        else if (accnosfile == "not found") {
+            accnosfile = m->getAccnosFile();
+            if (accnosfile == "") {
+                m->mothurOut("You have no valid accnos file and accnos is required."); m->mothurOutEndLine();
+                abort = true;
+            }else {
+                m->mothurOut("Using " + accnosfile + " as input file for the accnos parameter."); m->mothurOutEndLine();
+            }
+        }else { m->setAccnosFile(accnosfile); }
+
+        phylipfile = validParameter.validFile(userParams, "phylip", true);
+        if (phylipfile == "not open") { phylipfile = ""; abort = true; }
+        else if (phylipfile == "not found") { phylipfile = ""; }
+        else { m->setPhylipFile(phylipfile); }
+
+        columnfile = validParameter.validFile(userParams, "column", true);
+        if (columnfile == "not open") { columnfile = ""; abort = true; }
+        else if (columnfile == "not found") { columnfile = ""; }
+        else { m->setColumnFile(columnfile); }
+
+        //at least one distance file was named, nothing more to resolve
+        if ((phylipfile != "") || (columnfile != "")) { return; }
+
+        //neither was named: look for a current file, giving priority to column, then phylip
+        columnfile = m->getColumnFile();
+        if (columnfile != "") {
+            m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine();
+            return;
+        }
+
+        phylipfile = m->getPhylipFile();
+        if (phylipfile != "") {
+            m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine();
+            return;
+        }
+
+        m->mothurOut("No valid current files. You must provide a phylip or column file."); m->mothurOutEndLine();
+        abort = true;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveDistsCommand", "RemoveDistsCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
+// Runs the command: loads the accnos names, filters whichever of the phylip and
+// column files were supplied, lists the output files and records them as the
+// current phylip/column files.
+// Returns 0 on success, after help/citation, or when interrupted; returns 2 when
+// the command was aborted for any other reason.
+int RemoveDistsCommand::execute(){
+    try {
+        if (abort) { return calledHelp ? 0 : 2; }
+
+        //names listed in the accnos file are the ones to remove
+        names = m->readAccnos(accnosfile);
+
+        if (m->control_pressed) { return 0; }
+
+        //filter each supplied distance file
+        if (phylipfile != "") { readPhylip(); }
+        if (columnfile != "") { readColumn(); }
+
+        //interrupted: discard anything already written
+        if (m->control_pressed) {
+            for (size_t f = 0; f < outputNames.size(); f++) { m->mothurRemove(outputNames[f]); }
+            return 0;
+        }
+
+        if (outputNames.empty()) { return 0; }
+
+        m->mothurOutEndLine();
+        m->mothurOut("Output File names: "); m->mothurOutEndLine();
+        for (size_t f = 0; f < outputNames.size(); f++) { m->mothurOut(outputNames[f]); m->mothurOutEndLine(); }
+        m->mothurOutEndLine();
+
+        //the first output of each type becomes the current phylip / column file
+        itTypes = outputTypes.find("phylip");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            m->setPhylipFile((itTypes->second)[0]);
+        }
+
+        itTypes = outputTypes.find("column");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            m->setColumnFile((itTypes->second)[0]);
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveDistsCommand", "execute");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+// Copies phylipfile to the "pick" output file, dropping every row and column
+// that belongs to a name listed in `names` (the accnos set).
+//
+// The file is read twice. Pass one records the index of every sequence that is
+// kept and works out whether the matrix is square or lower-triangle. Pass two
+// writes the kept rows, emitting only the kept columns.
+//
+// The header line of the output is the number of sequences kept, so it is
+// correct whether or not every accnos name was actually present in the matrix.
+//
+// Returns 0, including when the run is interrupted through m->control_pressed.
+int RemoveDistsCommand::readPhylip(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(phylipfile); }
+        map<string, string> variables;
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(phylipfile));
+        variables["[extension]"] = m->getExtension(phylipfile);
+        string outputFileName = getOutputFileName("phylip", variables);
+
+        ifstream in;
+        m->openInputFile(phylipfile, in);
+
+        float distance;
+        int square = 0, nseqs = 0;   //square defaults to lower-triangle if the shape probe hits end of file
+        string name;
+        unsigned int row = 0;
+        set<unsigned int> rows;      //indexes (row == column) of the sequences that are kept
+
+        string numTest;
+        in >> numTest >> name;
+
+        if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+        else { convert(numTest, nseqs); }
+
+        //not one we want to remove
+        if (names.count(name) == 0) { rows.insert(row); }
+        row++;
+
+        //is the matrix square?
+        //d must be an int: a plain char cannot represent EOF reliably
+        int d;
+        while((d=in.get()) != EOF){
+
+            if(isalnum(d)){
+                square = 1;
+                in.putback(static_cast<char>(d));
+                for(int i=0;i<nseqs;i++){
+                    in >> distance;
+                }
+                break;
+            }
+            if(d == '\n'){
+                square = 0;
+                break;
+            }
+        }
+
+        //pass one: map the remaining names to their row/column index
+        for(int i=1;i<nseqs;i++){
+            in >> name;
+            if (names.count(name) == 0) { rows.insert(row); }
+            row++;
+
+            //a lower-triangle row i holds i distances, a square row holds nseqs
+            const int numCols = (square == 0) ? i : nseqs;
+            for(int j=0;j<numCols;j++){
+                if (m->control_pressed) { in.close(); return 0; }
+                in >> distance;
+            }
+        }
+        in.close();
+
+        if (m->control_pressed) { return 0; }
+
+        //pass two: read through the file again, printing only kept rows and columns
+        ifstream inPhylip;
+        m->openInputFile(phylipfile, inPhylip);
+
+        inPhylip >> numTest;
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+        outputTypes["phylip"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        //header = number of sequences that remain in the matrix
+        out << rows.size() << endl;
+
+        unsigned int count = 0;       //accnos names found (and removed)
+        unsigned int keptCount = 0;   //rows written
+        for(int i=0;i<nseqs;i++){
+            inPhylip >> name;
+
+            bool ignoreRow = false;
+            if (names.count(name) != 0) { ignoreRow = true; count++; }
+            else{ out << name << '\t'; keptCount++; }
+
+            const int numCols = (square == 0) ? i : nseqs;
+            for(int j=0;j<numCols;j++){
+                if (m->control_pressed) { inPhylip.close(); out.close(); return 0; }
+                inPhylip >> distance;
+                if (!ignoreRow) {
+                    //is this a column we want
+                    if(rows.count(j) != 0) { out << distance << '\t'; }
+                }
+            }
+            if (!ignoreRow) { out << endl; }
+        }
+        inPhylip.close();
+        out.close();
+
+        if (keptCount == 0) { m->mothurOut("Your file contains ONLY distances related to groups or sequences listed in the accnos file."); m->mothurOutEndLine(); }
+        else if (count != names.size()) {
+            m->mothurOut("[WARNING]: Your accnos file contains " + toString(names.size()) + " groups or sequences, but I only found " + toString(count) + " of them in the phylip file."); m->mothurOutEndLine();
+        }
+
+        m->mothurOut("Removed " + toString(count) + " groups or sequences from your phylip file."); m->mothurOutEndLine();
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveDistsCommand", "readPhylip");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Copies columnfile to the "pick" output file, dropping every record in which
+// either sequence name is listed in `names` (the accnos set).
+//
+// A record is "firstName secondName distance". Reading stops cleanly at end of
+// input. A record that cannot be parsed before end of input is reported as an
+// error and sets m->control_pressed, instead of looping on a failed stream.
+//
+// Returns 0, including when the run is interrupted through m->control_pressed.
+int RemoveDistsCommand::readColumn(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(columnfile); }
+        map<string, string> variables;
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(columnfile));
+        variables["[extension]"] = m->getExtension(columnfile);
+        string outputFileName = getOutputFileName("column", variables);
+        outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(columnfile, in);
+
+        set<string> removeNames;   //accnos names actually seen in the file
+        string firstName, secondName;
+        float distance = 0;
+        bool wrote = false;
+        while (!in.eof()) {
+
+            if (m->control_pressed) { out.close(); in.close(); return 0; }
+
+            //a failed read never sets eof on its own, so it must end the loop here
+            if (!(in >> firstName >> secondName >> distance)) {
+                if (!in.eof()) {
+                    m->mothurOut("[ERROR]: unable to read a distance record from " + columnfile + ", quitting."); m->mothurOutEndLine();
+                    m->control_pressed = true;
+                }
+                break;
+            }
+            m->gobble(in);
+
+            //is either name in the accnos file
+            const bool removeFirst = (names.count(firstName) != 0);
+            const bool removeSecond = (names.count(secondName) != 0);
+
+            if (removeFirst) { removeNames.insert(firstName); }
+            if (removeSecond) { removeNames.insert(secondName); }
+
+            if (!removeFirst && !removeSecond) {
+                wrote = true;
+                out << firstName << '\t' << secondName << '\t' << distance << endl;
+            }
+        }
+        in.close();
+        out.close();
+
+        if (m->control_pressed) { return 0; }
+
+        if (!wrote) { m->mothurOut("Your file contains ONLY distances related to groups or sequences listed in the accnos file."); m->mothurOutEndLine(); }
+        else if (removeNames.size() != names.size()) {
+            m->mothurOut("[WARNING]: Your accnos file contains " + toString(names.size()) + " groups or sequences, but I only found " + toString(removeNames.size()) + " of them in the column file."); m->mothurOutEndLine();
+        }
+
+        m->mothurOut("Removed " + toString(removeNames.size()) + " groups or sequences from your column file."); m->mothurOutEndLine();
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveDistsCommand", "readColumn");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
+
--- /dev/null
+//
+// removedistscommand.h
+// Mothur
+//
+// Created by Sarah Westcott on 1/29/13.
+// Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#ifndef Mothur_removedistscommand_h
+#define Mothur_removedistscommand_h
+
+#include "command.hpp"
+
+// Command "remove.dists": removes distances from a phylip or column file that
+// relate to the groups or sequences listed in an accnos file.
+class RemoveDistsCommand : public Command {
+
+public:
+
+ // Builds the command from a user option string.
+ RemoveDistsCommand(string);
+ // Default constructor; leaves the command in its aborted/help-only state.
+ RemoveDistsCommand();
+ ~RemoveDistsCommand(){}
+
+ // Registers the command's parameters and returns their names.
+ vector<string> setParameters();
+ string getCommandName() { return "remove.dists"; }
+ string getCommandCategory() { return "General"; }
+
+ string getHelpString();
+ // Returns the output file name pattern for an output type ("phylip" or "column").
+ string getOutputPattern(string);
+ string getCitation() { return "http://www.mothur.org/wiki/Remove.dists"; }
+ string getDescription() { return "removes distances from a phylip or column file related to groups or sequences listed in an accnos file"; }
+
+
+ // Runs the command; see the definition for the return codes.
+ int execute();
+ void help() { m->mothurOut(getHelpString()); }
+
+
+private:
+ // Names read from the accnos file in execute(); these are the entries to remove.
+ set<string> names;
+ string accnosfile, phylipfile, columnfile, outputDir;
+ // True when execute() must not do any work.
+ bool abort;
+ // Every output file created by this run.
+ vector<string> outputNames;
+
+ // Filter the phylip / column input into its "pick" output file.
+ int readPhylip();
+ int readColumn();
+
+};
+
+
+#endif
CommandParameter pconstaxonomy("constaxonomy", "InputTypes", "", "", "none", "FNGLT", "none","constaxonomy",false,false); parameters.push_back(pconstaxonomy);
CommandParameter potucorr("otucorr", "InputTypes", "", "", "none", "FNGLT", "none","otucorr",false,false); parameters.push_back(potucorr);
CommandParameter pcorraxes("corraxes", "InputTypes", "", "", "none", "FNGLT", "none","corraxes",false,false); parameters.push_back(pcorraxes);
+ CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none","list",false,false, true); parameters.push_back(plist);
+ CommandParameter pshared("shared", "InputTypes", "", "", "none", "FNGLT", "none","shared",false,false, true); parameters.push_back(pshared);
+ CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
string RemoveOtuLabelsCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The remove.otulabels command can be used to remove specific otus with the output from classify.otu, otu.association, or corr.axes.\n";
- helpString += "The remove.otulabels parameters are: constaxonomy, otucorr, corraxes, and accnos.\n";
+ helpString += "The remove.otulabels command can be used to remove specific otus with the output from classify.otu, otu.association, or corr.axes. It can also be used to select a set of otus from a shared or list file.\n";
+ helpString += "The remove.otulabels parameters are: constaxonomy, otucorr, corraxes, shared, list, label and accnos.\n";
helpString += "The constaxonomy parameter is input the results of the classify.otu command.\n";
helpString += "The otucorr parameter is input the results of the otu.association command.\n";
helpString += "The corraxes parameter is input the results of the corr.axes command.\n";
+ helpString += "The label parameter is used to analyze specific labels in your input. \n";
helpString += "The remove.otulabels commmand should be in the following format: \n";
helpString += "remove.otulabels(accnos=yourListOfOTULabels, corraxes=yourCorrAxesFile)\n";
return helpString;
try {
string pattern = "";
- if (type == "constaxonomy") { pattern = "[filename],pick,[extension]"; }
- else if (type == "otucorr") { pattern = "[filename],pick,[extension]"; }
- else if (type == "corraxes") { pattern = "[filename],pick,[extension]"; }
+ if (type == "constaxonomy") { pattern = "[filename],pick,[extension]"; }
+ else if (type == "otucorr") { pattern = "[filename],pick,[extension]"; }
+ else if (type == "corraxes") { pattern = "[filename],pick,[extension]"; }
+ else if (type == "list") { pattern = "[filename],[distance],pick,[extension]"; }
+ else if (type == "shared") { pattern = "[filename],[distance],pick,[extension]"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
outputTypes["constaxonomy"] = tempOutNames;
outputTypes["otucorr"] = tempOutNames;
outputTypes["corraxes"] = tempOutNames;
+ outputTypes["shared"] = tempOutNames;
+ outputTypes["list"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "RemoveOtuLabelsCommand", "RemoveOtuLabelsCommand");
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["otucorr"] = inputDir + it->second; }
}
+
+ it = parameters.find("list");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["list"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("shared");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["shared"] = inputDir + it->second; }
+ }
}
vector<string> tempOutNames;
outputTypes["constaxonomy"] = tempOutNames;
outputTypes["otucorr"] = tempOutNames;
outputTypes["corraxes"] = tempOutNames;
+ outputTypes["shared"] = tempOutNames;
+ outputTypes["list"] = tempOutNames;
+
//check for parameters
accnosfile = validParameter.validFile(parameters, "accnos", true);
if (otucorrfile == "not open") { otucorrfile = ""; abort = true; }
else if (otucorrfile == "not found") { otucorrfile = ""; }
+ listfile = validParameter.validFile(parameters, "list", true);
+ if (listfile == "not open") { listfile = ""; abort = true; }
+ else if (listfile == "not found") { listfile = ""; }
+ else { m->setListFile(listfile); }
+
+ sharedfile = validParameter.validFile(parameters, "shared", true);
+ if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+ else if (sharedfile == "not found") { sharedfile = ""; }
+ else { m->setSharedFile(sharedfile); }
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
- if ((constaxonomyfile == "") && (corraxesfile == "") && (otucorrfile == "")) { m->mothurOut("You must provide one of the following: constaxonomy, corraxes or otucorr."); m->mothurOutEndLine(); abort = true; }
+ if ((constaxonomyfile == "") && (corraxesfile == "") && (otucorrfile == "") && (sharedfile == "") && (listfile == "")) { m->mothurOut("You must provide one of the following: constaxonomy, corraxes, otucorr, shared or list."); m->mothurOutEndLine(); abort = true; }
+
+ if ((sharedfile != "") || (listfile != "")) {
+ label = validParameter.validFile(parameters, "label", false);
+ if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your inputfile."); m->mothurOutEndLine(); label=""; }
+ }
}
}
if (constaxonomyfile != "") { readClassifyOtu(); }
if (corraxesfile != "") { readCorrAxes(); }
if (otucorrfile != "") { readOtuAssociation(); }
+ if (listfile != "") { readList(); }
+ if (sharedfile != "") { readShared(); }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
m->mothurOutEndLine();
+ string current = "";
+ itTypes = outputTypes.find("list");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
+ }
+
+ itTypes = outputTypes.find("shared");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); }
+ }
+
return 0;
}
catch(exception& e) {
}
}
//**********************************************************************************************************************
+// Loads the selected label of the shared file into `lookup`, then writes a copy
+// that omits every OTU whose bin label is listed in `labels`.
+// m->currentBinLabels is replaced by the labels of the OTUs that remain.
+// Returns 0, including when interrupted through m->control_pressed.
+int RemoveOtuLabelsCommand::readShared(){
+    try {
+
+        getShared();
+
+        if (m->control_pressed) { for (int g = 0; g < lookup.size(); g++) { delete lookup[g]; } return 0; }
+
+        vector<string> keptLabels;
+
+        //one empty vector per group, carrying over that group's label and name
+        vector<SharedRAbundVector*> filtered;
+        for (int g = 0; g < lookup.size(); g++) {
+            SharedRAbundVector* groupCopy = new SharedRAbundVector();
+            groupCopy->setLabel(lookup[g]->getLabel());
+            groupCopy->setGroup(lookup[g]->getGroup());
+            filtered.push_back(groupCopy);
+        }
+
+        int numRemoved = 0;
+        for (int bin = 0; bin < lookup[0]->getNumBins(); bin++) {
+
+            if (m->control_pressed) {
+                for (int g = 0; g < filtered.size(); g++) { delete filtered[g]; }
+                for (int g = 0; g < lookup.size(); g++) { delete lookup[g]; }
+                return 0;
+            }
+
+            //OTUs on the accnos list are dropped
+            if (labels.count(m->currentBinLabels[bin]) != 0) { numRemoved++; continue; }
+
+            //everything else is copied into each group's filtered vector
+            keptLabels.push_back(m->currentBinLabels[bin]);
+            for (int g = 0; g < filtered.size(); g++) {
+                filtered[g]->push_back(lookup[g]->getAbundance(bin), lookup[g]->getGroup());
+            }
+        }
+
+        //at least one OTU survived exactly when a label was kept
+        const bool wroteSomething = !keptLabels.empty();
+
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); }
+        map<string, string> variables;
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile));
+        variables["[extension]"] = m->getExtension(sharedfile);
+        variables["[distance]"] = lookup[0]->getLabel();
+        string outputFileName = getOutputFileName("shared", variables);
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+        outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        for (int g = 0; g < lookup.size(); g++) { delete lookup[g]; }
+
+        //the kept labels are installed before the headers are printed
+        m->currentBinLabels = keptLabels;
+
+        filtered[0]->printHeaders(out);
+
+        for (int g = 0; g < filtered.size(); g++) {
+            out << filtered[g]->getLabel() << '\t' << filtered[g]->getGroup() << '\t';
+            filtered[g]->print(out);
+        }
+        out.close();
+
+        for (int g = 0; g < filtered.size(); g++) { delete filtered[g]; }
+
+        if (!wroteSomething) { m->mothurOut("Your file contains only OTUs from the .accnos file."); m->mothurOutEndLine(); }
+
+        m->mothurOut("Removed " + toString(numRemoved) + " OTUs from your shared file."); m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveOtuLabelsCommand", "readShared");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Loads the selected label of the list file into `list`, then writes a copy
+// that omits every bin whose generated label is listed in `labels`.
+// Bin i (0-based) is labelled "Otu" followed by i+1, zero-padded to the number
+// of digits in the total bin count.
+// Returns 0, including when interrupted through m->control_pressed.
+int RemoveOtuLabelsCommand::readList(){
+    try {
+        getListVector();
+
+        if (m->control_pressed) { delete list; return 0;}
+
+        ListVector newList;
+        newList.setLabel(list->getLabel());
+        int removedCount = 0;
+        const string snumBins = toString(list->getNumBins());
+
+        for (int bin = 0; bin < list->getNumBins(); bin++) {
+
+            if (m->control_pressed) { delete list; return 0;}
+
+            //create a label for this otu, left-padding the number with zeros
+            const string sbinNumber = toString(bin+1);
+            string otuLabel = "Otu";
+            if (sbinNumber.length() < snumBins.length()) {
+                otuLabel.append(snumBins.length() - sbinNumber.length(), '0');
+            }
+            otuLabel += sbinNumber;
+
+            if (labels.count(otuLabel) != 0) { removedCount++; continue; }
+
+            newList.push_back(list->get(bin));
+        }
+
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(listfile); }
+        map<string, string> variables;
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(listfile));
+        variables["[extension]"] = m->getExtension(listfile);
+        variables["[distance]"] = list->getLabel();
+        string outputFileName = getOutputFileName("list", variables);
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        delete list;
+
+        //print new listvector, but only when it still holds a bin
+        const bool wroteSomething = (newList.getNumBins() != 0);
+        if (wroteSomething) { newList.print(out); }
+        out.close();
+
+        if (!wroteSomething) { m->mothurOut("Your file contains only OTUs from the .accnos file."); m->mothurOutEndLine(); }
+        outputNames.push_back(outputFileName); outputTypes["list"].push_back(outputFileName);
+
+        m->mothurOut("Removed " + toString(removedCount) + " OTUs from your list file."); m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RemoveOtuLabelsCommand", "readList");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Reads listfile and leaves in `list` the ListVector for the requested `label`.
+// With an empty `label`, the first vector in the file is used and `label` is
+// set to its label. Otherwise the file is scanned until the label is found; if
+// it is not, an earlier label is used instead (see the messages below).
+// Always returns 0; an interrupt via m->control_pressed also returns 0.
+int RemoveOtuLabelsCommand::getListVector(){
+ try {
+ InputData input(listfile, "list");
+ list = input.getListVector();
+ // NOTE(review): list is dereferenced here before any NULL check — confirm getListVector() cannot return NULL for the first read.
+ string lastLabel = list->getLabel();
+
+ if (label == "") { label = lastLabel; return 0; }
+
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ // NOTE(review): this local `labels` holds only the requested label; it presumably hides the class member of the same name — confirm that is intended.
+ set<string> labels; labels.insert(label);
+ set<string> processedLabels;
+ set<string> userLabels = labels;
+
+ //as long as you are not at the end of the file or done wih the lines you want
+ while((list != NULL) && (userLabels.size() != 0)) {
+ if (m->control_pressed) { return 0; }
+
+ // exact match: keep the current vector and stop scanning
+ if(labels.count(list->getLabel()) == 1){
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
+ break;
+ }
+
+ // anyLabelsToProcess() decides whether to fall back to the previous label (lastLabel)
+ if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = list->getLabel();
+
+ delete list;
+ list = input.getListVector(lastLabel);
+
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
+
+ //restore real lastlabel to save below
+ // NOTE(review): after this call the vector re-read for lastLabel carries saveLabel as its label, and the loop exits immediately — confirm that relabelling is wanted.
+ list->setLabel(saveLabel);
+ break;
+ }
+
+ lastLabel = list->getLabel();
+
+ //get next line to process
+ //prevent memory leak
+ delete list;
+ list = input.getListVector();
+ }
+
+
+ if (m->control_pressed) { return 0; }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) {
+ m->mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(lastLabel) != 1) {
+ m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+ needToRun = true;
+ }else {
+ m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+ }
+ }
+
+ //run last label if you need to
+ if (needToRun == true) {
+ delete list;
+ list = input.getListVector(lastLabel);
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RemoveOtuLabelsCommand", "getListVector");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+// Reads sharedfile and leaves in `lookup` the SharedRAbundVectors for the
+// requested `label`. With an empty `label`, the first set in the file is used
+// and `label` is set to its label. Otherwise the file is scanned until the
+// label is found; if it is not, an earlier label is used instead (see the
+// messages below).
+// Always returns 0; an interrupt via m->control_pressed also returns 0.
+int RemoveOtuLabelsCommand::getShared(){
+ try {
+ InputData input(sharedfile, "sharedfile");
+ lookup = input.getSharedRAbundVectors();
+ // NOTE(review): lookup[0] is dereferenced here without checking that lookup is non-empty — confirm against getSharedRAbundVectors().
+ string lastLabel = lookup[0]->getLabel();
+
+ if (label == "") { label = lastLabel; return 0; }
+
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ // NOTE(review): this local `labels` holds only the requested label; it presumably hides the class member of the same name — confirm that is intended.
+ set<string> labels; labels.insert(label);
+ set<string> processedLabels;
+ set<string> userLabels = labels;
+
+ //as long as you are not at the end of the file or done wih the lines you want
+ // NOTE(review): the end-of-file test reads lookup[0]; this assumes the reader returns a vector with a NULL first element at end of file — confirm.
+ while((lookup[0] != NULL) && (userLabels.size() != 0)) {
+ if (m->control_pressed) { return 0; }
+
+ // exact match: keep the current vectors and stop scanning
+ if(labels.count(lookup[0]->getLabel()) == 1){
+ processedLabels.insert(lookup[0]->getLabel());
+ userLabels.erase(lookup[0]->getLabel());
+ break;
+ }
+
+ // anyLabelsToProcess() decides whether to fall back to the previous label (lastLabel)
+ if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
+
+ for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+ lookup = input.getSharedRAbundVectors(lastLabel);
+
+ processedLabels.insert(lookup[0]->getLabel());
+ userLabels.erase(lookup[0]->getLabel());
+
+ //restore real lastlabel to save below
+ // NOTE(review): only lookup[0] is relabelled with saveLabel, and the loop exits immediately — confirm that relabelling is wanted.
+ lookup[0]->setLabel(saveLabel);
+ break;
+ }
+
+ lastLabel = lookup[0]->getLabel();
+
+ //get next line to process
+ //prevent memory leak
+ for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+ lookup = input.getSharedRAbundVectors();
+ }
+
+
+ if (m->control_pressed) { return 0; }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) {
+ m->mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(lastLabel) != 1) {
+ m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+ needToRun = true;
+ }else {
+ m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+ }
+ }
+
+ //run last label if you need to
+ if (needToRun == true) {
+ for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
+ lookup = input.getSharedRAbundVectors(lastLabel);
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RemoveOtuLabelsCommand", "getShared");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
//
#include "command.hpp"
+#include "inputdata.h"
+#include "listvector.hpp"
+#include "sharedrabundvector.h"
/**************************************************************************************************/
private:
bool abort;
- string outputDir, accnosfile, constaxonomyfile, otucorrfile, corraxesfile;
+ string outputDir, accnosfile, constaxonomyfile, otucorrfile, corraxesfile, listfile, sharedfile, label;
vector<string> outputNames;
set<string> labels;
+ ListVector* list;
+ vector<SharedRAbundVector*> lookup;
int readClassifyOtu();
int readOtuAssociation();
int readCorrAxes();
+ int readList();
+ int readShared();
+ int getListVector();
+ int getShared();
};
/**************************************************************************************************/
vector<string> ScreenSeqsCommand::setParameters(){
try {
CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
+ CommandParameter pcontigsreport("contigsreport", "InputTypes", "", "", "report", "none", "none","contigsreport",false,true,true); parameters.push_back(pcontigsreport);
+ CommandParameter palignreport("alignreport", "InputTypes", "", "", "report", "none", "none","alignreport",false,false); parameters.push_back(palignreport);
+ CommandParameter psummary("summary", "InputTypes", "", "", "report", "none", "none","summary",false,false); parameters.push_back(psummary);
CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false); parameters.push_back(pqfile);
- CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none","alignreport",false,false); parameters.push_back(palignreport);
+
CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false); parameters.push_back(ptax);
CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pstart);
CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pend);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pcriteria);
CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "","",true,false); parameters.push_back(poptimize);
- CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
+
+ //report parameters
+ CommandParameter pminoverlap("minoverlap", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminoverlap);
+ CommandParameter postart("ostart", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(postart);
+ CommandParameter poend("oend", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(poend);
+ CommandParameter pmismatches("mismatches", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmismatches);
+ CommandParameter pmaxn("maxn", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxn);
+ CommandParameter pminscore("minscore", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminscore);
+ CommandParameter pmaxinsert("maxinsert", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxinsert);
+ CommandParameter pminsim("minsim", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminsim);
+
+
vector<string> myArray;
for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
try {
string helpString = "";
helpString += "The screen.seqs command reads a fastafile and screens sequences.\n";
- helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
+ helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, contigsreport, summary, taxonomy, optimize, criteria and processors.\n";
helpString += "The fasta parameter is required.\n";
- helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
+ helpString += "The contigsreport parameter allows you to use the contigsreport file to determine if a sequence is good. Screening parameters include: minoverlap, ostart, oend and mismatches. \n";
+ helpString += "The alignreport parameter allows you to use the alignreport file to determine if a sequence is good. Screening parameters include: minsim, minscore and maxinsert. \n";
+ helpString += "The summary parameter allows you to use the summary file from summary.seqs to save time processing.\n";
+ helpString += "The taxonomy parameter allows you to remove bad seqs from taxonomy files.\n";
helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n";
helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n";
helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
- helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
+ helpString += "The maxn parameter allows you to set and maximum number of N's allowed in a sequence. \n";
+ helpString += "The minoverlap parameter allows you to set and minimum overlap. The default is -1. \n";
+ helpString += "The ostart parameter is used to set an overlap position the \"good\" sequences must start by. The default is -1. \n";
+ helpString += "The oend parameter is used to set an overlap position the \"good\" sequences must end after. The default is -1.\n";
+ helpString += "The mismatches parameter allows you to set and maximum mismatches in the contigs.report. \n";
+ helpString += "The minsim parameter allows you to set the minimum similarity to template sequences during alignment. Found in column \'SimBtwnQuery&Template\' in align.report file.\n";
+ helpString += "The minscore parameter allows you to set the minimum search score during alignment. Found in column \'SearchScore\' in align.report file.\n";
+ helpString += "The maxinsert parameter allows you to set the maximum number of insertions during alignment. Found in column \'LongestInsert\' in align.report file.\n";
helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
else if (type == "qfile") { pattern = "[filename],good,[extension]"; }
else if (type == "alignreport") { pattern = "[filename],good.align.report"; }
+ else if (type == "contigsreport") { pattern = "[filename],good.contigs.report"; }
+ else if (type == "summary") { pattern = "[filename],good.summary"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
outputTypes["name"] = tempOutNames;
outputTypes["group"] = tempOutNames;
outputTypes["alignreport"] = tempOutNames;
+ outputTypes["contigsreport"] = tempOutNames;
+ outputTypes["summary"] = tempOutNames;
outputTypes["accnos"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
outputTypes["taxonomy"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
outputTypes["taxonomy"] = tempOutNames;
outputTypes["count"] = tempOutNames;
-
+ outputTypes["contigsreport"] = tempOutNames;
+ outputTypes["summary"] = tempOutNames;
+
+
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
if (inputDir == "not found"){ inputDir = ""; }
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["alignreport"] = inputDir + it->second; }
}
+
+ it = parameters.find("contigsreport");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["contigsreport"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("summary");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["summary"] = inputDir + it->second; }
+ }
it = parameters.find("qfile");
//user has given a template file
else if (countfile == "not found") { countfile = ""; }
else { m->setCountTableFile(countfile); }
+ contigsreport = validParameter.validFile(parameters, "contigsreport", true);
+ if (contigsreport == "not open") { contigsreport = ""; abort = true; }
+ else if (contigsreport == "not found") { contigsreport = ""; }
+
+ summaryfile = validParameter.validFile(parameters, "summary", true);
+ if (summaryfile == "not open") { summaryfile = ""; abort = true; }
+ else if (summaryfile == "not found") { summaryfile = ""; }
+
if ((namefile != "") && (countfile != "")) {
m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
}
m->setProcessors(temp);
m->mothurConvert(temp, processors);
+ temp = validParameter.validFile(parameters, "minoverlap", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, minOverlap);
+
+ temp = validParameter.validFile(parameters, "ostart", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, oStart);
+
+ temp = validParameter.validFile(parameters, "oend", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, oEnd);
+
+ temp = validParameter.validFile(parameters, "mismatches", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, mismatches);
+
+ temp = validParameter.validFile(parameters, "maxn", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, maxN);
+
+ temp = validParameter.validFile(parameters, "minscore", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, minScore);
+
+ temp = validParameter.validFile(parameters, "maxinsert", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, maxInsert);
+
+ temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "-1"; }
+ m->mothurConvert(temp, minSim);
+
temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
if (temp == "not found"){ temp = "none"; }
m->splitAtDash(temp, optimize);
+
+ if ((contigsreport != "") && ((summaryfile != "") || ( alignreport != ""))) {
+ m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
+ }
+
+ if ((alignreport != "") && ((summaryfile != "") || ( contigsreport != ""))) {
+ m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
+ }
+
+ if ((summaryfile != "") && ((alignreport != "") || ( contigsreport != ""))) {
+ m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
+ }
+ //check to make sure you have the files you need for certain screening
+ if ((contigsreport == "") && ((minOverlap != -1) || (oStart != -1) || (oEnd != -1) || (mismatches != -1))) {
+ m->mothurOut("[ERROR]: minoverlap, ostart, oend and mismatches can only be used with a contigs.report file, aborting.\n"); abort=true;
+ }
+
+ if ((alignreport == "") && ((minScore != -1) || (maxInsert != -1) || (minSim != -1))) {
+ m->mothurOut("[ERROR]: minscore, maxinsert and minsim can only be used with a align.report file, aborting.\n"); abort=true;
+ }
+
//check for invalid optimize options
set<string> validOptimizers;
- validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
+ validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength"); validOptimizers.insert("maxn");
+ if (contigsreport != "") { validOptimizers.insert("minoverlap"); validOptimizers.insert("ostart"); validOptimizers.insert("oend"); validOptimizers.insert("mismatches"); }
+ if (alignreport != "") { validOptimizers.insert("minscore"); validOptimizers.insert("maxinsert"); validOptimizers.insert("minsim"); }
+
for (int i = 0; i < optimize.size(); i++) {
if (validOptimizers.count(optimize[i]) == 0) {
- m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
+ m->mothurOut(optimize[i] + " is not a valid optimizer with your input files. Valid options are ");
+ string valid = "";
+ for (set<string>::iterator it = validOptimizers.begin(); it != validOptimizers.end(); it++) {
+ valid += (*it) + ", ";
+ }
+ if (valid.length() != 0) { valid = valid.substr(0, valid.length()-2); }
+ m->mothurOut(valid + ".");
+ m->mothurOutEndLine();
optimize.erase(optimize.begin()+i);
i--;
}
if (abort == true) { if (calledHelp) { return 0; } return 2; }
- //if the user want to optimize we need to know the 90% mark
- vector<unsigned long long> positions;
- if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
- //use the namefile to optimize correctly
- if (namefile != "") { nameMap = m->readNames(namefile); }
- else if (countfile != "") {
- CountTable ct;
- ct.readTable(countfile);
- nameMap = ct.getNameMap();
- }
- getSummary(positions);
- }
- else {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- positions = m->divideFile(fastafile, processors);
- for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
- #else
- if(processors == 1){ lines.push_back(linePair(0, 1000)); }
- else {
- int numFastaSeqs = 0;
- positions = m->setFilePosFasta(fastafile, numFastaSeqs);
- if (positions.size() < processors) { processors = positions.size(); }
-
- //figure out how many sequences you have to process
- int numSeqsPerProcessor = numFastaSeqs / processors;
- for (int i = 0; i < processors; i++) {
- int startIndex = i * numSeqsPerProcessor;
- if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
- lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
- }
- }
- #endif
- }
+ map<string, string> badSeqNames;
+ int start = time(NULL);
+ int numFastaSeqs = 0;
- map<string, string> variables;
- variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
- string badAccnosFile = getOutputFileName("accnos",variables);
- variables["[extension]"] = m->getExtension(fastafile);
- string goodSeqFile = getOutputFileName("fasta", variables);
-
+ if ((contigsreport == "") && (summaryfile == "") && (alignreport == "")) { numFastaSeqs = screenFasta(badSeqNames); }
+ else { numFastaSeqs = screenReports(badSeqNames); }
- int numFastaSeqs = 0;
- set<string> badSeqNames;
- int start = time(NULL);
-
-#ifdef USE_MPI
- int pid, numSeqsPerProcessor;
- int tag = 2001;
- vector<unsigned long long> MPIPos;
-
- MPI_Status status;
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
- MPI_Comm_size(MPI_COMM_WORLD, &processors);
-
- MPI_File inMPI;
- MPI_File outMPIGood;
- MPI_File outMPIBadAccnos;
-
- int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
- int inMode=MPI_MODE_RDONLY;
-
- char outGoodFilename[1024];
- strcpy(outGoodFilename, goodSeqFile.c_str());
-
- char outBadAccnosFilename[1024];
- strcpy(outBadAccnosFilename, badAccnosFile.c_str());
-
- char inFileName[1024];
- strcpy(inFileName, fastafile.c_str());
-
- MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
- MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
- MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
-
- if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
-
- if (pid == 0) { //you are the root process
-
- MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
-
- //send file positions to all processes
- for(int i = 1; i < processors; i++) {
- MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
- MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
- }
-
- //figure out how many sequences you have to align
- numSeqsPerProcessor = numFastaSeqs / processors;
- int startIndex = pid * numSeqsPerProcessor;
- if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
-
- //align your part
- driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
-
- if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
-
- for (int i = 1; i < processors; i++) {
- //get bad lists
- int badSize;
- MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
- }
- }else{ //you are a child process
- MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
- MPIPos.resize(numFastaSeqs+1);
- MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
-
- //figure out how many sequences you have to align
- numSeqsPerProcessor = numFastaSeqs / processors;
- int startIndex = pid * numSeqsPerProcessor;
- if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
-
- //align your part
- driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
-
- if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
-
- //send bad list
- int badSize = badSeqNames.size();
- MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
- }
-
- //close files
- MPI_File_close(&inMPI);
- MPI_File_close(&outMPIGood);
- MPI_File_close(&outMPIBadAccnos);
- MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
-
-#else
- if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
- else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
-#endif
-
- #ifdef USE_MPI
- MPI_Comm_rank(MPI_COMM_WORLD, &pid);
-
- if (pid == 0) { //only one process should fix files
-
- //read accnos file with all names in it, process 0 just has its names
- MPI_File inMPIAccnos;
- MPI_Offset size;
-
- char inFileName[1024];
- strcpy(inFileName, badAccnosFile.c_str());
-
- MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer
- MPI_File_get_size(inMPIAccnos, &size);
-
- char* buffer = new char[size];
- MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
-
- string tempBuf = buffer;
- if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
- istringstream iss (tempBuf,istringstream::in);
-
- delete buffer;
- MPI_File_close(&inMPIAccnos);
-
- badSeqNames.clear();
- string tempName;
- while (!iss.eof()) {
- iss >> tempName; m->gobble(iss);
- badSeqNames.insert(tempName);
- }
- #endif
-
+ #ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+ if (pid == 0) { //only one process should fix files
+ #endif
+
if(namefile != "" && groupfile != "") {
screenNameGroupFile(badSeqNames);
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
}else if(namefile != "") {
screenNameGroupFile(badSeqNames);
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
}else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group
else if (countfile != "") { screenCountFile(badSeqNames); }
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
- if(alignreport != "") { screenAlignReport(badSeqNames); }
if(qualfile != "") { screenQual(badSeqNames); }
if(taxonomy != "") { screenTaxonomy(badSeqNames); }
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
#ifdef USE_MPI
}
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
- m->mothurOut(goodSeqFile); m->mothurOutEndLine(); outputTypes["fasta"].push_back(goodSeqFile);
- m->mothurOut(badAccnosFile); m->mothurOutEndLine(); outputTypes["accnos"].push_back(badAccnosFile);
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
m->mothurOutEndLine();
m->mothurOutEndLine();
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
}
- itTypes = outputTypes.find("count");
- if (itTypes != outputTypes.end()) {
- if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+ itTypes = outputTypes.find("count");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+ }
+
+ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
+ m->mothurOutEndLine();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "execute");
+ exit(1);
+ }
+}
+//***************************************************************************************************************/
+// Screens the sequences in fastafile, writing the "good" fasta file and the bad accnos file.
+//
+// The two output file names are derived from fastafile and registered in outputNames/outputTypes.
+// badSeqNames is filled with the rejected sequences (name -> trash code).
+//
+// With USE_MPI: process 0 indexes the fasta file and sends the positions to every other process,
+// each process screens its own slice with driverMPI, and after the barrier process 0 re-reads the
+// complete accnos file because it only knows about its own bad names.
+// Without USE_MPI: driver is used for one processor, createProcesses otherwise.
+//
+// Returns the number of fasta sequences; the MPI paths return 0 early when control_pressed is set.
+int ScreenSeqsCommand::runFastaScreening(map<string, string>& badSeqNames){
+    try{
+        int numFastaSeqs = 0;
+        map<string, string> variables;
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
+        string badAccnosFile = getOutputFileName("accnos",variables);
+        variables["[extension]"] = m->getExtension(fastafile);
+        string goodSeqFile = getOutputFileName("fasta", variables);
+        outputNames.push_back(goodSeqFile); outputTypes["fasta"].push_back(goodSeqFile);
+        outputNames.push_back(badAccnosFile); outputTypes["accnos"].push_back(badAccnosFile);
+
+#ifdef USE_MPI
+        int pid, numSeqsPerProcessor;
+        int tag = 2001;
+        vector<unsigned long long> MPIPos;
+
+        MPI_Status status;
+        MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+        MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+        MPI_File inMPI;
+        MPI_File outMPIGood;
+        MPI_File outMPIBadAccnos;
+
+        int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
+        int inMode=MPI_MODE_RDONLY;
+
+        char outGoodFilename[1024];
+        strcpy(outGoodFilename, goodSeqFile.c_str());
+
+        char outBadAccnosFilename[1024];
+        strcpy(outBadAccnosFilename, badAccnosFile.c_str());
+
+        char inFileName[1024];
+        strcpy(inFileName, fastafile.c_str());
+
+        MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+        MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
+        MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
+
+        if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
+
+        if (pid == 0) { //you are the root process
+
+            MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
+
+            //send file positions to all processes
+            for(int i = 1; i < processors; i++) {
+                MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+                MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+            }
+
+            //figure out how many sequences you have to screen
+            numSeqsPerProcessor = numFastaSeqs / processors;
+            int startIndex = pid * numSeqsPerProcessor;
+            if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
+
+            //screen your part
+            driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
+
+            if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
+
+            for (int i = 1; i < processors; i++) {
+                //wait for each child to report the size of its bad list
+                int badSize;
+                MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
+            }
+        }else{ //you are a child process
+            MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+            MPIPos.resize(numFastaSeqs+1);
+            MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
+
+            //figure out how many sequences you have to screen
+            numSeqsPerProcessor = numFastaSeqs / processors;
+            int startIndex = pid * numSeqsPerProcessor;
+            if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
+
+            //screen your part
+            driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
+
+            if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
+
+            //send the size of the bad list
+            int badSize = badSeqNames.size();
+            MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
+        }
+
+        //close files
+        MPI_File_close(&inMPI);
+        MPI_File_close(&outMPIGood);
+        MPI_File_close(&outMPIBadAccnos);
+        MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+
+#else
+        if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
+        else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
+
+        if (m->control_pressed) { m->mothurRemove(goodSeqFile); return numFastaSeqs; }
+#endif
+
+#ifdef USE_MPI
+        MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+        if (pid == 0) { //only one process should fix files
+
+            //read accnos file with all names in it, process 0 just has its names
+            MPI_File inMPIAccnos;
+            MPI_Offset size;
+
+            char accnosFileName[1024];
+            strcpy(accnosFileName, badAccnosFile.c_str());
+
+            MPI_File_open(MPI_COMM_SELF, accnosFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
+            MPI_File_get_size(inMPIAccnos, &size);
+
+            char* buffer = new char[size];
+            MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
+
+            //the buffer is not NUL terminated, so build the string from an explicit length
+            string tempBuf(buffer, size);
+            istringstream iss (tempBuf,istringstream::in);
+
+            delete[] buffer; //allocated with new[], must be released with delete[]
+            MPI_File_close(&inMPIAccnos);
+
+            badSeqNames.clear();
+            string tempName, trashCode;
+            //only store pairs that were actually read, an empty accnos file must not add an empty name
+            while (iss >> tempName >> trashCode) {
+                badSeqNames[tempName] = trashCode;
+            }
+        }
+#endif
+
+
+        return numFastaSeqs;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ScreenSeqsCommand", "runFastaScreening");
+        exit(1);
+    }
+}
+//***************************************************************************************************************/
+// Screens sequences when a summary, contigs.report or align.report file was provided.
+//
+// First prepares the fasta file: it is either summarized with getSummary (when an optimizer needs
+// per-sequence statistics that no summary file supplies, or maxn is optimized alongside a summary
+// file) or simply divided into pieces stored in lines. Then the optimizers that can be answered
+// from the report file are resolved, and finally the matching screen function fills badSeqNames.
+//
+// Returns the value of the screen function that ran, or 0 when no report file is set.
+int ScreenSeqsCommand::screenReports(map<string, string>& badSeqNames){
+    try{
+        const bool haveSummaryFile = (summaryfile != "");
+
+        //was any optimizer requested that is derived from per-sequence statistics?
+        const bool optimizeSeqStats = (m->inUsersGroups("maxambig", optimize)) || (m->inUsersGroups("maxhomop", optimize)) || (m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize));
+
+        //did not provide a summary file, but set a parameter that requires summarizing the fasta file
+        //or did provide a summary file, but set maxn parameter so we must summarize the fasta file
+        bool summarizeFasta = false;
+        if (haveSummaryFile) { summarizeFasta = m->inUsersGroups("maxn", optimize); }
+        else { summarizeFasta = optimizeSeqStats; }
+
+        vector<unsigned long long> positions;
+        if (summarizeFasta) {
+            //use the namefile to optimize correctly
+            if (namefile != "") { nameMap = m->readNames(namefile); }
+            else if (countfile != "") {
+                CountTable ct;
+                ct.readTable(countfile);
+                nameMap = ct.getNameMap();
+            }
+            getSummary(positions);
+        } else {
+            #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+            positions = m->divideFile(fastafile, processors);
+            for (int i = 0; i < (positions.size()-1); i++) {
+                lines.push_back(linePair(positions[i], positions[(i+1)]));
+            }
+            #else
+            if(processors == 1){ lines.push_back(linePair(0, 1000)); }
+            else {
+                //local count used only to split the file, it is not the value this function returns
+                int fastaSeqCount = 0;
+                positions = m->setFilePosFasta(fastafile, fastaSeqCount);
+                if (positions.size() < processors) { processors = positions.size(); }
+
+                //figure out how many sequences you have to process
+                int numSeqsPerProcessor = fastaSeqCount / processors;
+                for (int i = 0; i < processors; i++) {
+                    int startIndex = i * numSeqsPerProcessor;
+                    if(i == (processors - 1)){ numSeqsPerProcessor = fastaSeqCount - i * numSeqsPerProcessor; }
+                    lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+                }
+            }
+            #endif
+        }
+
+        if (haveSummaryFile && optimizeSeqStats && !summarizeFasta) {
+            //optimize settings based on the summary file
+            if (namefile != "") { nameMap = m->readNames(namefile); }
+            else if (countfile != "") {
+                CountTable ct;
+                ct.readTable(countfile);
+                nameMap = ct.getNameMap();
+            }
+            getSummaryReport();
+        }else if ((contigsreport != "") && ((m->inUsersGroups("minoverlap", optimize)) || (m->inUsersGroups("ostart", optimize)) || (m->inUsersGroups("oend", optimize)) || (m->inUsersGroups("mismatches", optimize)))) {
+            //optimize settings based on the contigs report file
+            optimizeContigs();
+        }else if ((alignreport != "") && ((m->inUsersGroups("minsim", optimize)) || (m->inUsersGroups("minscore", optimize)) || (m->inUsersGroups("maxinsert", optimize)))) {
+            //optimize settings based on the align report file
+            optimizeAlign();
+        }
+
+        //screen with whichever report file was provided, the summary file takes precedence
+        if (haveSummaryFile)     { return screenSummary(badSeqNames); }
+        //add in any seqs that fail due to contigs report results
+        if (contigsreport != "") { return screenContigs(badSeqNames); }
+        //add in any seqs that fail due to align report
+        if (alignreport != "")   { return screenAlignReport(badSeqNames); }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ScreenSeqsCommand", "screenReports");
+        exit(1);
+    }
+}
+//***************************************************************************************************************
+int ScreenSeqsCommand::screenAlignReport(map<string, string>& badSeqNames){
+ try {
+ //Screens the align report against maxInsert, minScore and minSim: passing rows are copied to a new report, failing names go into badSeqNames with their trash code. Returns the number of rows read, or 0 if interrupted.
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport));
+ string outSummary = getOutputFileName("alignreport",variables);
+ outputNames.push_back(outSummary); outputTypes["alignreport"].push_back(outSummary);
+
+ string name, TemplateName, SearchMethod, AlignmentMethod;
+ //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template
+ //checking for minScore, maxInsert, minSim
+ int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert;
+ float SearchScore, SimBtwnQueryTemplate;
+
+ ofstream out;
+ m->openOutputFile(outSummary, out);
+
+ //read align report file
+ ifstream in;
+ m->openInputFile(alignreport, in);
+ out << (m->getline(in)) << endl; m->gobble(in); //copy headers; gobble (as after every record below) so a header-only report is not parsed as a record
+
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ //one record per row, columns in the order listed above
+ in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in);
+
+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(maxInsert != -1 && maxInsert < LongestInsert) { goodSeq = 0; trashCode += "insert|"; }
+ if(minScore != -1 && minScore > SearchScore) { goodSeq = 0; trashCode += "score|"; }
+ if(minSim != -1 && minSim > SimBtwnQueryTemplate) { goodSeq = 0; trashCode += "sim|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << length << '\t' << TemplateName << '\t' << TemplateLength << '\t' << SearchMethod << '\t' << SearchScore << '\t' << AlignmentMethod << '\t' << QueryStart << '\t' << QueryEnd << '\t' << TemplateStart << '\t' << TemplateEnd << '\t' << PairwiseAlignmentLength << '\t' << GapsInQuery << '\t' << GapsInTemplate << '\t' << LongestInsert << '\t' << SimBtwnQueryTemplate << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //the fasta screening added names, so filter the new report again
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read the report written above
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; m->gobble(in2); //copy headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outSummary+".temp"); return 0; } //the temp copy is not in outputNames, so remove it here
+
+ //same columns as the first pass
+ in2 >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << length << '\t' << TemplateName << '\t' << TemplateLength << '\t' << SearchMethod << '\t' << SearchScore << '\t' << AlignmentMethod << '\t' << QueryStart << '\t' << QueryEnd << '\t' << TemplateStart << '\t' << TemplateEnd << '\t' << PairwiseAlignmentLength << '\t' << GapsInQuery << '\t' << GapsInTemplate << '\t' << LongestInsert << '\t' << SimBtwnQueryTemplate << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your align report file, quitting.\n"); m->control_pressed = true; }
+
+
+ return count;
+
+
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
+ exit(1);
+ }
+
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenContigs(map<string, string>& badSeqNames){
+ try{ //Screens the contigs report against oStart, oEnd, maxN, minOverlap and mismatches; failing names go into badSeqNames. Returns the number of rows read, or 0 if interrupted.
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(contigsreport));
+ string outSummary = getOutputFileName("contigsreport",variables);
+ outputNames.push_back(outSummary); outputTypes["contigsreport"].push_back(outSummary);
+
+ string name;
+ //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns
+ int length, OLength, thisOStart, thisOEnd, numMisMatches, numNs;
+
+ ofstream out;
+ m->openOutputFile(outSummary, out);
+
+ //read contigs report file
+ ifstream in;
+ m->openInputFile(contigsreport, in);
+ out << (m->getline(in)) << endl; m->gobble(in); //copy headers; gobble (as after every record below) so a header-only report is not parsed as a record
+
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ //one record per row, columns in the order listed above
+ in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in);
+
+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(oStart != -1 && oStart < thisOStart) { goodSeq = 0; trashCode += "ostart|"; }
+ if(oEnd != -1 && oEnd > thisOEnd) { goodSeq = 0; trashCode += "oend|"; }
+ if(maxN != -1 && maxN < numNs) { goodSeq = 0; trashCode += "n|"; }
+ if(minOverlap != -1 && minOverlap > OLength) { goodSeq = 0; trashCode += "olength|"; }
+ if(mismatches != -1 && mismatches < numMisMatches) { goodSeq = 0; trashCode += "mismatches|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << length << '\t' << OLength << '\t' << thisOStart << '\t' << thisOEnd << '\t' << numMisMatches << '\t' << numNs << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //the fasta screening added names, so filter the new report again
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read the report written above
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; m->gobble(in2); //copy headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outSummary+".temp"); return 0; } //the temp copy is not in outputNames, so remove it here
+
+ //same columns as the first pass
+ in2 >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << length << '\t' << OLength << '\t' << thisOStart << '\t' << thisOEnd << '\t' << numMisMatches << '\t' << numNs << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
+
+
+ return count;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenContigs");
+ exit(1);
+ }
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenSummary(map<string, string>& badSeqNames){
+ try{ //Screens the summary file against startPos, endPos, maxAmbig, maxHomoP, minLength and maxLength; failing names go into badSeqNames. Returns the number of rows read, or 0 if interrupted.
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(summaryfile));
+ string outSummary = getOutputFileName("summary",variables);
+ outputNames.push_back(outSummary); outputTypes["summary"].push_back(outSummary);
+
+ string name;
+ int start, end, length, ambigs, polymer, numReps;
+
+ ofstream out;
+ m->openOutputFile(outSummary, out);
+
+ //read summary file
+ ifstream in;
+ m->openInputFile(summaryfile, in);
+ out << (m->getline(in)) << endl; m->gobble(in); //copy headers; gobble (as after every record below) so a header-only file is not parsed as a record
+
+ int count = 0;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in);
+
+ bool goodSeq = 1; // innocent until proven guilty
+ string trashCode = "";
+ if(startPos != -1 && startPos < start) { goodSeq = 0; trashCode += "start|"; }
+ if(endPos != -1 && endPos > end) { goodSeq = 0; trashCode += "end|"; }
+ if(maxAmbig != -1 && maxAmbig < ambigs) { goodSeq = 0; trashCode += "ambig|"; }
+ if(maxHomoP != -1 && maxHomoP < polymer) { goodSeq = 0; trashCode += "homop|"; }
+ if(minLength != -1 && minLength > length) { goodSeq = 0; trashCode += "<length|"; }
+ if(maxLength != -1 && maxLength < length) { goodSeq = 0; trashCode += ">length|"; }
+
+ if(goodSeq == 1){
+ out << name << '\t' << start << '\t' << end << '\t' << length << '\t' << ambigs << '\t' << polymer << '\t' << numReps << endl;
+ }
+ else{ badSeqNames[name] = trashCode; }
+ count++;
+ }
+ in.close();
+ out.close();
+
+ int oldBadSeqsCount = badSeqNames.size();
+
+ int numFastaSeqs = runFastaScreening(badSeqNames);
+
+ if (oldBadSeqsCount != badSeqNames.size()) { //the fasta screening added names, so filter the new summary again
+ m->renameFile(outSummary, outSummary+".temp");
+
+ ofstream out2;
+ m->openOutputFile(outSummary, out2);
+
+ //read the summary written above
+ ifstream in2;
+ m->openInputFile(outSummary+".temp", in2);
+ out2 << (m->getline(in2)) << endl; m->gobble(in2); //copy headers
+
+ while (!in2.eof()) {
+
+ if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outSummary+".temp"); return 0; } //the temp copy is not in outputNames, so remove it here
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in2 >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in2);
+
+ if (badSeqNames.count(name) == 0) { //are you good?
+ out2 << name << '\t' << start << '\t' << end << '\t' << length << '\t' << ambigs << '\t' << polymer << '\t' << numReps << endl;
+ }
+ }
+ in2.close();
+ out2.close();
+ m->mothurRemove(outSummary+".temp");
+ }
+
+ if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your summary file, quitting.\n"); m->control_pressed = true; }
+
+
+
+ return count;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenSummary");
+ exit(1);
+ }
+}
+//***************************************************************************************************************/
+int ScreenSeqsCommand::screenFasta(map<string, string>& badSeqNames){
+ try{
+
+ //Gets the fasta file ready for screening and then runs runFastaScreening on it.
+ //Without optimize options the file is only divided into the chunks stored in lines; with them getSummary is run first.
+ //Returns what runFastaScreening returns, or 0 when the run was interrupted before it started.
+ vector<unsigned long long> filePositions;
+
+ if (optimize.size() == 0) { //nothing to optimize, so just divide the file
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ filePositions = m->divideFile(fastafile, processors);
+ for (int k = 0; k < (filePositions.size()-1); k++) { lines.push_back(linePair(filePositions[k], filePositions[(k+1)])); }
+#else
+ if(processors != 1){
+ int totalSeqs = 0;
+ filePositions = m->setFilePosFasta(fastafile, totalSeqs);
+ if (filePositions.size() < processors) { processors = filePositions.size(); }
+
+ //every chunk gets the same share, the last one also takes the remainder
+ int seqsPerChunk = totalSeqs / processors;
+ for (int p = 0; p < processors; p++) {
+ int firstSeq = p * seqsPerChunk;
+ int chunkSize = (p == (processors - 1)) ? (totalSeqs - firstSeq) : seqsPerChunk;
+ lines.push_back(linePair(filePositions[firstSeq], chunkSize));
+ }
+ }
+ else { lines.push_back(linePair(0, 1000)); }
+#endif
+ }else { //per the original note, getSummary is parallelized and does the dividing itself, so it is not repeated here
+ //load nameMap from the name or count file so the optimization can use it
+ if (namefile != "") { nameMap = m->readNames(namefile); }
+ else if (countfile != "") {
+ CountTable countTable;
+ countTable.readTable(countfile);
+ nameMap = countTable.getNameMap();
+ }
+ getSummary(filePositions);
+ }
+
+ if (m->control_pressed) { return 0; }
+
+ return runFastaScreening(badSeqNames);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenFasta");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+int ScreenSeqsCommand::screenNameGroupFile(map<string, string> badSeqNames){
+ try { //Writes copies of the name file (and the group file, if given) without the bad sequences. badSeqNames is taken by value because matched names are erased from it. Returns 0.
+ ifstream inputNames;
+ m->openInputFile(namefile, inputNames);
+ map<string, string> badSeqGroups;
+ string seqName, seqList, group;
+ map<string, string>::iterator it;
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
+ variables["[extension]"] = m->getExtension(namefile);
+ string goodNameFile = getOutputFileName("name", variables);
+ outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
+
+ ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
+ //keep every row whose first name is not bad; for a bad row remember all the names it lists
+ while(!inputNames.eof()){
+ if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; }
+
+ inputNames >> seqName; m->gobble(inputNames); inputNames >> seqList;
+ it = badSeqNames.find(seqName);
+
+ if(it != badSeqNames.end()){
+ string trashCode = it->second; badSeqNames.erase(it); //copy the code before erasing: erase invalidates the iterator
+
+ if(namefile != ""){
+ int start = 0;
+ for(int i=0;i<seqList.length();i++){
+ if(seqList[i] == ','){
+ badSeqGroups[seqList.substr(start,i-start)] = trashCode;
+ start = i+1;
+ }
+ }
+ badSeqGroups[seqList.substr(start,seqList.length()-start)] = trashCode;
+ }
+ }
+ else{
+ goodNameOut << seqName << '\t' << seqList << endl;
+ }
+ m->gobble(inputNames);
+ }
+ inputNames.close();
+ goodNameOut.close();
+
+ //we were unable to remove some of the bad sequences
+ if (badSeqNames.size() != 0) {
+ for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
+ m->mothurOut("Your namefile does not include the sequence " + it->first + " please correct.");
+ m->mothurOutEndLine();
+ }
+ }
+
+ if(groupfile != ""){
+
+ ifstream inputGroups;
+ m->openInputFile(groupfile, inputGroups);
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
+ variables["[extension]"] = m->getExtension(groupfile);
+ string goodGroupFile = getOutputFileName("group", variables);
+
+ outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
+
+ ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
+ //drop every group-file row whose sequence was listed on a bad name-file row
+ while(!inputGroups.eof()){
+ if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile); m->mothurRemove(goodGroupFile); return 0; }
+
+ inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group;
+
+ it = badSeqGroups.find(seqName);
+
+ if(it != badSeqGroups.end()){
+ badSeqGroups.erase(it);
+ }
+ else{
+ goodGroupOut << seqName << '\t' << group << endl;
+ }
+ m->gobble(inputGroups);
+ }
+ inputGroups.close();
+ goodGroupOut.close();
+
+ //we were unable to remove some of the bad sequences
+ if (badSeqGroups.size() != 0) {
+ for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
+ m->mothurOut("Your groupfile does not include the sequence " + it->first + " please correct.");
+ m->mothurOutEndLine();
+ }
+ }
+ }
+
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+int ScreenSeqsCommand::getSummaryReport(){
+ try {
+ //Reads the summary file and sets each member named in optimize (startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength) from a percentile of the matching column. Returns 0.
+ vector<int> startPosition;
+ vector<int> endPosition;
+ vector<int> seqLength;
+ vector<int> ambigBases;
+ vector<int> longHomoPolymer;
+
+#ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+ if (pid == 0) {
+#endif
+ //with USE_MPI only process 0 runs the reading and optimizing below; the braces it opens are closed in the USE_MPI section further down
+
+ //read summary file
+ ifstream in;
+ m->openInputFile(summaryfile, in);
+ m->getline(in); //skip headers
+
+ string name;
+ int start, end, length, ambigs, polymer, numReps;
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); return 0; }
+
+ //seqname start end nbases ambigs polymer numSeqs
+ in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in);
+
+ int num = 1; //how many times this row is counted; replaced by the nameMap value when a name or count file is in use
+ if ((namefile != "") || (countfile !="")) {
+ //make sure this sequence is in the namefile, else error
+ map<string, int>::iterator it = nameMap.find(name);
+
+ if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else { num = it->second; }
+ }
+
+ //for each sequence this sequence represents
+ for (int i = 0; i < num; i++) {
+ startPosition.push_back(start);
+ endPosition.push_back(end);
+ seqLength.push_back(length);
+ ambigBases.push_back(ambigs);
+ longHomoPolymer.push_back(polymer);
+ }
+
+ }
+ in.close();
+
+ sort(startPosition.begin(), startPosition.end());
+ sort(endPosition.begin(), endPosition.end());
+ sort(seqLength.begin(), seqLength.end());
+ sort(ambigBases.begin(), ambigBases.end());
+ sort(longHomoPolymer.begin(), longHomoPolymer.end());
+
+ //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
+ int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100));
+ //NOTE(review): this index equals the vector size when criteria is 100 (the end/minlength index does when criteria is 0) and the vectors are empty when the file has no rows - confirm criteria is range-checked before this is called
+ for (int i = 0; i < optimize.size(); i++) {
+ if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
+ else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
+ }
+
+#ifdef USE_MPI
+ }
+
+ MPI_Status status;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+ MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+ if (pid == 0) {
+ //send the six screening values to all other processes
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ }
+ }else {
+ MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ }
+ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+#endif
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "getSummaryReport");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+int ScreenSeqsCommand::optimizeContigs(){
+ try { //Sets each member named in optimize (oStart, oEnd, mismatches, maxN, minOverlap) from a percentile of the matching contigs report column. Returns 0.
+ vector<int> olengths;
+ vector<int> oStarts;
+ vector<int> oEnds;
+ vector<int> numMismatches;
+ vector<int> numNs;
+ //contigsLines holds one chunk of the report per processor
+ vector<unsigned long long> positions;
+ vector<linePair> contigsLines;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFilePerLine(contigsreport, processors);
+ for (int i = 0; i < (positions.size()-1); i++) { contigsLines.push_back(linePair(positions[i], positions[(i+1)])); }
+#else
+ if(processors == 1){ contigsLines.push_back(linePair(0, 1000)); }
+ else {
+ int numContigsSeqs = 0;
+ positions = m->setFilePosEachLine(contigsreport, numContigsSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numContigsSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numContigsSeqs - i * numSeqsPerProcessor; }
+ contigsLines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+ }
+#endif
+ //NOTE(review): with USE_MPI only contigsLines[0] is summarized, by process 0 - confirm that chunk covers the whole report in MPI builds
+#ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+ if (pid == 0) {
+ driverContigsSummary(olengths, oStarts, oEnds, numMismatches, numNs, contigsLines[0]);
+#else
+ createProcessesContigsSummary(olengths, oStarts, oEnds, numMismatches, numNs, contigsLines);
+
+ if (m->control_pressed) { return 0; }
+#endif
+ sort(olengths.begin(), olengths.end());
+ sort(oStarts.begin(), oStarts.end());
+ sort(oEnds.begin(), oEnds.end());
+ sort(numMismatches.begin(), numMismatches.end());
+ sort(numNs.begin(), numNs.end());
+ //NOTE(review): the index below equals the vector size when criteria is 100 (the oend/minoverlap index does when criteria is 0) - confirm criteria is range-checked before this is called
+ //driverContigsSummary repeats each row by its nameMap count, so oStarts.size() is the total number of seqs and the percentile is taken over all seqs
+ int criteriaPercentile = int(oStarts.size() * (criteria / (float) 100));
+
+ for (int i = 0; i < optimize.size(); i++) {
+ if (optimize[i] == "ostart") { oStart = oStarts[criteriaPercentile]; m->mothurOut("Optimizing ostart to " + toString(oStart) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "oend") { int endcriteriaPercentile = int(oEnds.size() * ((100 - criteria) / (float) 100)); oEnd = oEnds[endcriteriaPercentile]; m->mothurOut("Optimizing oend to " + toString(oEnd) + "."); m->mothurOutEndLine();}
+ else if (optimize[i] == "mismatches") { mismatches = numMismatches[criteriaPercentile]; m->mothurOut("Optimizing mismatches to " + toString(mismatches) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "maxn") { maxN = numNs[criteriaPercentile]; m->mothurOut("Optimizing maxn to " + toString(maxN) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "minoverlap") { int mincriteriaPercentile = int(olengths.size() * ((100 - criteria) / (float) 100)); minOverlap = olengths[mincriteriaPercentile]; m->mothurOut("Optimizing minoverlap to " + toString(minOverlap) + "."); m->mothurOutEndLine(); }
+
+ }
+
+#ifdef USE_MPI
+ }
+
+ MPI_Status status;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+ MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+ if (pid == 0) {
+ //send the five screening values to all other processes
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&minOverlap, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&oStart, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&oEnd, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&mismatches, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxN, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ }
+ }else {
+ MPI_Recv(&minOverlap, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&oStart, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&oEnd, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&mismatches, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxN, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ }
+ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+#endif
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "optimizeContigs");
+ exit(1);
+ }
+}
+/**************************************************************************************/
+int ScreenSeqsCommand::driverContigsSummary(vector<int>& oLength, vector<int>& ostartPosition, vector<int>& oendPosition, vector<int>& omismatches, vector<int>& numNs, linePair filePos) {
+ try {
+ //Reads the rows of the contigs report that belong to filePos and appends their overlap length, overlap start, overlap end, mismatches and Ns to the five vectors.
+ //Returns the number of rows read, or 1 when the run is interrupted.
+ string seqName;
+ //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns
+ int seqLength, overlapLength, overlapStart, overlapEnd, mismatchCount, nCount;
+
+ ifstream report;
+ m->openInputFile(contigsreport, report);
+
+ report.seekg(filePos.start);
+ if (filePos.start == 0) { //a chunk that starts at the top of the file starts with the headers
+ m->getline(report); m->gobble(report);
+ }
+
+ int rowsRead = 0;
+
+ //the loop is left from the bottom, once the end of the chunk or of the file is reached
+ while (true) {
+
+ if (m->control_pressed) { report.close(); return 1; }
+
+ report >> seqName >> seqLength >> overlapLength >> overlapStart >> overlapEnd >> mismatchCount >> nCount; m->gobble(report);
+
+ //without a name or count file every row stands for a single sequence
+ int copies = 1;
+ if (!((namefile == "") && (countfile ==""))){
+ map<string, int>::iterator found = nameMap.find(seqName);
+
+ //a name missing from the map is an error
+ if (found != nameMap.end()) { copies = found->second; }
+ else { m->mothurOut("[ERROR]: " + seqName + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+ }
+
+ //add the row once for every sequence it represents
+ for (int rep = copies; rep > 0; rep--) {
+ oLength.push_back(overlapLength);
+ ostartPosition.push_back(overlapStart);
+ oendPosition.push_back(overlapEnd);
+ omismatches.push_back(mismatchCount);
+ numNs.push_back(nCount);
+ }
+
+ rowsRead++;
+
+ //stop at the end of this chunk
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ unsigned long long here = report.tellg();
+ if ((here == -1) || (here >= filePos.end)) { break; }
+#else
+ if (report.eof()) { break; }
+#endif
+ }
+
+ report.close();
+
+ return rowsRead;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "driverContigsSummary");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+int ScreenSeqsCommand::createProcessesContigsSummary(vector<int>& oLength, vector<int>& ostartPosition, vector<int>& oendPosition, vector<int>& omismatches, vector<int>& numNs, vector<linePair> contigsLines) {
+ try {
+ //Runs driverContigsSummary over the chunks in contigsLines and collects every chunk's values in the five vectors. Returns the summed row counts reported by the drivers.
+ int process = 1; //index of the chunk the next child gets; chunk 0 is done by the parent
+ int num = 0;
+ vector<int> processIDS;
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ //NOTE(review): contigsLines[process] assumes contigsLines has at least processors entries - confirm against the caller
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //remember the child's pid, it is part of the name of the temp file the child writes
+ process++;
+ }else if (pid == 0){
+ num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[process]);
+
+ //pass the row count and the five vectors to the parent through a temp file
+ ofstream out;
+ string tempFile = contigsreport + toString(getpid()) + ".num.temp";
+ m->openOutputFile(tempFile, out);
+
+ out << num << endl;
+ out << ostartPosition.size() << endl;
+ for (int k = 0; k < ostartPosition.size(); k++) { out << ostartPosition[k] << '\t'; } out << endl;
+ for (int k = 0; k < oendPosition.size(); k++) { out << oendPosition[k] << '\t'; } out << endl;
+ for (int k = 0; k < oLength.size(); k++) { out << oLength[k] << '\t'; } out << endl;
+ for (int k = 0; k < omismatches.size(); k++) { out << omismatches[k] << '\t'; } out << endl;
+ for (int k = 0; k < numNs.size(); k++) { out << numNs[k] << '\t'; } out << endl;
+
+ out.close();
+
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
+ //the parent does the first chunk itself
+ num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[0]);
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp); //temp only receives the exit status: each call waits for whichever child finishes next, one call per child
+ }
+
+ //parent reads each child's temp file, appends its values to the vectors and removes the file
+ for (int i = 0; i < processIDS.size(); i++) {
+ string tempFilename = contigsreport + toString(processIDS[i]) + ".num.temp";
+ ifstream in;
+ m->openInputFile(tempFilename, in);
+
+ int temp, tempNum;
+ in >> tempNum; m->gobble(in); num += tempNum;
+ in >> tempNum; m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp; ostartPosition.push_back(temp); } m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp; oendPosition.push_back(temp); } m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp; oLength.push_back(temp); } m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp; omismatches.push_back(temp); } m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp; numNs.push_back(temp); } m->gobble(in);
+
+ in.close();
+ m->mothurRemove(tempFilename);
+ }
+
+
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the seqSumData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add info to vectors.
+ //The worker-thread code is commented out, so only the single driver call below runs.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ /*
+ vector<contigsSumData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+
+ // Allocate memory for thread data.
+ contigsSumData* tempSum = new contigsSumData(contigsreport, m, contigsLines[i].start, contigsLines[i].end, namefile, countfile, nameMap);
+ pDataArray.push_back(tempSum);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i] = CreateThread(NULL, 0, MyContigsSumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+ */
+ contigsLines[processors-1].start = 0;
+ //do your part
+ num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[processors-1]);
+ /*
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
+ for (int k = 0; k < pDataArray[i]->ostartPosition.size(); k++) { ostartPosition.push_back(pDataArray[i]->ostartPosition[k]); }
+ for (int k = 0; k < pDataArray[i]->oendPosition.size(); k++) { oendPosition.push_back(pDataArray[i]->oendPosition[k]); }
+ for (int k = 0; k < pDataArray[i]->oLength.size(); k++) { oLength.push_back(pDataArray[i]->oLength[k]); }
+ for (int k = 0; k < pDataArray[i]->omismatches.size(); k++) { omismatches.push_back(pDataArray[i]->omismatches[k]); }
+ for (int k = 0; k < pDataArray[i]->numNs.size(); k++) { numNs.push_back(pDataArray[i]->numNs[k]); }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+ */
+#endif
+ return num;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "createProcessesContigsSummary");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+// Chooses the minsim / minscore / maxinsert cutoffs named in `optimize` from the values
+// found in the align report, using the `criteria` percentile of all sequences.
+// The chosen cutoffs are stored in minSim, minScore and maxInsert; the function returns 0.
+// With USE_MPI only process 0 reads the report, then sends the three cutoffs to the others.
+int ScreenSeqsCommand::optimizeAlign(){
+ try {
+
+ vector<float> sims;
+ vector<float> scores;
+ vector<int> inserts;
+
+ vector<unsigned long long> positions;
+ vector<linePair> alignLines;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFilePerLine(alignreport, processors);
+ //(i+1) < size() instead of i < size()-1: size() is unsigned, so size()-1 wraps around when positions is empty
+ for (size_t i = 0; (i+1) < positions.size(); i++) { alignLines.push_back(linePair(positions[i], positions[(i+1)])); }
+#else
+ if(processors == 1){ alignLines.push_back(linePair(0, 1000)); }
+ else {
+ int numAlignSeqs = 0;
+ positions = m->setFilePosEachLine(alignreport, numAlignSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numAlignSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ //the last processor also takes the remainder of the division
+ if(i == (processors - 1)){ numSeqsPerProcessor = numAlignSeqs - i * numSeqsPerProcessor; }
+ alignLines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+ }
+#endif
+
+#ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+ if (pid == 0) {
+ driverAlignSummary(sims, scores, inserts, alignLines[0]);
+#else
+ createProcessesAlignSummary(sims, scores, inserts, alignLines);
+
+ if (m->control_pressed) { return 0; }
+#endif
+ sort(sims.begin(), sims.end());
+ sort(scores.begin(), scores.end());
+ sort(inserts.begin(), inserts.end());
+
+ if (sims.empty()) {
+ //nothing was read from the report, so there is no element to pick a cutoff from
+ m->mothurOut("[WARNING]: no sequences were read from " + alignreport + ", unable to optimize."); m->mothurOutEndLine();
+ }else {
+ //sims, scores and inserts are always filled together, one entry each per sequence, so they share a last index
+ int lastIndex = int(sims.size()) - 1;
+
+ //numSeqs is the number of unique seqs, sims.size() is the total number of seqs, we want to optimize using all seqs
+ int criteriaPercentile = int(sims.size() * (criteria / (float) 100));
+ int mincriteriaPercentile = int(sims.size() * ((100 - criteria) / (float) 100));
+
+ //a criteria of 100 (or of 0 for the min cutoffs) yields size(), one past the last element, so clamp into range
+ if (criteriaPercentile > lastIndex) { criteriaPercentile = lastIndex; }
+ if (criteriaPercentile < 0) { criteriaPercentile = 0; }
+ if (mincriteriaPercentile > lastIndex) { mincriteriaPercentile = lastIndex; }
+ if (mincriteriaPercentile < 0) { mincriteriaPercentile = 0; }
+
+ for (int i = 0; i < optimize.size(); i++) {
+ if (optimize[i] == "minsim") { minSim = sims[mincriteriaPercentile]; m->mothurOut("Optimizing minsim to " + toString(minSim) + "."); m->mothurOutEndLine();}
+ else if (optimize[i] == "minscore") { minScore = scores[mincriteriaPercentile]; m->mothurOut("Optimizing minscore to " + toString(minScore) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "maxinsert") { maxInsert = inserts[criteriaPercentile]; m->mothurOut("Optimizing maxinsert to " + toString(maxInsert) + "."); m->mothurOutEndLine(); }
+ }
+ }
+
+#ifdef USE_MPI
+ }
+
+ MPI_Status status;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+ MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+ //NOTE(review): minSim and minScore are assigned from vector<float> elements above but are sent and received as MPI_INT - confirm their declared types and switch to MPI_FLOAT if they are floats
+ if (pid == 0) {
+ //send the optimized cutoffs to all processes
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&minSim, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&minScore, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxInsert, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ }
+ }else {
+ MPI_Recv(&minSim, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&minScore, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxInsert, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
}
-
- m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
- m->mothurOutEndLine();
-
+ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+#endif
return 0;
}
catch(exception& e) {
- m->errorOut(e, "ScreenSeqsCommand", "execute");
+ m->errorOut(e, "ScreenSeqsCommand", "optimizeAlign");
exit(1);
}
}
-
-//***************************************************************************************************************
-
-int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
+/**************************************************************************************/
+// Reads the align report records in the range described by filePos and appends, for every
+// sequence a record represents, its similarity, search score and longest insert to
+// sims, scores and inserts.
+// Returns the number of records read, or 1 if m->control_pressed is set while reading.
+int ScreenSeqsCommand::driverAlignSummary(vector<float>& sims, vector<float>& scores, vector<int>& inserts, linePair filePos) {
try {
- ifstream inputNames;
- m->openInputFile(namefile, inputNames);
- set<string> badSeqGroups;
- string seqName, seqList, group;
- set<string>::iterator it;
- map<string, string> variables;
- variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
- variables["[extension]"] = m->getExtension(namefile);
- string goodNameFile = getOutputFileName("name", variables);
- outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
- ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
+ string name, TemplateName, SearchMethod, AlignmentMethod;
+ //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template
+ //checking for minScore, maxInsert, minSim
+ //initialized so that a failed read can never hand indeterminate values to the vectors
+ int length = 0, TemplateLength = 0, QueryStart = 0, QueryEnd = 0, TemplateStart = 0, TemplateEnd = 0, PairwiseAlignmentLength = 0, GapsInQuery = 0, GapsInTemplate = 0, LongestInsert = 0;
+ float SearchScore = 0, SimBtwnQueryTemplate = 0;
+
+ ifstream in;
+ m->openInputFile(alignreport, in);
+
+ in.seekg(filePos.start);
+ if (filePos.start == 0) { //read headers
+ m->getline(in); m->gobble(in);
+ }
+
+ bool done = false;
+ int count = 0;
+
+ while (!done) {
+
+ if (m->control_pressed) { in.close(); return 1; }
+
+ in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate;
+
+ //checked before gobble so the stream state reflects the extraction itself:
+ //a failed extraction means no complete record was read, so nothing may be recorded
+ if (in.fail()) {
+ //failing without reaching the end of the file means a field could not be parsed
+ if (!in.eof()) { m->mothurOut("[ERROR]: unable to read a complete record from " + alignreport + ", please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+ break;
+ }
+ m->gobble(in);
+
+ int num = 1;
+ if ((namefile != "") || (countfile !="")){
+ //make sure this sequence is in the namefile, else error
+ map<string, int>::iterator it = nameMap.find(name);
+
+ if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else { num = it->second; }
+ }
+
+ //for each sequence this sequence represents
+ for (int i = 0; i < num; i++) {
+ sims.push_back(SimBtwnQueryTemplate);
+ scores.push_back(SearchScore);
+ inserts.push_back(LongestInsert);
+ }
+
+ count++;
+
+ //if((count) % 100 == 0){ m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); }
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ //stop once the read position is invalid or has reached the end of this range
+ unsigned long long pos = in.tellg();
+ if ((pos == -1) || (pos >= filePos.end)) { break; }
+#else
+ if (in.eof()) { break; }
+#endif
+ }
- while(!inputNames.eof()){
- if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; }
+ in.close();
+
+ return count;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "driverAlignSummary");
+ exit(1);
+ }
+}
- inputNames >> seqName >> seqList;
- it = badSeqNames.find(seqName);
+/**************************************************************************************************/
+// Runs driverAlignSummary over the ranges in alignLines and gathers every result into
+// sims, scores and inserts. Returns the summed counts reported by driverAlignSummary.
+// unix-like builds: forks processors-1 children; each child handles one range and hands its
+// results back through a temp file named alignreport + <child pid> + ".num.temp".
+// other builds: the thread code is commented out, so a single driverAlignSummary call is made.
+// alignLines is taken by value, so the change to .start below only touches this local copy.
+int ScreenSeqsCommand::createProcessesAlignSummary(vector<float>& sims, vector<float>& scores, vector<int>& inserts, vector<linePair> alignLines) {
+ try {
+
+ int process = 1;
+ int num = 0;
+ vector<int> processIDS;
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ //child: summarize the range at index process, write the results to a temp file and exit
+ num = driverAlignSummary(sims, scores, inserts, alignLines[process]);
- if(it != badSeqNames.end()){
- badSeqNames.erase(it);
+ //pass numSeqs to parent
+ ofstream out;
+ string tempFile = alignreport + toString(getpid()) + ".num.temp";
+ m->openOutputFile(tempFile, out);
- if(namefile != ""){
- int start = 0;
- for(int i=0;i<seqList.length();i++){
- if(seqList[i] == ','){
- badSeqGroups.insert(seqList.substr(start,i-start));
- start = i+1;
- }
- }
- badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
- }
- }
- else{
- goodNameOut << seqName << '\t' << seqList << endl;
- }
- m->gobble(inputNames);
- }
- inputNames.close();
- goodNameOut.close();
-
- //we were unable to remove some of the bad sequences
- if (badSeqNames.size() != 0) {
- for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
- m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
- m->mothurOutEndLine();
- }
- }
-
- if(groupfile != ""){
-
- ifstream inputGroups;
- m->openInputFile(groupfile, inputGroups);
- variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
- variables["[extension]"] = m->getExtension(groupfile);
- string goodGroupFile = getOutputFileName("group", variables);
-
- outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
-
- ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
-
- while(!inputGroups.eof()){
- if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile); m->mothurRemove(goodGroupFile); return 0; }
-
- inputGroups >> seqName >> group;
+ //temp file layout: record count, number of values per vector, then one line each of sims, scores and inserts
+ out << num << endl;
+ out << sims.size() << endl;
+ for (int k = 0; k < sims.size(); k++) { out << sims[k] << '\t'; } out << endl;
+ for (int k = 0; k < scores.size(); k++) { out << scores[k] << '\t'; } out << endl;
+ for (int k = 0; k < inserts.size(); k++) { out << inserts[k] << '\t'; } out << endl;
- it = badSeqGroups.find(seqName);
+ out.close();
- if(it != badSeqGroups.end()){
- badSeqGroups.erase(it);
- }
- else{
- goodGroupOut << seqName << '\t' << group << endl;
- }
- m->gobble(inputGroups);
+ exit(0);
+ }else {
+ //fork failed: signal the children started so far and quit
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
 }
- inputGroups.close();
- goodGroupOut.close();
+ }
+
+ //parent does the first range itself
+ num = driverAlignSummary(sims, scores, inserts, alignLines[0]);
+
+ //force parent to wait until all the processes are done
+ //wait() returns when any one child exits and stores that child's status in temp, so it is called once per child
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ //parent reads in and combine Filter info
+ for (int i = 0; i < processIDS.size(); i++) {
+ string tempFilename = alignreport + toString(processIDS[i]) + ".num.temp";
+ ifstream in;
+ m->openInputFile(tempFilename, in);
- //we were unable to remove some of the bad sequences
- if (badSeqGroups.size() != 0) {
- for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
- m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
- m->mothurOutEndLine();
- }
- }
+ int temp, tempNum;
+ float temp2;
+ //first value is the child's record count; tempNum is then reused for the number of values per vector
+ in >> tempNum; m->gobble(in); num += tempNum;
+ in >> tempNum; m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp2; sims.push_back(temp2); } m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp2; scores.push_back(temp2); } m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp; inserts.push_back(temp); } m->gobble(in);
+
+ in.close();
+ m->mothurRemove(tempFilename);
 }
- return 0;
-
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the seqSumData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add info to vectors.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ /*
+ vector<alignsData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+
+ // Allocate memory for thread data.
+ alignsData* tempSum = new alignsData(alignreport, m, alignLines[i].start, alignLines[i].end, namefile, countfile, nameMap);
+ pDataArray.push_back(tempSum);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i] = CreateThread(NULL, 0, MyAlignsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }*/
+ //the worker threads above are commented out, so the last range is reset to start at 0 and is the only one processed here
+ alignLines[processors-1].start = 0;
+ //do your part
+ num = driverAlignSummary(sims, scores, inserts, alignLines[processors-1]);
+ /*
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
+ for (int k = 0; k < pDataArray[i]->sims.size(); k++) { sims.push_back(pDataArray[i]->sims[k]); }
+ for (int k = 0; k < pDataArray[i]->scores.size(); k++) { scores.push_back(pDataArray[i]->scores[k]); }
+ for (int k = 0; k < pDataArray[i]->inserts.size(); k++) { inserts.push_back(pDataArray[i]->inserts[k]); }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+ */
+#endif
+ return num;
}
catch(exception& e) {
- m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
+ m->errorOut(e, "ScreenSeqsCommand", "createProcessesAlignSummary");
exit(1);
}
}
vector<int> seqLength;
vector<int> ambigBases;
vector<int> longHomoPolymer;
+ vector<int> numNs;
vector<unsigned long long> positions;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
if (pid == 0) {
- driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
+ driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
#else
int numSeqs = 0;
//#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
- numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
+ numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
}else{
- numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile);
+ numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile);
}
if (m->control_pressed) { return 0; }
- //#else
- // numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
- // if (m->control_pressed) { return 0; }
- //#endif
#endif
sort(startPosition.begin(), startPosition.end());
sort(endPosition.begin(), endPosition.end());
sort(seqLength.begin(), seqLength.end());
sort(ambigBases.begin(), ambigBases.end());
sort(longHomoPolymer.begin(), longHomoPolymer.end());
+ sort(numNs.begin(), numNs.end());
//numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100));
else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
+ else if (optimize[i] == "maxn") { maxN = numNs[criteriaPercentile]; m->mothurOut("Optimizing maxn to " + toString(maxN) + "."); m->mothurOutEndLine(); }
}
#ifdef USE_MPI
MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxN, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
}
}else {
MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxN, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
}
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#endif
}
}
/**************************************************************************************/
-int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair filePos) {
+int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, vector<int>& numNs, string filename, linePair filePos) {
try {
ifstream in;
if (current.getName() != "") {
int num = 1;
- if (namefile != "") {
+ if ((namefile != "") || (countfile !="")){
//make sure this sequence is in the namefile, else error
map<string, int>::iterator it = nameMap.find(current.getName());
}
//for each sequence this sequence represents
+ int numns = current.getNumNs();
for (int i = 0; i < num; i++) {
startPosition.push_back(current.getStartPos());
endPosition.push_back(current.getEndPos());
seqLength.push_back(current.getNumBases());
ambigBases.push_back(current.getAmbigBases());
longHomoPolymer.push_back(current.getLongHomoPolymer());
+ numNs.push_back(numns);
}
count++;
}
}
/**************************************************************************************************/
-int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
+int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, vector<int>& numNs, string filename) {
try {
int process = 1;
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
- num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
+ num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[process]);
//pass numSeqs to parent
ofstream out;
for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl;
for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl;
for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl;
+ for (int k = 0; k < numNs.size(); k++) { out << numNs[k] << '\t'; } out << endl;
out.close();
}
}
- num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
+ num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
//force parent to wait until all the processes are done
for (int i=0;i<processIDS.size();i++) {
for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in);
for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } m->gobble(in);
for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } m->gobble(in);
+ for (int k = 0; k < tempNum; k++) { in >> temp; numNs.push_back(temp); } m->gobble(in);
in.close();
m->mothurRemove(tempFilename);
for( int i=0; i<processors-1; i++ ){
// Allocate memory for thread data.
- sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, nameMap);
+ sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, countfile, nameMap);
pDataArray.push_back(tempSum);
//MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
}
//do your part
- num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[processors-1]);
+ num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[processors-1]);
//Wait until all threads have terminated.
WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) { startPosition.push_back(pDataArray[i]->startPosition[k]); }
for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) { endPosition.push_back(pDataArray[i]->endPosition[k]); }
for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]); }
for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) { ambigBases.push_back(pDataArray[i]->ambigBases[k]); }
for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) { longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]); }
+ for (int k = 0; k < pDataArray[i]->numNs.size(); k++) { numNs.push_back(pDataArray[i]->numNs[k]); }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
//***************************************************************************************************************
-int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
+int ScreenSeqsCommand::screenGroupFile(map<string, string> badSeqNames){
try {
ifstream inputGroups;
m->openInputFile(groupfile, inputGroups);
string seqName, group;
- set<string>::iterator it;
+ map<string, string>::iterator it;
map<string, string> variables;
variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
variables["[extension]"] = m->getExtension(groupfile);
while(!inputGroups.eof()){
if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
- inputGroups >> seqName >> group;
+ inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group;
it = badSeqNames.find(seqName);
if(it != badSeqNames.end()){
//we were unable to remove some of the bad sequences
if (badSeqNames.size() != 0) {
for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
- m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
+ m->mothurOut("Your groupfile does not include the sequence " + it->first + " please correct.");
m->mothurOutEndLine();
}
}
}
}
//***************************************************************************************************************
-int ScreenSeqsCommand::screenCountFile(set<string> badSeqNames){
+int ScreenSeqsCommand::screenCountFile(map<string, string> badSeqNames){
try {
ifstream in;
m->openInputFile(countfile, in);
- set<string>::iterator it;
+ map<string, string>::iterator it;
map<string, string> variables;
variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
variables["[extension]"] = m->getExtension(countfile);
//we were unable to remove some of the bad sequences
if (badSeqNames.size() != 0) {
for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
- m->mothurOut("Your count file does not include the sequence " + *it + " please correct.");
+ m->mothurOut("Your count file does not include the sequence " + it->first + " please correct.");
m->mothurOutEndLine();
}
}
}
//***************************************************************************************************************
-int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
- try {
- ifstream inputAlignReport;
- m->openInputFile(alignreport, inputAlignReport);
- string seqName, group;
- set<string>::iterator it;
-
- map<string, string> variables;
- variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport));
- string goodAlignReportFile = getOutputFileName("alignreport", variables);
-
- outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile);
- ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
-
- while (!inputAlignReport.eof()) { // need to copy header
- char c = inputAlignReport.get();
- goodAlignReportOut << c;
- if (c == 10 || c == 13){ break; }
- }
-
- while(!inputAlignReport.eof()){
- if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
-
- inputAlignReport >> seqName;
- it = badSeqNames.find(seqName);
- string line;
- while (!inputAlignReport.eof()) { // need to copy header
- char c = inputAlignReport.get();
- line += c;
- if (c == 10 || c == 13){ break; }
- }
-
- if(it != badSeqNames.end()){
- badSeqNames.erase(it);
- }
- else{
- goodAlignReportOut << seqName << '\t' << line;
- }
- m->gobble(inputAlignReport);
- }
-
- if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
-
- //we were unable to remove some of the bad sequences
- if (badSeqNames.size() != 0) {
- for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
- m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct.");
- m->mothurOutEndLine();
- }
- }
-
- inputAlignReport.close();
- goodAlignReportOut.close();
-
- if (m->control_pressed) { m->mothurRemove(goodAlignReportFile); return 0; }
-
- return 0;
-
- }
- catch(exception& e) {
- m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
- exit(1);
- }
-
-}
-//***************************************************************************************************************
-
-int ScreenSeqsCommand::screenTaxonomy(set<string> badSeqNames){
+int ScreenSeqsCommand::screenTaxonomy(map<string, string> badSeqNames){
try {
ifstream input;
m->openInputFile(taxonomy, input);
string seqName, tax;
- set<string>::iterator it;
+ map<string, string>::iterator it;
map<string, string> variables;
variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(taxonomy));
variables["[extension]"] = m->getExtension(taxonomy);
while(!input.eof()){
if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
- input >> seqName >> tax;
+ input >> seqName; m->gobble(input); input >> tax;
it = badSeqNames.find(seqName);
if(it != badSeqNames.end()){ badSeqNames.erase(it); }
//we were unable to remove some of the bad sequences
if (badSeqNames.size() != 0) {
for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
- m->mothurOut("Your taxonomy file does not include the sequence " + *it + " please correct.");
+ m->mothurOut("Your taxonomy file does not include the sequence " + it->first + " please correct.");
m->mothurOutEndLine();
}
}
}
//***************************************************************************************************************
-int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
+int ScreenSeqsCommand::screenQual(map<string, string> badSeqNames){
try {
ifstream in;
m->openInputFile(qualfile, in);
- set<string>::iterator it;
+ map<string, string>::iterator it;
map<string, string> variables;
variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qualfile));
variables["[extension]"] = m->getExtension(qualfile);
//we were unable to remove some of the bad sequences
if (badSeqNames.size() != 0) {
for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
- m->mothurOut("Your qual file does not include the sequence " + *it + " please correct.");
+ m->mothurOut("Your qual file does not include the sequence " + it->first + " please correct.");
m->mothurOutEndLine();
}
}
}
//**********************************************************************************************************************
-int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
+int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, map<string, string>& badSeqNames){
try {
ofstream goodFile;
m->openOutputFile(goodFName, goodFile);
bool done = false;
int count = 0;
-
+
while (!done) {
if (m->control_pressed) { return 0; }
Sequence currSeq(inFASTA); m->gobble(inFASTA);
if (currSeq.getName() != "") {
bool goodSeq = 1; // innocent until proven guilty
- if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
- if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
- if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
- if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
- if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
- if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
+ string trashCode = "";
+ //have the report files found you bad
+ map<string, string>::iterator it = badSeqNames.find(currSeq.getName());
+ if (it != badSeqNames.end()) { goodSeq = 0; trashCode = it->second; }
+
+ if (summaryfile == "") { //summaryfile includes these so no need to check again
+ if(startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; }
+ if(endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|";}
+ if(maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|";}
+ if(maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|";}
+ if(minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "<length|";}
+ if(maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; trashCode += ">length|";}
+ }
+
+ if (contigsreport == "") { //contigs report includes this so no need to check again
+ if(maxN != -1 && maxN < currSeq.getNumNs()) { goodSeq = 0; trashCode += "n|"; }
+ }
if(goodSeq == 1){
currSeq.printSequence(goodFile);
+ }else{
+ badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl;
+ badSeqNames[currSeq.getName()] = trashCode;
}
- else{
- badAccnosFile << currSeq.getName() << endl;
- badSeqNames.insert(currSeq.getName());
- }
- count++;
+ count++;
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
}
//**********************************************************************************************************************
#ifdef USE_MPI
-int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, set<string>& badSeqNames){
+int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, map<string, string>& badSeqNames){
try {
string outputString = "";
MPI_Status statusGood;
//process seq
if (currSeq.getName() != "") {
bool goodSeq = 1; // innocent until proven guilty
- if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
- if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
- if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
- if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
- if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
- if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
+ string trashCode = "";
+ //have the report files found you bad
+ map<string, string>::iterator it = badSeqNames.find(currSeq.getName());
+ if (it != badSeqNames.end()) { goodSeq = 0; trashCode = it->second; }
+
+ if (summaryfile == "") { //summaryfile includes these so no need to check again
+ if(startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; }
+ if(endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|";}
+ if(maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|";}
+ if(maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|";}
+ if(minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "<length|";}
+ if(maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; trashCode += ">length|";}
+ }
+
+ if (contigsreport == "") { //contigs report includes this so no need to check again
+ if(maxN != -1 && maxN < currSeq.getNumNs()) { goodSeq = 0; trashCode += "n|"; }
+ }
+
if(goodSeq == 1){
outputString = ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
}
else{
- badSeqNames.insert(currSeq.getName());
+ badSeqNames[currSeq.getName()] = trashCode;
//write to bad accnos file
- outputString = currSeq.getName() + "\n";
+ outputString = currSeq.getName() + "\t" + trashCode.substr(0, trashCode.length()-1) + "\n";
length = outputString.length();
char* buf3 = new char[length];
#endif
/**************************************************************************************************/
-int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
+int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, map<string, string>& badSeqNames) {
try {
vector<int> processIDS;
if (ableToOpen == 0) {
badSeqNames.clear();
- string tempName;
+ string tempName, trashCode;
while (!inBad.eof()) {
- inBad >> tempName; m->gobble(inBad);
- badSeqNames.insert(tempName);
+ inBad >> tempName >> trashCode; m->gobble(inBad);
+ badSeqNames[tempName] = trashCode;
}
inBad.close();
}
if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
// Allocate memory for thread data.
- sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
+ sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN, badSeqNames, filename, summaryfile, contigsreport, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
pDataArray.push_back(tempSum);
//default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
- for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
+ for (map<string, string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames[it->first] = it->second; }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
vector<linePair> lines;
- int screenNameGroupFile(set<string>);
- int screenGroupFile(set<string>);
- int screenCountFile(set<string>);
- int screenAlignReport(set<string>);
- int screenQual(set<string>);
- int screenTaxonomy(set<string>);
-
- int driver(linePair, string, string, string, set<string>&);
- int createProcesses(string, string, string, set<string>&);
+ int screenNameGroupFile(map<string, string>);
+ int screenGroupFile(map<string, string>);
+ int screenCountFile(map<string, string>);
+ int screenAlignReport(map<string, string>&);
+ int screenQual(map<string, string>);
+ int screenTaxonomy(map<string, string>);
+ int optimizeContigs();
+ int optimizeAlign();
+ int driver(linePair, string, string, string, map<string, string>&);
+ int createProcesses(string, string, string, map<string, string>&);
+ int screenSummary(map<string, string>&);
+ int screenContigs(map<string, string>&);
+ int runFastaScreening(map<string, string>&);
+ int screenFasta(map<string, string>&);
+ int screenReports(map<string, string>&);
+ int getSummary(vector<unsigned long long>&);
+ int createProcessesCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string);
+ int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, linePair);
+ int getSummaryReport();
+ int driverContigsSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, linePair);
+ int createProcessesContigsSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<linePair>);
+ int driverAlignSummary(vector<float>&, vector<float>&, vector<int>&, linePair);
+ int createProcessesAlignSummary(vector<float>&, vector<float>&, vector<int>&, vector<linePair>);
+
#ifdef USE_MPI
- int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, set<string>&);
+ int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, map<string, string>&);
#endif
bool abort;
- string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile;
- int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria;
+ string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile, contigsreport, summaryfile;
+ int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria, minOverlap, oStart, oEnd, mismatches, maxN, maxInsert;
+ float minSim, minScore;
vector<string> outputNames;
vector<string> optimize;
map<string, int> nameMap;
- int getSummary(vector<unsigned long long>&);
- int createProcessesCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string);
- int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, linePair);
+
};
/**************************************************************************************************/
vector<int> seqLength;
vector<int> ambigBases;
vector<int> longHomoPolymer;
- string filename, namefile;
+ vector<int> numNs;
+ string filename, namefile, countfile;
unsigned long long start;
unsigned long long end;
int count;
sumData(){}
- sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, map<string, int> nam) {
+ sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam) {
+ filename = f;
+ namefile = nf;
+ countfile = cf;
+ m = mout;
+ start = st;
+ end = en;
+ nameMap = nam;
+ count = 0;
+ }
+};
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct contigsSumData {
+    //values gathered by the worker, one entry per sequence a report line represents
+    vector<int> ostartPosition;
+    vector<int> oendPosition;
+    vector<int> oLength;
+    vector<int> omismatches;
+    vector<int> numNs;
+
+    //input file to read, plus the optional name / count file names ("" when unused)
+    string filename;
+    string namefile;
+    string countfile;
+
+    //start: file offset the worker seeks to; end: number of records it should read
+    unsigned long long start;
+    unsigned long long end;
+
+    //number of records handled so far
+    int count;
+    MothurOut* m;
+
+    //sequence name -> how many sequences that name stands for
+    map<string, int> nameMap;
+
+    contigsSumData(){}
+
+    //members are listed in declaration order, which is the order they are initialised in
+    contigsSumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam)
+        : filename(f),
+          namefile(nf),
+          countfile(cf),
+          start(st),
+          end(en),
+          count(0),
+          m(mout),
+          nameMap(nam) {}
+};
+/**************************************************************************************************/
+struct alignsData {
+ vector<float> sims;
+ vector<float> scores;
+ vector<int> inserts;
+ string filename, namefile, countfile;
+ unsigned long long start;
+ unsigned long long end;
+ int count;
+ MothurOut* m;
+ map<string, int> nameMap;
+
+
+ alignsData(){}
+ alignsData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam) {
filename = f;
namefile = nf;
+ countfile = cf;
m = mout;
start = st;
end = en;
count = 0;
}
};
+
/**************************************************************************************************/
//custom data structure for threads to use.
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
struct sumScreenData {
- int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength;
+ int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN;
unsigned long long start;
unsigned long long end;
int count;
MothurOut* m;
string goodFName, badAccnosFName, filename;
- set<string> badSeqNames;
+ map<string, string> badSeqNames;
+ string summaryfile, contigsreport;
sumScreenData(){}
- sumScreenData(int s, int e, int a, int h, int minl, int maxl, string f, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) {
+ sumScreenData(int s, int e, int a, int h, int minl, int maxl, int mn, map<string, string> bs, string f, string sum, string cont, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) {
startPos = s;
endPos = e;
minLength = minl;
maxLength = maxl;
maxAmbig = a;
maxHomoP = h;
+ maxN = mn;
filename = f;
goodFName = gf;
badAccnosFName = bf;
m = mout;
start = st;
end = en;
+ summaryfile = sum;
+ contigsreport = cont;
+ badSeqNames = bs;
count = 0;
}
};
in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
}
- pDataArray->count = pDataArray->end;
+
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+ pDataArray->count++;
+
if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; }
Sequence current(in); pDataArray->m->gobble(in);
if (current.getName() != "") {
int num = 1;
- if (pDataArray->namefile != "") {
+ if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
//make sure this sequence is in the namefile, else error
map<string, int>::iterator it = pDataArray->nameMap.find(current.getName());
}
//for each sequence this sequence represents
+ int numns = current.getNumNs();
for (int i = 0; i < num; i++) {
pDataArray->startPosition.push_back(current.getStartPos());
pDataArray->endPosition.push_back(current.getEndPos());
pDataArray->seqLength.push_back(current.getNumBases());
pDataArray->ambigBases.push_back(current.getAmbigBases());
pDataArray->longHomoPolymer.push_back(current.getLongHomoPolymer());
+ pDataArray->numNs.push_back(numns);
}
}
}
}
/**************************************************************************************************/
+static DWORD WINAPI MyContigsSumThreadFunction(LPVOID lpParam){
+    //Thread entry point: reads pDataArray->end records of a contigs report, starting at
+    //pDataArray->start, and appends the overlap start/end/length, mismatch count and N count
+    //of each record to the vectors in the contigsSumData passed through lpParam.
+    //Returns 0 when all records were read, 1 when control_pressed was seen.
+    contigsSumData* pDataArray;
+    pDataArray = (contigsSumData*)lpParam;
+
+    try {
+        string name;
+        //report columns, in the order they are read below:
+        //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns
+        int length, OLength, thisOStart, thisOEnd, numMisMatches, numns;
+
+        ifstream in;
+        pDataArray->m->openInputFile(pDataArray->filename, in);
+
+        //the worker that starts at the top of the file skips the first (header) line
+        if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
+            in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in);
+        }else { //this accounts for the difference in line endings.
+            in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
+        }
+
+        for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+
+            //counted per record so the caller can compare count against end afterwards
+            pDataArray->count++;
+
+            if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; }
+
+            in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; pDataArray->m->gobble(in);
+
+            int num = 1;
+            if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
+                //make sure this sequence is in the name map, else error
+                map<string, int>::iterator it = pDataArray->nameMap.find(name);
+
+                if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
+                else { num = it->second; }
+            }
+
+            //for each sequence this sequence represents
+            //(own index name so the record counter i above is not shadowed)
+            for (int j = 0; j < num; j++) {
+                pDataArray->ostartPosition.push_back(thisOStart);
+                pDataArray->oendPosition.push_back(thisOEnd);
+                pDataArray->oLength.push_back(OLength);
+                pDataArray->omismatches.push_back(numMisMatches);
+                pDataArray->numNs.push_back(numns);
+            }
+        }
+
+        in.close();
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        //report under this function's real name so the error can be traced
+        pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyContigsSumThreadFunction");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+static DWORD WINAPI MyAlignsThreadFunction(LPVOID lpParam){
+    //Thread entry point: reads pDataArray->end records of an align report, starting at
+    //pDataArray->start, and appends the similarity, search score and longest insert of each
+    //record to the vectors in the alignsData passed through lpParam.
+    //Returns 0 when all records were read, 1 when control_pressed was seen.
+    alignsData* pDataArray;
+    pDataArray = (alignsData*)lpParam;
+
+    try {
+
+        string name, TemplateName, SearchMethod, AlignmentMethod;
+        //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template
+        //checking for minScore, maxInsert, minSim
+        int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert;
+        float SearchScore, SimBtwnQueryTemplate;
+
+        ifstream in;
+        pDataArray->m->openInputFile(pDataArray->filename, in);
+
+        //the worker that starts at the top of the file skips the first (header) line
+        if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
+            in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in);
+        }else { //this accounts for the difference in line endings.
+            in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
+        }
+
+        for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+
+            //counted per record so the caller can compare count against end afterwards
+            pDataArray->count++;
+
+            if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; }
+
+            in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; pDataArray->m->gobble(in);
+
+            int num = 1;
+            if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
+                //make sure this sequence is in the name map, else error
+                map<string, int>::iterator it = pDataArray->nameMap.find(name);
+
+                if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
+                else { num = it->second; }
+            }
+
+            //for each sequence this sequence represents
+            //(own index name so the record counter i above is not shadowed)
+            for (int j = 0; j < num; j++) {
+                pDataArray->sims.push_back(SimBtwnQueryTemplate);
+                pDataArray->scores.push_back(SearchScore);
+                pDataArray->inserts.push_back(LongestInsert);
+            }
+        }
+
+        in.close();
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyAlignsThreadFunction");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){
sumScreenData* pDataArray;
pDataArray = (sumScreenData*)lpParam;
}else { //this accounts for the difference in line endings.
in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
}
-
- pDataArray->count = pDataArray->end;
+
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+ pDataArray->count++;
+
if (pDataArray->m->control_pressed) { in.close(); badAccnosFile.close(); goodFile.close(); pDataArray->count = 1; return 1; }
Sequence currSeq(in); pDataArray->m->gobble(in);
if (currSeq.getName() != "") {
bool goodSeq = 1; // innocent until proven guilty
- if(goodSeq == 1 && pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos()) { goodSeq = 0; }
- if(goodSeq == 1 && pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos()) { goodSeq = 0; }
- if(goodSeq == 1 && pDataArray->maxAmbig != -1 && pDataArray->maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
- if(goodSeq == 1 && pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
- if(goodSeq == 1 && pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases()) { goodSeq = 0; }
- if(goodSeq == 1 && pDataArray->maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases()) { goodSeq = 0; }
+ string trashCode = "";
+ //have the report files found you bad
+ map<string, string>::iterator it = pDataArray->badSeqNames.find(currSeq.getName());
+ if (it != pDataArray->badSeqNames.end()) { goodSeq = 0; trashCode = it->second; } //found it
+
+ if (pDataArray->summaryfile == "") {
+ if(pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; }
+ if(pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|"; }
+ if(pDataArray->maxAmbig != -1 && pDataArray->maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|"; }
+ if(pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|"; }
+ if(pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "<length|"; }
+ if(pDataArray->maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases()) { goodSeq = 0; trashCode += ">length|"; }
+ }
+ if (pDataArray->contigsreport == "") { //contigs report includes this so no need to check again
+ if(pDataArray->maxN != -1 && pDataArray->maxN < currSeq.getNumNs()) { goodSeq = 0; trashCode += "n|"; }
+ }
+
if(goodSeq == 1){
currSeq.printSequence(goodFile);
}
else{
- badAccnosFile << currSeq.getName() << endl;
- pDataArray->badSeqNames.insert(currSeq.getName());
+ badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl;
+ pDataArray->badSeqNames[currSeq.getName()] = trashCode;
}
}
}
else{
if(reportFileName != ""){
- m->mothurOut("we are ignoring the report file if your sequences are not aligned. we will check that the sequences in your fasta and and qual fileare the same length.");
+ m->mothurOut("we are ignoring the report file if your sequences are not aligned. we will check that the sequences in your fasta and and qual file are the same length.");
m->mothurOutEndLine();
}
}
int numParentSeqs = -1;
int closestRefIndex = -1;
- numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport);
+ string querySeq = query.getAligned();
+ if (!aligned) { querySeq = query.getUnaligned(); }
+
+ numParentSeqs = chimeraTest.analyzeQuery(query.getName(), querySeq, outChimeraReport);
closestRefIndex = chimeraTest.getClosestRefIndex();
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) { startPosition.push_back(pDataArray[i]->startPosition[k]); }
for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) { endPosition.push_back(pDataArray[i]->endPosition[k]); }
for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]); }
in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
}
- pDataArray->count = pDataArray->end;
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+ pDataArray->count++;
+
if (pDataArray->m->control_pressed) { in.close(); outSummary.close(); pDataArray->count = 1; return 1; }
Sequence current(in); pDataArray->m->gobble(in);
m = MothurOut::getInstance();
initialize();
name = newName;
+
+ for (int i = 0; i < name.length(); i++) {
+ if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
+ }
//setUnaligned removes any gap characters for us
setUnaligned(sequence);
m = MothurOut::getInstance();
initialize();
name = newName;
+
+ for (int i = 0; i < name.length(); i++) {
+ if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
+ }
//setUnaligned removes any gap characters for us
setUnaligned(sequence);
m = MothurOut::getInstance();
initialize();
- fastaString >> name;
-
- if (name.length() != 0) {
+ name = getSequenceName(fastaString);
- name = name.substr(1);
+ if (!m->control_pressed) {
string sequence;
//read comments
setUnaligned(sequence);
if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
-
- }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
+ }
}
catch(exception& e) {
m = MothurOut::getInstance();
initialize();
- fastaString >> name;
-
- if (name.length() != 0) {
+ name = getSequenceName(fastaString);
- name = name.substr(1);
+ if (!m->control_pressed) {
string sequence;
//read comments
if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
- }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
+ }
}
catch(exception& e) {
try {
m = MothurOut::getInstance();
initialize();
- fastaFile >> name;
-
- if (name.length() != 0) {
+ name = getSequenceName(fastaFile);
- name = name.substr(1);
+ if (!m->control_pressed) {
string sequence;
if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
- }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
+ }
}
catch(exception& e) {
try {
m = MothurOut::getInstance();
initialize();
- fastaFile >> name;
extraInfo = "";
- if (name.length() != 0) {
-
- name = name.substr(1);
-
+ name = getSequenceName(fastaFile);
+
+ if (!m->control_pressed) {
string sequence;
//read comments
setUnaligned(sequence);
if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
-
- }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
+ }
}
catch(exception& e) {
try {
m = MothurOut::getInstance();
initialize();
- fastaFile >> name;
+ name = getSequenceName(fastaFile);
- if (name.length() != 0) {
- name = name.substr(1);
+ if (!m->control_pressed) {
string sequence;
//read comments
if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
- }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
+ }
}
catch(exception& e) {
exit(1);
}
}
-
+//********************************************************************************************************************
+string Sequence::getSequenceName(ifstream& fastaFile) {
+    //Reads the next whitespace-delimited token from fastaFile and returns it as the sequence
+    //name: the first character is dropped and every ':' becomes '_'.
+    //When nothing can be read, an error is logged, control_pressed is set and "" is returned.
+    try {
+        string name = "";
+        fastaFile >> name;
+
+        //blank token: report where we were and tell the caller to stop
+        if (name.empty()) {
+            m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;
+            return name;
+        }
+
+        //drop the first character of the token
+        name.erase(0, 1);
+
+        //swap each ':' for '_' and record that a name was altered
+        for (string::size_type pos = name.find(':'); pos != string::npos; pos = name.find(':', pos + 1)) {
+            name[pos] = '_';
+            m->changedSeqNames = true;
+        }
+
+        return name;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "Sequence", "getSequenceName");
+        exit(1);
+    }
+}
+//********************************************************************************************************************
+string Sequence::getSequenceName(istringstream& fastaFile) {
+    //Reads the next whitespace-delimited token from fastaFile and returns it as the sequence
+    //name: the first character is dropped and every ':' becomes '_'.
+    //When nothing can be read, an error is logged, control_pressed is set and "" is returned.
+    try {
+        string name = "";
+        fastaFile >> name;
+
+        //blank token: report where we were and tell the caller to stop
+        if (name.empty()) {
+            m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;
+            return name;
+        }
+
+        //drop the first character of the token
+        name.erase(0, 1);
+
+        //swap each ':' for '_' and record that a name was altered
+        for (string::size_type pos = name.find(':'); pos != string::npos; pos = name.find(':', pos + 1)) {
+            name[pos] = '_';
+            m->changedSeqNames = true;
+        }
+
+        return name;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "Sequence", "getSequenceName");
+        exit(1);
+    }
+}
//********************************************************************************************************************
string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) {
try {
int Sequence::getNumBases(){
return numBases;
}
+//********************************************************************************************************************
+
+int Sequence::getNumNs(){
+    //Returns how many characters of the unaligned sequence are 'N' or 'n'.
+    int numNs = 0;
+    for (string::size_type i = 0; i < unaligned.length(); i++) {
+        //toupper is only defined for values representable as unsigned char (or EOF);
+        //convert first so a byte >= 0x80 in the input cannot trigger undefined behaviour
+        if (toupper(static_cast<unsigned char>(unaligned[i])) == 'N') { numNs++; }
+    }
+    return numNs;
+}
//********************************************************************************************************************
string getPairwise();
string getUnaligned();
string getInlineSeq();
+ int getNumNs();
int getNumBases();
int getStartPos();
int getEndPos();
string getCommentString(ifstream&);
string getSequenceString(istringstream&, int&);
string getCommentString(istringstream&);
+ string getSequenceName(ifstream&);
+ string getSequenceName(istringstream&);
string name;
string unaligned;
string aligned;
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " groups assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (int j = 0; j < pDataArray[i]->mapfileNames.size(); j++) {
mapfileNames.push_back(pDataArray[i]->mapfileNames[j]);
}
MothurOut* m;
int start;
int end;
- int sigma, threadID;
+ int sigma, threadID, count;
vector<string> groups;
vector<string> mapfileNames;
sigma = s;
threadID = tid;
groups = gr;
+ count=0;
}
};
//precluster each group
for (int k = pDataArray->start; k < pDataArray->end; k++) {
+ pDataArray->count++;
+
int start = time(NULL);
if (pDataArray->m->control_pressed) { return 0; }
~SplitGroupCommand() {}
vector<string> setParameters();
- string getCommandName() { return "split.group"; }
+ string getCommandName() { return "split.groups"; }
string getCommandCategory() { return "Sequence Processing"; }
string getHelpString();
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
- numSeqs += pDataArray[i]->count;
+ numSeqs += pDataArray[i]->numSeqs;
+ if (pDataArray[i]->count != pDataArray[i]->end) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
int tempNum = pDataArray[i]->position.size();
if (position.size() < tempNum) { position.resize(tempNum, 0); }
if (averageQ.size() < tempNum) { averageQ.resize(tempNum, 0); }
string filename;
unsigned long long start;
unsigned long long end;
- int count;
+ int count, numSeqs;
MothurOut* m;
bool hasNameMap;
map<string, int> nameMap;
in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
}
- int count = 0;
+ pDataArray->count = 0;
+ pDataArray->numSeqs = 0;
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; }
else { pDataArray->scores.at(i)[thisScores[i]] += num; }
}
- count += num;
+ pDataArray->numSeqs += num;
+ pDataArray->count++;
}
}
- pDataArray->count = count;
in.close();
return 0;
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " groups assigned to it, quitting. \n"); m->control_pressed = true;
+ }
m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName);
m->mothurRemove((sumFileName + toString(processIDS[i]) + ".temp"));
if (iters != 0) {
//we need to find the average distance and standard deviation for each groups distance
-
- vector< vector<seqDist> > calcAverages; calcAverages.resize(sumCalculators.size());
- for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
- calcAverages[i].resize(calcDistsTotals[0][i].size());
-
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].seq1 = calcDists[i][j].seq1;
- calcAverages[i][j].seq2 = calcDists[i][j].seq2;
- calcAverages[i][j].dist = 0.0;
- }
- }
-
- for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
- for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
- }
- }
- }
-
- for (int i = 0; i < calcAverages.size(); i++) { //finds average.
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].dist /= (float) iters;
- }
- }
+ vector< vector<seqDist> > calcAverages = m->getAverages(calcDistsTotals);
//find standard deviation
- vector< vector<seqDist> > stdDev; stdDev.resize(sumCalculators.size());
- for (int i = 0; i < stdDev.size(); i++) { //initialize sums to zero.
- stdDev[i].resize(calcDistsTotals[0][i].size());
-
- for (int j = 0; j < stdDev[i].size(); j++) {
- stdDev[i][j].seq1 = calcDists[i][j].seq1;
- stdDev[i][j].seq2 = calcDists[i][j].seq2;
- stdDev[i][j].dist = 0.0;
- }
- }
-
- for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
- for (int i = 0; i < stdDev.size(); i++) {
- for (int j = 0; j < stdDev[i].size(); j++) {
- stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist));
- }
- }
- }
-
- for (int i = 0; i < stdDev.size(); i++) { //finds average.
- for (int j = 0; j < stdDev[i].size(); j++) {
- stdDev[i][j].dist /= (float) iters;
- stdDev[i][j].dist = sqrt(stdDev[i][j].dist);
- }
- }
+ vector< vector<seqDist> > stdDev = m->getStandardDeviation(calcDistsTotals, calcAverages);
//print results
for (int i = 0; i < calcDists.size(); i++) {
unsigned long long end;
MothurOut* m;
string sumFile;
+ int count;
summarySharedData(){}
summarySharedData(string sf, MothurOut* mout, unsigned long long st, unsigned long long en, vector<string> est, vector<SharedRAbundVector*> lu) {
end = en;
Estimators = est;
thisLookup = lu;
+ count=0;
}
};
/**************************************************************************************************/
vector<SharedRAbundVector*> subset;
for (int k = pDataArray->start; k < pDataArray->end; k++) { // pass cdd each set of groups to compare
-
+ pDataArray->count++;
for (int l = 0; l < k; l++) {
outputFileHandle << pDataArray->thisLookup[0]->getLabel() << '\t';
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " groups assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) { delete pDataArray[i]->thisLookup[j]; }
for (int k = 0; k < calcDists.size(); k++) {
if (iters != 1) {
//we need to find the average distance and standard deviation for each groups distance
-
- vector< vector<seqDist> > calcAverages; calcAverages.resize(treeCalculators.size());
- for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
- calcAverages[i].resize(calcDistsTotals[0][i].size());
-
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].seq1 = calcDists[i][j].seq1;
- calcAverages[i][j].seq2 = calcDists[i][j].seq2;
- calcAverages[i][j].dist = 0.0;
- }
- }
-
- for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
- for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
- }
- }
- }
-
- for (int i = 0; i < calcAverages.size(); i++) { //finds average.
- for (int j = 0; j < calcAverages[i].size(); j++) {
- calcAverages[i][j].dist /= (float) iters;
- }
- }
+ vector< vector<seqDist> > calcAverages = m->getAverages(calcDistsTotals);
//create average tree for each calc
for (int i = 0; i < calcDists.size(); i++) {
unsigned long long start;
unsigned long long end;
MothurOut* m;
+ int count;
treeSharedData(){}
treeSharedData(MothurOut* mout, unsigned long long st, unsigned long long en, vector<string> est, vector<SharedRAbundVector*> lu) {
end = en;
Estimators = est;
thisLookup = lu;
+ count=0;
}
};
/**************************************************************************************************/
vector<SharedRAbundVector*> subset;
for (int k = pDataArray->start; k < pDataArray->end; k++) { // pass cdd each set of groups to compare
+ pDataArray->count++;
+
for (int l = 0; l < k; l++) {
if (k != l) { //we dont need to similiarity of a groups to itself
//**********************************************************************************************************************
vector<string> TrimFlowsCommand::setParameters(){
try {
- CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none","flow",false,true,true); parameters.push_back(pflow);
+ CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none","flow-file",false,true,true); parameters.push_back(pflow);
CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(poligos);
CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "","",false,false); parameters.push_back(pmaxhomop);
CommandParameter pmaxflows("maxflows", "Number", "", "450", "", "", "","",false,false); parameters.push_back(pmaxflows);
if (type == "flow") { pattern = "[filename],[tag],flow"; }
else if (type == "fasta") { pattern = "[filename],flow.fasta"; }
- else if (type == "file") { pattern = "[filename],[tag],flow.files"; }
+ else if (type == "file") { pattern = "[filename],flow.files"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
if(allFiles){
set<string> namesAlreadyProcessed;
- variables["[tag]"] = "";
flowFilesFileName = getOutputFileName("file",variables);
m->openOutputFile(flowFilesFileName, output);
output.close();
}
else{
- variables["[tag]"] = "";
flowFilesFileName = getOutputFileName("file",variables);
m->openOutputFile(flowFilesFileName, output);
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != pDataArray[i]->lineEnd) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->lineEnd) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
map<string, int>::iterator it2 = groupCounts.find(it->first);
if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
- pDataArray->count = pDataArray->lineEnd;
+ pDataArray->count = 0;
for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
if (pDataArray->m->control_pressed) {
string origSeq = currSeq.getUnaligned();
if (origSeq != "") {
+ pDataArray->count++;
int barcodeIndex = 0;
int primerIndex = 0;
string thisGroup = "";
if (pDataArray->createGroup) {
if(pDataArray->barcodes.size() != 0){
- string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
+ thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
if (pDataArray->primers.size() != 0) {
if (pDataArray->primerNameVector[primerIndex] != "") {
if(thisGroup != "") {
int UnifracUnweightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
try {
//we need to find the average distance and standard deviation for each groups distance
-
//finds sum
- vector<double> averages; averages.resize(numComp, 0);
- for (int thisIter = 0; thisIter < subsampleIters; thisIter++) {
- for (int i = 0; i < dists[thisIter].size(); i++) {
- averages[i] += dists[thisIter][i];
- }
- }
-
- //finds average.
- for (int i = 0; i < averages.size(); i++) { averages[i] /= (float) subsampleIters; }
+ vector<double> averages = m->getAverages(dists);
//find standard deviation
- vector<double> stdDev; stdDev.resize(numComp, 0);
-
- for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
- for (int j = 0; j < dists[thisIter].size(); j++) {
- stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j]));
- }
- }
- for (int i = 0; i < stdDev.size(); i++) {
- stdDev[i] /= (float) subsampleIters;
- stdDev[i] = sqrt(stdDev[i]);
- }
+ vector<double> stdDev = m->getStandardDeviation(dists, averages);
//make matrix with scores in it
- vector< vector<double> > avedists; avedists.resize(m->getNumGroups());
+ vector< vector<double> > avedists; //avedists.resize(m->getNumGroups());
for (int i = 0; i < m->getNumGroups(); i++) {
- avedists[i].resize(m->getNumGroups(), 0.0);
+ vector<double> temp;
+ for (int j = 0; j < m->getNumGroups(); j++) { temp.push_back(0.0); }
+ avedists.push_back(temp);
}
//make matrix with scores in it
- vector< vector<double> > stddists; stddists.resize(m->getNumGroups());
+ vector< vector<double> > stddists; //stddists.resize(m->getNumGroups());
for (int i = 0; i < m->getNumGroups(); i++) {
- stddists[i].resize(m->getNumGroups(), 0.0);
+ vector<double> temp;
+ for (int j = 0; j < m->getNumGroups(); j++) { temp.push_back(0.0); }
+ //stddists[i].resize(m->getNumGroups(), 0.0);
+ stddists.push_back(temp);
}
+ if (m->debug) { m->mothurOut("[DEBUG]: about to fill matrix.\n"); }
+
//flip it so you can print it
int count = 0;
for (int r=0; r<m->getNumGroups(); r++) {
}
}
+ if (m->debug) { m->mothurOut("[DEBUG]: done filling matrix.\n"); }
+
map<string, string> variables;
variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(treefile));
variables["[tag]"] = toString(treeNum+1);
//we need to find the average distance and standard deviation for each groups distance
//finds sum
- vector<double> averages; averages.resize(numComp, 0);
- for (int thisIter = 0; thisIter < subsampleIters; thisIter++) {
- for (int i = 0; i < dists[thisIter].size(); i++) {
- averages[i] += dists[thisIter][i];
- }
- }
-
- //finds average.
- for (int i = 0; i < averages.size(); i++) { averages[i] /= (float) subsampleIters; }
+ vector<double> averages = m->getAverages(dists);
//find standard deviation
- vector<double> stdDev; stdDev.resize(numComp, 0);
-
- for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
- for (int j = 0; j < dists[thisIter].size(); j++) {
- stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j]));
- }
- }
- for (int i = 0; i < stdDev.size(); i++) {
- stdDev[i] /= (float) subsampleIters;
- stdDev[i] = sqrt(stdDev[i]);
- }
+ vector<double> stdDev = m->getStandardDeviation(dists, averages);
//make matrix with scores in it
- vector< vector<double> > avedists; avedists.resize(m->getNumGroups());
+ vector< vector<double> > avedists; //avedists.resize(m->getNumGroups());
for (int i = 0; i < m->getNumGroups(); i++) {
- avedists[i].resize(m->getNumGroups(), 0.0);
+ vector<double> temp;
+ for (int j = 0; j < m->getNumGroups(); j++) { temp.push_back(0.0); }
+ avedists.push_back(temp);
}
//make matrix with scores in it
- vector< vector<double> > stddists; stddists.resize(m->getNumGroups());
+ vector< vector<double> > stddists; //stddists.resize(m->getNumGroups());
for (int i = 0; i < m->getNumGroups(); i++) {
- stddists[i].resize(m->getNumGroups(), 0.0);
+ vector<double> temp;
+ for (int j = 0; j < m->getNumGroups(); j++) { temp.push_back(0.0); }
+ //stddists[i].resize(m->getNumGroups(), 0.0);
+ stddists.push_back(temp);
}
+
//flip it so you can print it
int count = 0;
//in essence you want to run it like a single
if (vCalcs[i]->getName() == "sharedsobs") {
singleCalc = new Sobs();
- if (sharedOtus) {
+ if (sharedOtus && (labels.size() != 0)) {
string filenameShared = outputDir + m->getRootName(m->getSimpleName(inputfile)) + lookup[0]->getLabel() + "." + vCalcs[i]->getName() + ".sharedotus";
outputNames.push_back(filenameShared);
subset.push_back(lookup[0]); subset.push_back(lookup[1]);
vector<string> labels;
vector<double> sharedab = vCalcs[i]->getValues(subset, labels);
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup() << '\t' << labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.clear();
subset.push_back(lookup[0]); subset.push_back(lookup[2]);
vector<double> sharedac = vCalcs[i]->getValues(subset, labels);
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.clear();
subset.push_back(lookup[1]); subset.push_back(lookup[2]);
vector<double> sharedbc = vCalcs[i]->getValues(subset, labels);
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[1]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.clear();
subset.push_back(lookup[0]); subset.push_back(lookup[1]); subset.push_back(lookup[2]);
vector<double> sharedabc = vCalcs[i]->getValues(subset, labels);
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[0]); subset.push_back(lookup[1]);
data = vCalcs[i]->getValues(subset, labels);
sharedAB = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[0]); subset.push_back(lookup[2]);
data = vCalcs[i]->getValues(subset, labels);
sharedAC = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[0]); subset.push_back(lookup[3]);
data = vCalcs[i]->getValues(subset, labels);
sharedAD = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[3]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[1]); subset.push_back(lookup[2]);
data = vCalcs[i]->getValues(subset, labels);
sharedBC = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[1]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[1]); subset.push_back(lookup[3]);
data = vCalcs[i]->getValues(subset, labels);
sharedBD = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[1]->getGroup() + "-" + lookup[3]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[2]); subset.push_back(lookup[3]);
data = vCalcs[i]->getValues(subset, labels);
sharedCD = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[2]->getGroup() + "-" + lookup[3]->getGroup() << '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[0]); subset.push_back(lookup[1]); subset.push_back(lookup[2]);
data = vCalcs[i]->getValues(subset, labels);
sharedABC = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup()+ "-" + lookup[2]->getGroup()<< '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[0]); subset.push_back(lookup[2]); subset.push_back(lookup[3]);
data = vCalcs[i]->getValues(subset, labels);
sharedACD = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[2]->getGroup()+ "-" + lookup[3]->getGroup()<< '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
subset.push_back(lookup[1]); subset.push_back(lookup[2]); subset.push_back(lookup[3]);
data = vCalcs[i]->getValues(subset,labels);
sharedBCD = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[1]->getGroup() + "-" + lookup[2]->getGroup()+ "-" + lookup[3]->getGroup()<< '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
}
- if (labels.size() != 0) { outShared << labels[labels.size()-1]; }
+ outShared << labels[labels.size()-1];
outShared << endl;
}
//cout << "num bcd = " << sharedBCD << endl;
subset.push_back(lookup[0]); subset.push_back(lookup[1]); subset.push_back(lookup[3]);
data = vCalcs[i]->getValues(subset, labels);
sharedABD = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup()+ "-" + lookup[3]->getGroup()<< '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";
}
- if (labels.size() != 0) { outShared << labels[labels.size()-1]; }
+ outShared << labels[labels.size()-1];
outShared << endl;
}
//cout << "num abd = " << sharedABD << endl;
//get estimate for all four
data = vCalcs[i]->getValues(lookup, labels);
sharedABCD = data[0];
- if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) {
+ if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) {
outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup() + "-" + lookup[2]->getGroup()+ "-" + lookup[3]->getGroup()<< '\t'<< labels.size() << '\t';
for (int k = 0; k < labels.size()-1; k++) {
outShared << labels[k] << ",";