CommandParameter pchunks("chunks", "Number", "", "4", "", "", "",false,false); parameters.push_back(pchunks);
CommandParameter pminchunk("minchunk", "Number", "", "64", "", "", "",false,false); parameters.push_back(pminchunk);
CommandParameter pidsmoothwindow("idsmoothwindow", "Number", "", "32", "", "", "",false,false); parameters.push_back(pidsmoothwindow);
+ CommandParameter pdups("dups", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pdups);
+
//CommandParameter pminsmoothid("minsmoothid", "Number", "", "0.95", "", "", "",false,false); parameters.push_back(pminsmoothid);
CommandParameter pmaxp("maxp", "Number", "", "2", "", "", "",false,false); parameters.push_back(pmaxp);
CommandParameter pskipgaps("skipgaps", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pskipgaps);
string helpString = "";
helpString += "The chimera.uchime command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
helpString += "This command is a wrapper for uchime written by Robert C. Edgar.\n";
- helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n";
+ helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dups, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n";
helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
helpString += "The count parameter allows you to provide a count file, if you are using template=self. \n";
helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
+ helpString += "If the dups parameter is true, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=t.\n";
helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n";
helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
helpString += "The abskew parameter can only be used with template=self. Minimum abundance skew. Default 1.9. Abundance skew is: min [ abund(parent1), abund(parent2) ] / abund(query).\n";
temp = validParameter.validFile(parameters, "skipgaps2", false); if (temp == "not found") { temp = "t"; }
skipgaps2 = m->isTrue(temp);
+
+ string usedDups = "true";
+ temp = validParameter.validFile(parameters, "dups", false);
+ if (temp == "not found") {
+ if (groupfile != "") { temp = "true"; }
+ else { temp = "false"; usedDups = ""; }
+ }
+ dups = m->isTrue(temp);
+
if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
if (hasCount) { delete cparser; }
else { delete sparser; }
-
- int totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, alnsFileName);
-
- m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found."); m->mothurOutEndLine();
- m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine();
+
+ if (dups) {
+ int totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, alnsFileName);
+ m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found."); m->mothurOutEndLine();
+ m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine();
+ }
+
if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
}else{
//find unique name
itUnique = uniqueNames.find(name);
- if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find " + name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
else {
itChimeras = chimerasInFile.find((itUnique->second));
vector<char*> cPara;
string uchimeCommand = uchimeLocation;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- uchimeCommand += " ";
-#else
- uchimeCommand = "\"" + uchimeCommand + "\"";
-#endif
+ uchimeCommand = "\"" + uchimeCommand + "\" ";
+
char* tempUchime;
tempUchime= new char[uchimeCommand.length()+1];
*tempUchime = '\0';
int driver(string, string, string, string, int&);
int createProcesses(string, string, string, string, int&);
- bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount, hasName;
+ bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount, hasName, dups;
string fastafile, groupfile, templatefile, outputDir, namefile, countfile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation;
int processors;
for (int j = 0; j < cPara.size(); j++) { uchimeParameters[j] = cPara[j]; commandString += toString(cPara[j]) + " "; }
//int numArgs = cPara.size();
+ commandString = "\"" + commandString + "\"";
+
//uchime_main(numArgs, uchimeParameters);
//cout << "commandString = " << commandString << endl;
if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: uchime command = " + commandString + ".\n"); }
try {
CommandParameter pfasta("ffastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
CommandParameter prfasta("rfastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(prfasta);
- CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "",false,false); parameters.push_back(palign);
+ CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
+ CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
+ CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
+ CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
+ CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
+ CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
+
+ CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "",false,false); parameters.push_back(palign);
+ CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
try {
string helpString = "";
helpString += "The make.contigs command reads a forward fastq file and a reverse fastq file and outputs new fasta and quality files.\n";
- helpString += "The make.contigs command parameters are ffastq, rfastq, align, match, mismatch, gapopen, gapextend and processors.\n";
+ helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n";
+ helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n";
helpString += "The ffastq and rfastq parameters are required.\n";
helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n";
+ helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
+ helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
+ helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
+ helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
+ helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n";
helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n";
helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=40.\n";
helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
+ helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
helpString += "The make.contigs command should be in the following format: \n";
helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n";
helpString += "Note: No spaces between parameter labels (i.e. ffastq), '=' and parameters (i.e.yourForwardFastqFile).\n";
else {
if (type == "fasta") { outputFileName = "contigs.fasta"; }
else if (type == "qfile") { outputFileName = "contigs.qual"; }
+ else if (type == "group") { outputFileName = "groups"; }
else if (type == "mismatch") { outputFileName = "contigs.mismatch"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
}
vector<string> tempOutNames;
outputTypes["fasta"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
+ outputTypes["group"] = tempOutNames;
outputTypes["mismatch"] = tempOutNames;
}
catch(exception& e) {
outputTypes["fasta"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
outputTypes["mismatch"] = tempOutNames;
+ outputTypes["group"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["rfastq"] = inputDir + it->second; }
}
+
+ it = parameters.find("oligos");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["oligos"] = inputDir + it->second; }
+ }
}
ffastqfile = validParameter.validFile(parameters, "ffastq", true);
if (rfastqfile == "not open") { rfastqfile = ""; abort = true; }
else if (rfastqfile == "not found") { rfastqfile = ""; abort=true; m->mothurOut("The rfastq parameter is required.\n"); }
+ oligosfile = validParameter.validFile(parameters, "oligos", true);
+ if (oligosfile == "not found") { oligosfile = ""; }
+ else if(oligosfile == "not open") { abort = true; }
+ else { m->setOligosFile(oligosfile); }
+
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(ffastqfile); }
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
m->mothurConvert(temp, processors);
+
+ temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, bdiffs);
+
+ temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, pdiffs);
+
+ temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, ldiffs);
+
+ temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, sdiffs);
+
+ temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
+ m->mothurConvert(temp, tdiffs);
+
+ if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
+
+ temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
+ allFiles = m->isTrue(temp);
align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; }
if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; }
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { currentQual = (itTypes->second)[0]; m->setQualFile(currentQual); }
}
+
+ string currentGroup = "";
+ itTypes = outputTypes.find("group");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { currentGroup = (itTypes->second)[0]; m->setGroupFile(currentGroup); }
+ }
//output files created by command
m->mothurOutEndLine();
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
m->mothurOutEndLine();
-
return 0;
}
catch(exception& e) {
exit(1);
}
}
+//***************************************************************************************************************
+//illumina data requires paired forward and reverse data
+//BARCODE atgcatgc atgcatgc groupName
+//PRIMER atgcatgc atgcatgc groupName
+//PRIMER atgcatgc atgcatgc
+bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
+ try {
+ ifstream in;
+ m->openInputFile(oligosfile, in);
+
+ ofstream test;
+
+ string type, foligo, roligo, group;
+
+ int indexPrimer = 0;
+ int indexBarcode = 0;
+ set<string> uniquePrimers;
+ set<string> uniqueBarcodes;
+
+ while(!in.eof()){
+
+ in >> type;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
+
+ if(type[0] == '#'){
+ while (!in.eof()) { char c = in.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
+ m->gobble(in);
+ }
+ else{
+ m->gobble(in);
+ //make type case insensitive
+ for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
+
+ in >> foligo;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: reading - " + foligo + ".\n"); }
+
+ for(int i=0;i<foligo.length();i++){
+ foligo[i] = toupper(foligo[i]);
+ if(foligo[i] == 'U') { foligo[i] = 'T'; }
+ }
+
+ if(type == "PRIMER"){
+ m->gobble(in);
+
+ in >> roligo;
+
+ for(int i=0;i<roligo.length();i++){
+ roligo[i] = toupper(roligo[i]);
+ if(roligo[i] == 'U') { roligo[i] = 'T'; }
+ }
+ roligo = reverseOligo(roligo);
+
+ group = "";
+
+ // get rest of line in case there is a primer name
+ while (!in.eof()) {
+ char c = in.get();
+ if (c == 10 || c == 13){ break; }
+ else if (c == 32 || c == 9){;} //space or tab
+ else { group += c; }
+ }
+
+ oligosPair newPrimer(foligo, roligo);
+
+ //check for repeat barcodes
+ string tempPair = foligo+roligo;
+ if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); }
+ else { uniquePrimers.insert(tempPair); }
+
+ if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); } }
+
+ primers[indexPrimer]=newPrimer; indexPrimer++;
+ primerNameVector.push_back(group);
+ }else if(type == "BARCODE"){
+ m->gobble(in);
+
+ in >> roligo;
+
+ for(int i=0;i<roligo.length();i++){
+ roligo[i] = toupper(roligo[i]);
+ if(roligo[i] == 'U') { roligo[i] = 'T'; }
+ }
+ roligo = reverseOligo(roligo);
+
+ oligosPair newPair(foligo, roligo);
+
+ group = "";
+ while (!in.eof()) {
+ char c = in.get();
+ if (c == 10 || c == 13){ break; }
+ else if (c == 32 || c == 9){;} //space or tab
+ else { group += c; }
+ }
+
+ if (m->debug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); }
+
+ //check for repeat barcodes
+ string tempPair = foligo+roligo;
+ if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse + " is in your oligos file already, disregarding."); m->mothurOutEndLine(); }
+ else { uniqueBarcodes.insert(tempPair); }
+
+ barcodes[indexBarcode]=newPair; indexBarcode++;
+ barcodeNameVector.push_back(group);
+ }else if(type == "LINKER"){
+ linker.push_back(foligo);
+ }else if(type == "SPACER"){
+ spacer.push_back(foligo);
+ }
+ else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are primer, barcode, linker and spacer. Ignoring " + foligo + "."); m->mothurOutEndLine(); }
+ }
+ m->gobble(in);
+ }
+ in.close();
+
+ if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
+
+ //add in potential combos
+ if(barcodeNameVector.size() == 0){
+ oligosPair temp("", "");
+ barcodes[0] = temp;
+ barcodeNameVector.push_back("");
+ }
+
+ if(primerNameVector.size() == 0){
+ oligosPair temp("", "");
+ primers[0] = temp;
+ primerNameVector.push_back("");
+ }
+
+ fastaFileNames.resize(barcodeNameVector.size());
+ for(int i=0;i<fastaFileNames.size();i++){
+ fastaFileNames[i].assign(primerNameVector.size(), "");
+ }
+ qualFileNames = fastaFileNames;
+
+ if(allFiles){
+ set<string> uniqueNames; //used to cleanup outputFileNames
+ for(map<int, oligosPair>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
+ for(map<int, oligosPair>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
+
+ string primerName = primerNameVector[itPrimer->first];
+ string barcodeName = barcodeNameVector[itBar->first];
+
+ string comboGroupName = "";
+ string fastaFileName = "";
+ string qualFileName = "";
+ string nameFileName = "";
+ string countFileName = "";
+
+ if(primerName == ""){
+ comboGroupName = barcodeNameVector[itBar->first];
+ }
+ else{
+ if(barcodeName == ""){
+ comboGroupName = primerNameVector[itPrimer->first];
+ }
+ else{
+ comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first];
+ }
+ }
+
+
+ ofstream temp;
+ fastaFileName = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + comboGroupName + ".fasta";
+ if (uniqueNames.count(fastaFileName) == 0) {
+ outputNames.push_back(fastaFileName);
+ outputTypes["fasta"].push_back(fastaFileName);
+ uniqueNames.insert(fastaFileName);
+ }
+
+ fastaFileNames[itBar->first][itPrimer->first] = fastaFileName;
+ m->openOutputFile(fastaFileName, temp); temp.close();
+
+
+ qualFileName = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + comboGroupName + ".qual";
+ if (uniqueNames.count(qualFileName) == 0) {
+ outputNames.push_back(qualFileName);
+ outputTypes["qfile"].push_back(qualFileName);
+ }
+
+ qualFileNames[itBar->first][itPrimer->first] = qualFileName;
+ m->openOutputFile(qualFileName, temp); temp.close();
+ }
+ }
+ }
+
+ bool allBlank = true;
+ for (int i = 0; i < barcodeNameVector.size(); i++) {
+ if (barcodeNameVector[i] != "") {
+ allBlank = false;
+ break;
+ }
+ }
+ for (int i = 0; i < primerNameVector.size(); i++) {
+ if (primerNameVector[i] != "") {
+ allBlank = false;
+ break;
+ }
+ }
+
+ if (allBlank) {
+ m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
+ allFiles = false;
+ return false;
+ }
+
+ return true;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "MakeContigsCommand", "getOligos");
+ exit(1);
+ }
+}
+//********************************************************************/
+string MakeContigsCommand::reverseOligo(string oligo){
+ try {
+ string reverse = "";
+
+ for(int i=oligo.length()-1;i>=0;i--){
+
+ if(oligo[i] == 'A') { reverse += 'T'; }
+ else if(oligo[i] == 'T'){ reverse += 'A'; }
+ else if(oligo[i] == 'U'){ reverse += 'A'; }
+
+ else if(oligo[i] == 'G'){ reverse += 'C'; }
+ else if(oligo[i] == 'C'){ reverse += 'G'; }
+
+ else if(oligo[i] == 'R'){ reverse += 'Y'; }
+ else if(oligo[i] == 'Y'){ reverse += 'R'; }
+
+ else if(oligo[i] == 'M'){ reverse += 'K'; }
+ else if(oligo[i] == 'K'){ reverse += 'M'; }
+
+ else if(oligo[i] == 'W'){ reverse += 'W'; }
+ else if(oligo[i] == 'S'){ reverse += 'S'; }
+
+ else if(oligo[i] == 'B'){ reverse += 'V'; }
+ else if(oligo[i] == 'V'){ reverse += 'B'; }
+
+ else if(oligo[i] == 'D'){ reverse += 'H'; }
+ else if(oligo[i] == 'H'){ reverse += 'D'; }
+
+ else { reverse += 'N'; }
+ }
+
+
+ return reverse;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "MakeContigsCommand", "reverseOligo");
+ exit(1);
+ }
+}
//**********************************************************************************************************************
#include "needlemanoverlap.hpp"
#include "blastalign.hpp"
#include "noalign.hpp"
-
+#include "trimoligos.h"
struct fastqRead {
vector<int> scores;
void help() { m->mothurOut(getHelpString()); }
private:
- bool abort;
- string outputDir, ffastqfile, rfastqfile, align;
+ bool abort, allFiles;
+ string outputDir, ffastqfile, rfastqfile, align, oligosfile;
float match, misMatch, gapOpen, gapExtend;
- int processors, longestBase, threshold;
+ int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
vector<string> outputNames;
+ map<int, oligosPair> barcodes;
+ map<int, oligosPair> primers;
+ vector<string> linker;
+ vector<string> spacer;
+ vector<string> primerNameVector;
+ vector<string> barcodeNameVector;
+
+ map<string, int> groupCounts;
+ //map<string, int> combos;
+ //map<string, int> groupToIndex;
+ //vector<string> groupVector;
+
fastqRead readFastq(ifstream&);
vector< vector<string> > readFastqFiles(int&);
bool checkReads(fastqRead&, fastqRead&);
int createProcesses(vector< vector<string> >, string, string, string);
int driver(vector<string>, string, string, string);
+ bool getOligos(vector<vector<string> >&, vector<vector<string> >&);
+ string reverseOligo(string);
};
/**************************************************************************************************/
if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
Sequence currSeq(in);
+
+ if (!dups) {//adjust name if needed
+ map<string, string>::iterator it = uniqueMap.find(currSeq.getName());
+ if (it != uniqueMap.end()) { currSeq.setName(it->second); }
+ }
+
name = currSeq.getName();
if (name != "") {
if (names.count(name) == 0) {
wroteSomething = true;
- currSeq.printSequence(out);
+ currSeq.printSequence(out);
}else { removedCount++; }
}
m->gobble(in);
m->gobble(in);
+ if (!dups) {//adjust name if needed
+ map<string, string>::iterator it = uniqueMap.find(saveName);
+ if (it != uniqueMap.end()) { name = ">" + it->second; saveName = it->second; }
+ }
+
if (names.count(saveName) == 0) {
wroteSomething = true;
wroteSomething = true;
out << validSecond[0] << '\t';
+ //we are changing the unique name in the fasta file
+ uniqueMap[firstCol] = validSecond[0];
//you know you have at least one valid second since first column is valid
for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
in >> name; //read from first column
in >> tax; //read from second column
+ if (!dups) {//adjust name if needed
+ map<string, string>::iterator it = uniqueMap.find(name);
+ if (it != uniqueMap.end()) { name = it->second; }
+ }
+
//if this name is in the accnos file
if (names.count(name) == 0) {
wroteSomething = true;
+
out << name << '\t' << tax << endl;
}else { removedCount++; }
if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
in >> name; //read from first column
+
+ if (!dups) {//adjust name if needed
+ map<string, string>::iterator it = uniqueMap.find(name);
+ if (it != uniqueMap.end()) { name = it->second; }
+ }
//if this name is in the accnos file
if (names.count(name) == 0) {
string accnosfile, fastafile, namefile, groupfile, countfile, alignfile, listfile, taxfile, qualfile, outputDir;
bool abort, dups;
vector<string> outputNames;
+ map<string, string> uniqueMap;
int readFasta();
int readName();
int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& primer) {
try {
//find group read belongs to
- TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
+ TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
int success = 1;
string trashCode = "";
else{ currentSeqsDiffs += success; }
}
- if(rbarcodes.size() != 0){
- success = trimOligos.stripRBarcode(currSeq, currQual, barcode);
- if(success > bdiffs) { trashCode += 'b'; }
- else{ currentSeqsDiffs += success; }
- }
-
if(numSpacers != 0){
success = trimOligos.stripSpacer(currSeq, currQual);
if(success > sdiffs) { trashCode += 's'; }
}
else if(type == "BARCODE"){
inOligos >> group;
-
- //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
- //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
- string temp = "";
- while (!inOligos.eof()) {
- char c = inOligos.get();
- if (c == 10 || c == 13){ break; }
- else if (c == 32 || c == 9){;} //space or tab
- else { temp += c; }
- }
- //then this is illumina data with 4 columns
- if (temp != "") {
- string reverseBarcode = reverseOligo(group); //reverse barcode
- group = temp;
-
- //check for repeat barcodes
- map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
- if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); }
-
- rbarcodes[reverseBarcode]=indexBarcode;
- }
-
//check for repeat barcodes
map<string, int>::iterator itBar = barcodes.find(oligo);
if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
barcodes[oligo]=indexBarcode; indexBarcode++;
barcodeNameVector.push_back(group);
-
}else if(type == "LINKER"){
linker.push_back(oligo);
}else if(type == "SPACER"){
int mycount, split, numFPrimers, numLinkers, numSpacers, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs;
set<string> seqNames;
map<string, int> barcodes;
- map<string, int> rbarcodes;
map<string, int> primers;
vector<string> linker, spacer, primerNameVector, barcodeNameVector, revPrimer;
vector<vector<int> > numSplitReads;
/********************************************************************/
//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
-TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, map<string, int> rbr, vector<string> r, vector<string> lk, vector<string> sp){
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
try {
m = MothurOut::getInstance();
sdiffs = s;
barcodes = br;
- rbarcodes = rbr;
primers = pr;
revPrimer = r;
linker = lk;
}
/********************************************************************/
//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
-TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<int, oligosPair> pr, map<int, oligosPair> br, vector<string> lk, vector<string> sp){
try {
m = MothurOut::getInstance();
ldiffs = l;
sdiffs = s;
- barcodes = br;
- primers = pr;
- revPrimer = r;
+ ibarcodes = br;
+ iprimers = pr;
linker = lk;
spacer = sp;
}
m->errorOut(e, "TrimOligos", "stripBarcode");
exit(1);
}
+}
+//*******************************************************************/
+int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){
+ try {
+ //look for forward barcode
+ string rawFSequence = forwardSeq.getUnaligned();
+ string rawRSequence = reverseSeq.getUnaligned();
+ int success = bdiffs + 1; //guilty until proven innocent
+
+ //can you find the forward barcode
+ for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+ string foligo = it->second.forward;
+ string roligo = it->second.reverse;
+
+ if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+ if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
+ group = it->first;
+ forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
+ reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
+ forwardQual.trimQScores(foligo.length(), -1);
+ reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((bdiffs == 0) || (success == 0)) { return success; }
+ else { //try aligning and see if you can find it
+
+ //look for forward
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (ibarcodes.size() > 0) {
+ for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+ if(it->second.forward.length() > maxLength){ maxLength = it->second.forward.length(); }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minFGroup = -1;
+ int minFPos = 0;
+
+ for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+ string oligo = it->second.forward;
+
+ if(rawFSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawFSequence.substr(0,oligo.length()+bdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minFGroup = it->first;
+ minFPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minFPos++;
+ }
+ }
+ }else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > bdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
+ else{
+ //check for reverse match
+ if (alignment != NULL) { delete alignment; }
+ maxLength = 0;
+
+ if (ibarcodes.size() > 0) {
+ for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+ if(it->second.reverse.length() > maxLength){ maxLength = it->second.reverse.length(); }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ minDiff = 1e6;
+ minCount = 1;
+ int minRGroup = -1;
+ int minRPos = 0;
+
+ for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+ string oligo = it->second.reverse;
+
+ if(rawRSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawRSequence.substr((rawRSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+ for(int i=0;i<alnLength;i++){ if(oligo[i] != '-'){ alnLength = i; break; } }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minRGroup = it->first;
+ minRPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minRPos++;
+ }
+ }
+ }else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > bdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
+ else{
+ //we have an acceptable match for the forward and reverse, but do they match?
+ if (minFGroup == minRGroup) {
+ group = minFGroup;
+ forwardSeq.setUnaligned(rawFSequence.substr(minFPos));
+ reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-minRPos)));
+ forwardQual.trimQScores(minFPos, -1);
+ reverseQual.trimQScores(-1, rawRSequence.length()-minRPos);
+ success = minDiff;
+ }else { success = bdiffs + 100; }
+ }
+ }
+
+ if (alignment != NULL) { delete alignment; }
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripIBarcode");
+ exit(1);
+ }
}
//*******************************************************************/
}
}
-//*******************************************************************/
+/*******************************************************************
int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){
try {
}
}
-//*******************************************************************/
+/*******************************************************************
int TrimOligos::stripRBarcode(Sequence& seq, int& group){
try {
#include "sequence.hpp"
#include "qualityscores.h"
+struct oligosPair {
+ string forward;
+ string reverse;
+
+ oligosPair() { forward = ""; reverse = ""; }
+ oligosPair(string f, string r) : forward(f), reverse(r) {}
+ ~oligosPair() {}
+};
class TrimOligos {
public:
TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
- TrimOligos(int,int, int, int, map<string, int>, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
- TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
+ TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
+ TrimOligos(int,int, int, int, map<int, oligosPair>, map<int, oligosPair>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, linker, spacer
~TrimOligos();
int stripBarcode(Sequence&, int&);
int stripBarcode(Sequence&, QualityScores&, int&);
-
- int stripRBarcode(Sequence&, int&);
- int stripRBarcode(Sequence&, QualityScores&, int&);
-
+ int stripBarcode(Sequence&, Sequence&, QualityScores&, QualityScores&, int&);
+
int stripForward(Sequence&, int&);
int stripForward(Sequence&, QualityScores&, int&, bool);
+ int stripForward(Sequence&, Sequence&, QualityScores&, QualityScores&, int&);
bool stripReverse(Sequence&);
bool stripReverse(Sequence&, QualityScores&);
int pdiffs, bdiffs, ldiffs, sdiffs;
map<string, int> barcodes;
- map<string, int> rbarcodes;
map<string, int> primers;
vector<string> revPrimer;
vector<string> linker;
vector<string> spacer;
+ map<int, oligosPair> ibarcodes;
+ map<int, oligosPair> iprimers;
MothurOut* m;
int count = 0;
bool moreSeqs = 1;
- TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
+ TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
while (moreSeqs) {
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
-
- if(rbarcodes.size() != 0){
- success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
- if(success > bdiffs) { trashCode += 'b'; }
- else{ currentSeqsDiffs += success; }
- }
if(numSpacers != 0){
success = trimOligos.stripSpacer(currSeq, currQual);
tempPrimerQualFileNames,
tempNameFileNames,
lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
- pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
+ pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
}
else if(type == "BARCODE"){
inOligos >> group;
-
- //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
- //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
- string temp = "";
- while (!inOligos.eof()) {
- char c = inOligos.get();
- if (c == 10 || c == 13){ break; }
- else if (c == 32 || c == 9){;} //space or tab
- else { temp += c; }
- }
- //then this is illumina data with 4 columns
- if (temp != "") {
-
- for(int i=0;i<group.length();i++){
- group[i] = toupper(group[i]);
- if(group[i] == 'U') { group[i] = 'T'; }
- }
-
- if (m->debug) { m->mothurOut("[DEBUG]: reading reverse " + group + ", and group = " + temp + ".\n"); }
-
- string reverseBarcode = reverseOligo(group); //reverse barcode
- //check for repeat barcodes
- map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
- if (itBar != rbarcodes.end()) { m->mothurOut("reverse barcode " + group + " is in your oligos file already."); m->mothurOutEndLine(); }
-
- group = temp;
- rbarcodes[reverseBarcode]=indexBarcode;
- }else { if (m->debug) { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); } }
-
//check for repeat barcodes
map<string, int>::iterator itBar = barcodes.find(oligo);
if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
-
+
barcodes[oligo]=indexBarcode; indexBarcode++;
barcodeNameVector.push_back(group);
}else if(type == "LINKER"){
vector<string> revPrimer, outputNames;
set<string> filesToRemove;
map<string, int> barcodes;
- map<string, int> rbarcodes;
vector<string> groupVector;
map<string, int> primers;
vector<string> linker;
double qRollAverage, qThreshold, qWindowAverage, qAverage;
vector<string> revPrimer;
map<string, int> barcodes;
- map<string, int> rbarcodes;
map<string, int> primers;
map<string, int> nameCount;
vector<string> linker;
trimData(){}
trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
- int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa,
+ int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
sdiffs = sd;
tdiffs = td;
barcodes = bar;
- rbarcodes = rbar;
primers = pri; numFPrimers = primers.size();
revPrimer = revP; numRPrimers = revPrimer.size();
linker = li; numLinkers = linker.size();
}
- TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
+ TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
pDataArray->count = pDataArray->lineEnd;
for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
else{ currentSeqsDiffs += success; }
}
- if(pDataArray->rbarcodes.size() != 0){
- success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
- if(success > pDataArray->bdiffs) { trashCode += 'b'; }
- else{ currentSeqsDiffs += success; }
- }
-
if(pDataArray->numSpacers != 0){
success = trimOligos.stripSpacer(currSeq, currQual);
if(success > pDataArray->sdiffs) { trashCode += 's'; }