#include "needlemanoverlap.hpp"
#include "trimoligos.h"
+
//**********************************************************************************************************************
vector<string> TrimSeqsCommand::setParameters(){
try {
- CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
- CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
- CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
- CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
- CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
- CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
- CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
- CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
- CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
- CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
- CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
- CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
- CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
- CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
- CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
- CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
- CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
- CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
- CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
- CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
- CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
- CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
- CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
- CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
- CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
- CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
- CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+ CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
+ CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
+ CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false,true); parameters.push_back(pqfile);
+ CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname);
+ CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount);
+ CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(pflip);
+ CommandParameter preorient("checkorient", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(preorient);
+ CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig);
+ CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxhomop);
+ CommandParameter pminlength("minlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pminlength);
+ CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxlength);
+ CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
+ CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
+ CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs);
+ CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs);
+ CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
+ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles);
+ CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepforward);
+ CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pqtrim);
+ CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqthreshold);
+ CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqaverage);
+ CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(prollaverage);
+ CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqwindowaverage);
+ CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pqstepsize);
+ CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "","",false,false); parameters.push_back(pqwindowsize);
+ CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pkeepfirst);
+ CommandParameter premovelast("removelast", "Number", "", "0", "", "", "","",false,false); parameters.push_back(premovelast);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+ CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
string helpString = "";
helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
- helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
+ helpString += "The trim.seqs command parameters are fasta, name, count, flip, checkorient, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
helpString += "The fasta parameter is required.\n";
helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
+ helpString += "The checkorient parameter will check the reverse compliment of the sequence if the barcodes and primers cannot be found in the forward. The default is false.\n";
helpString += "The oligos parameter allows you to provide an oligos file.\n";
helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
+ helpString += "The count parameter allows you to provide a count file with your fasta file.\n";
helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
}
}
//**********************************************************************************************************************
-string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
- try {
- string outputFileName = "";
- map<string, vector<string> >::iterator it;
+string TrimSeqsCommand::getOutputPattern(string type) { // Returns the output-file naming pattern string for the given output type ("qfile", "fasta", "group", "name" or "count").
+ try {
+ string pattern = ""; // stays empty when `type` matches none of the branches below
- //is this a type this command creates
- it = outputTypes.find(type);
- if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
- else {
- if (type == "qfile") { outputFileName = "qual"; }
- else if (type == "fasta") { outputFileName = "fasta"; }
- else if (type == "group") { outputFileName = "groups"; }
- else if (type == "name") { outputFileName = "names"; }
- else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
- }
- return outputFileName;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag");
- exit(1);
- }
+ if (type == "qfile") { pattern = "[filename],[tag],qual"; } // comma-separated pieces; "[filename]" and "[tag]" are placeholder keys (presumably substituted by getOutputFileName - confirm there)
+ else if (type == "fasta") { pattern = "[filename],[tag],fasta"; }
+ else if (type == "group") { pattern = "[filename],groups"; } // the only pattern without a "[tag]" piece
+ else if (type == "name") { pattern = "[filename],[tag],names"; }
+ else if (type == "count") { pattern = "[filename],[tag],count_table-[filename],count_table"; } // two forms joined by '-', with and without "[tag]"; NOTE(review): how '-' is interpreted is decided by the consumer of this string - confirm
+ else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } // unknown type: log the error, raise the control_pressed flag, and fall through to return ""
+
+ return pattern;
+ }
+ catch(exception& e) { // any std::exception is reported through m->errorOut with this class/method name, then the process exits with status 1
+ m->errorOut(e, "TrimSeqsCommand", "getOutputPattern");
+ exit(1);
+ }
}
-
-
//**********************************************************************************************************************
TrimSeqsCommand::TrimSeqsCommand(){
try {
- abort = true; calledHelp = true;
+ abort = true; calledHelp = true;
setParameters();
vector<string> tempOutNames;
outputTypes["fasta"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
outputTypes["group"] = tempOutNames;
outputTypes["name"] = tempOutNames;
+ outputTypes["count"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
outputTypes["qfile"] = tempOutNames;
outputTypes["group"] = tempOutNames;
outputTypes["name"] = tempOutNames;
+ outputTypes["count"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["name"] = inputDir + it->second; }
}
+
+ it = parameters.find("count");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["count"] = inputDir + it->second; }
+ }
}
if (temp == "not found") { nameFile = ""; }
else if(temp == "not open") { nameFile = ""; abort = true; }
else { nameFile = temp; m->setNameFile(nameFile); }
+
+ countfile = validParameter.validFile(parameters, "count", true);
+ if (countfile == "not open") { abort = true; countfile = ""; }
+ else if (countfile == "not found") { countfile = ""; }
+ else { m->setCountTableFile(countfile); }
+
+ if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
m->mothurConvert(temp, qThreshold);
temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
keepforward = m->isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "checkorient", false); if (temp == "not found") { temp = "F"; }
+ reorient = m->isTrue(temp);
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
abort = true;
}
- if (nameFile == "") {
- vector<string> files; files.push_back(fastaFile);
- parser.getNameFile(files);
- }
+ if (countfile == "") {
+ if (nameFile == "") {
+ vector<string> files; files.push_back(fastaFile);
+ parser.getNameFile(files);
+ }
+ }
}
}
if (abort == true) { if (calledHelp) { return 0; } return 2; }
+ pairedOligos = false;
numFPrimers = 0; //this needs to be initialized
numRPrimers = 0;
numSpacers = 0;
vector<vector<string> > qualFileNames;
vector<vector<string> > nameFileNames;
- string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta");
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
+ variables["[tag]"] = "trim";
+ string trimSeqFile = getOutputFileName("fasta",variables);
+ string trimQualFile = getOutputFileName("qfile",variables);
outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
-
- string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta");
+
+ variables["[tag]"] = "scrap";
+ string scrapSeqFile = getOutputFileName("fasta",variables);
+ string scrapQualFile = getOutputFileName("qfile",variables);
outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
- string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile");
- string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile");
-
if (qFileName != "") {
outputNames.push_back(trimQualFile);
outputNames.push_back(scrapQualFile);
outputTypes["qfile"].push_back(scrapQualFile);
}
- string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name");
- string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name");
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
+ variables["[tag]"] = "trim";
+ string trimNameFile = getOutputFileName("name",variables);
+ variables["[tag]"] = "scrap";
+ string scrapNameFile = getOutputFileName("name",variables);
if (nameFile != "") {
m->readNames(nameFile, nameMap);
outputTypes["name"].push_back(trimNameFile);
outputTypes["name"].push_back(scrapNameFile);
}
+
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
+ variables["[tag]"] = "trim";
+ string trimCountFile = getOutputFileName("count",variables);
+ variables["[tag]"] = "scrap";
+ string scrapCountFile = getOutputFileName("count",variables);
+
+ if (countfile != "") {
+ CountTable ct;
+ ct.readTable(countfile, true);
+ nameCount = ct.getNameMap();
+ outputNames.push_back(trimCountFile);
+ outputNames.push_back(scrapCountFile);
+ outputTypes["count"].push_back(trimCountFile);
+ outputTypes["count"].push_back(scrapCountFile);
+ }
+
if (m->control_pressed) { return 0; }
string outputGroupFileName;
if(oligoFile != ""){
createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
- if (createGroup) {
- outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group");
+ if ((createGroup) && (countfile == "")){
+ map<string, string> myvariables;
+ myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
+ outputGroupFileName = getOutputFileName("group",myvariables);
outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
}
}
-
+
+ if (!pairedOligos) { if (reorient) { m->mothurOut("[WARNING]: You cannot use reorient without paired barcodes or primers, skipping."); m->mothurOutEndLine(); reorient = false; } }
+
+ if (m->control_pressed) { return 0; }
+
//fills lines and qlines
setLines(fastaFile, qFileName);
if(processors == 1){
- driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
+ driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
}else{
- createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
+ createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
}
for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
outputNames = outputNames2;
- for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
- ifstream in;
- m->openInputFile(it->first, in);
-
- ofstream out;
- string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group");
- outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
- m->openOutputFile(thisGroupName, out);
-
- while (!in.eof()){
- if (m->control_pressed) { break; }
-
- Sequence currSeq(in); m->gobble(in);
- out << currSeq.getName() << '\t' << it->second << endl;
+ for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
+ ifstream in;
+ m->openInputFile(it->first, in);
+
+ ofstream out;
+ map<string, string> myvariables;
+ myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(it->first));
+ string thisGroupName = "";
+ if (countfile == "") { thisGroupName = getOutputFileName("group",myvariables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
+ else { thisGroupName = getOutputFileName("count",myvariables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
+ m->openOutputFile(thisGroupName, out);
+
+ if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; }
+
+ while (!in.eof()){
+ if (m->control_pressed) { break; }
- if (nameFile != "") {
- map<string, string>::iterator itName = nameMap.find(currSeq.getName());
- if (itName != nameMap.end()) {
- vector<string> thisSeqsNames;
- m->splitAtChar(itName->second, thisSeqsNames, ',');
- for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
- out << thisSeqsNames[k] << '\t' << it->second << endl;
- }
- }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ Sequence currSeq(in); m->gobble(in);
+ if (countfile == "") {
+ out << currSeq.getName() << '\t' << it->second << endl;
+
+ if (nameFile != "") {
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) {
+ vector<string> thisSeqsNames;
+ m->splitAtChar(itName->second, thisSeqsNames, ',');
+ for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+ out << thisSeqsNames[k] << '\t' << it->second << endl;
+ }
+ }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ }
+ }else {
+ map<string, int>::iterator itTotalReps = nameCount.find(currSeq.getName());
+ if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; }
+ else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
}
- }
- in.close();
- out.close();
- }
+ }
+ in.close();
+ out.close();
+ }
+
+ if (countfile != "") { //create countfile with group info included
+ CountTable* ct = new CountTable();
+ ct->readTable(trimCountFile, true);
+ map<string, int> justTrimmedNames = ct->getNameMap();
+ delete ct;
+
+ CountTable newCt;
+ for (map<string, int>::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); }
+ vector<int> tempCounts; tempCounts.resize(groupCounts.size(), 0);
+ for (map<string, int>::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) {
+ newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance
+ map<string, string>::iterator it2 = groupMap.find(itNames->first);
+ if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); }
+ else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ }
+ newCt.printTable(trimCountFile);
+ }
}
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
}
+
+ itTypes = outputTypes.find("count");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+ }
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
}
/**************************************************************************************/
-
-int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
+int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
try {
m->openOutputFile(scrapNFileName, scrapNameFile);
}
+ ofstream trimCountFile;
+ ofstream scrapCountFile;
+ if(countfile != ""){
+ m->openOutputFile(trimCFileName, trimCountFile);
+ m->openOutputFile(scrapCFileName, scrapCountFile);
+ if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
+ }
ofstream outGroupsFile;
- if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
+ if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); }
if(allFiles){
for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
int count = 0;
bool moreSeqs = 1;
- TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
-
+ int numBarcodes = barcodes.size();
+ TrimOligos* trimOligos = NULL;
+ if (pairedOligos) { trimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, pairedPrimers, pairedBarcodes); numBarcodes = pairedBarcodes.size(); }
+ else { trimOligos = new TrimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); }
+
+ TrimOligos* rtrimOligos = NULL;
+ if (reorient) {
+ //create reoriented primer and barcode pairs
+ map<int, oligosPair> rpairedPrimers, rpairedBarcodes;
+ for (map<int, oligosPair>::iterator it = pairedPrimers.begin(); it != pairedPrimers.end(); it++) {
+ oligosPair tempPair(reverseOligo((it->second).reverse), (reverseOligo((it->second).forward))); //reversePrimer, rc ForwardPrimer
+ rpairedPrimers[it->first] = tempPair;
+ //cout << reverseOligo((it->second).reverse) << '\t' << (reverseOligo((it->second).forward)) << '\t' << primerNameVector[it->first] << endl;
+ }
+ for (map<int, oligosPair>::iterator it = pairedBarcodes.begin(); it != pairedBarcodes.end(); it++) {
+ oligosPair tempPair(reverseOligo((it->second).reverse), (reverseOligo((it->second).forward))); //reverseBarcode, rc ForwardBarcode
+ rpairedBarcodes[it->first] = tempPair;
+ //cout << reverseOligo((it->second).reverse) << '\t' << (reverseOligo((it->second).forward)) << '\t' << barcodeNameVector[it->first] << endl;
+ }
+ rtrimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size();
+ }
+
while (moreSeqs) {
- if (m->control_pressed) {
+ if (m->control_pressed) {
+ delete trimOligos; if (reorient) { delete rtrimOligos; }
inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
- if (createGroup) { outGroupsFile.close(); }
-
- if(qFileName != ""){
- qFile.close();
- }
- for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
-
- return 0;
+ if ((createGroup) && (countfile == "")) { outGroupsFile.close(); }
+ if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
+ if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
+ if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;
}
int success = 1;
Sequence currSeq(inFASTA); m->gobble(inFASTA);
//cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
- QualityScores currQual;
+ Sequence savedSeq(currSeq.getName(), currSeq.getAligned());
+
+ QualityScores currQual; QualityScores savedQual;
if(qFileName != ""){
currQual = QualityScores(qFile); m->gobble(qFile);
+ savedQual.setName(currQual.getName()); savedQual.setScores(currQual.getScores());
+ //cout << currQual.getName() << endl;
}
-
+
string origSeq = currSeq.getUnaligned();
if (origSeq != "") {
int primerIndex = 0;
if(numLinkers != 0){
- success = trimOligos.stripLinker(currSeq, currQual);
+ success = trimOligos->stripLinker(currSeq, currQual);
if(success > ldiffs) { trashCode += 'k'; }
else{ currentSeqsDiffs += success; }
}
- if(barcodes.size() != 0){
- success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
- if(success > bdiffs) { trashCode += 'b'; }
- else{ currentSeqsDiffs += success; }
- }
-
- if(rbarcodes.size() != 0){
- success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
- if(success > bdiffs) { trashCode += 'b'; }
+ if(numBarcodes != 0){
+ success = trimOligos->stripBarcode(currSeq, currQual, barcodeIndex);
+ if(success > bdiffs) {
+ trashCode += 'b';
+ }
else{ currentSeqsDiffs += success; }
}
if(numSpacers != 0){
- success = trimOligos.stripSpacer(currSeq, currQual);
+ success = trimOligos->stripSpacer(currSeq, currQual);
if(success > sdiffs) { trashCode += 's'; }
else{ currentSeqsDiffs += success; }
}
if(numFPrimers != 0){
- success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
- if(success > pdiffs) { trashCode += 'f'; }
+ success = trimOligos->stripForward(currSeq, currQual, primerIndex, keepforward);
+ if(success > pdiffs) {
+ trashCode += 'f';
+ }
else{ currentSeqsDiffs += success; }
}
if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
if(numRPrimers != 0){
- success = trimOligos.stripReverse(currSeq, currQual);
+ success = trimOligos->stripReverse(currSeq, currQual);
if(!success) { trashCode += 'r'; }
}
-
+
+ if (reorient && (trashCode != "")) { //if you failed and want to check the reverse
+ int thisSuccess = 0;
+ string thisTrashCode = "";
+ int thisCurrentSeqsDiffs = 0;
+
+ int thisBarcodeIndex = 0;
+ int thisPrimerIndex = 0;
+
+ if(numBarcodes != 0){
+ thisSuccess = rtrimOligos->stripBarcode(savedSeq, savedQual, thisBarcodeIndex);
+ if(thisSuccess > bdiffs) { thisTrashCode += "b"; }
+ else{ thisCurrentSeqsDiffs += thisSuccess; }
+ }
+
+ if(numFPrimers != 0){
+ thisSuccess = rtrimOligos->stripForward(savedSeq, savedQual, thisPrimerIndex, keepforward);
+ if(thisSuccess > pdiffs) { thisTrashCode += "f"; }
+ else{ thisCurrentSeqsDiffs += thisSuccess; }
+ }
+
+ if (thisCurrentSeqsDiffs > tdiffs) { thisTrashCode += 't'; }
+
+ if (thisTrashCode == "") {
+ trashCode = thisTrashCode;
+ success = thisSuccess;
+ currentSeqsDiffs = thisCurrentSeqsDiffs;
+ barcodeIndex = thisBarcodeIndex;
+ primerIndex = thisPrimerIndex;
+ savedSeq.reverseComplement();
+ currSeq.setAligned(savedSeq.getAligned());
+ if(qFileName != ""){
+ savedQual.flipQScores();
+ currQual.setScores(savedQual.getScores());
+ }
+ }else { trashCode += "(" + thisTrashCode + ")"; }
+ }
+
if(keepFirst != 0){
success = keepFirstTrim(currSeq, currQual);
}
}
}
+ if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
+
if(trashCode.length() == 0){
- currSeq.setAligned(currSeq.getUnaligned());
- currSeq.printSequence(trimFASTAFile);
-
- if(qFileName != ""){
- currQual.printQScores(trimQualFile);
- }
-
-
- if(nameFile != ""){
- map<string, string>::iterator itName = nameMap.find(currSeq.getName());
- if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
- else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
- }
-
- if (createGroup) {
- if(barcodes.size() != 0){
- string thisGroup = barcodeNameVector[barcodeIndex];
- if (primers.size() != 0) {
+ string thisGroup = "";
+ if (createGroup) {
+ if(numBarcodes != 0){
+ thisGroup = barcodeNameVector[barcodeIndex];
+ if (numFPrimers != 0) {
if (primerNameVector[primerIndex] != "") {
if(thisGroup != "") {
thisGroup += "." + primerNameVector[primerIndex];
}
}
}
-
- outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
-
- int numRedundants = 0;
- if (nameFile != "") {
- map<string, string>::iterator itName = nameMap.find(currSeq.getName());
- if (itName != nameMap.end()) {
- vector<string> thisSeqsNames;
- m->splitAtChar(itName->second, thisSeqsNames, ',');
- numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
- for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
- outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
- }
- }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
- }
-
- map<string, int>::iterator it = groupCounts.find(thisGroup);
- if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
- else { groupCounts[it->first] += (1 + numRedundants); }
+ }
+ }
+
+ int pos = thisGroup.find("ignore");
+ if (pos == string::npos) {
+ currSeq.setAligned(currSeq.getUnaligned());
+ currSeq.printSequence(trimFASTAFile);
+
+ if(qFileName != ""){
+ currQual.printQScores(trimQualFile);
+ }
+
+
+ if(nameFile != ""){
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
+ else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ }
+
+ int numRedundants = 0;
+ if (countfile != "") {
+ map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
+ if (itCount != nameCount.end()) {
+ trimCountFile << itCount->first << '\t' << itCount->second << endl;
+ numRedundants = itCount->second-1;
+ }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
+ }
+
+ if (createGroup) {
+ if(numBarcodes != 0){
+
+ if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
+
+ if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
+ else { groupMap[currSeq.getName()] = thisGroup; }
+
+ if (nameFile != "") {
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) {
+ vector<string> thisSeqsNames;
+ m->splitAtChar(itName->second, thisSeqsNames, ',');
+ numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
+ for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+ outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
+ }
+ }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ }
+
+ map<string, int>::iterator it = groupCounts.find(thisGroup);
+ if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
+ else { groupCounts[it->first] += (1 + numRedundants); }
- }
- }
-
- if(allFiles){
- ofstream output;
- m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
- currSeq.printSequence(output);
- output.close();
-
- if(qFileName != ""){
- m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
- currQual.printQScores(output);
- output.close();
- }
-
- if(nameFile != ""){
- map<string, string>::iterator itName = nameMap.find(currSeq.getName());
- if (itName != nameMap.end()) {
- m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
- output << itName->first << '\t' << itName->second << endl;
- output.close();
- }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
- }
- }
+ }
+ }
+
+ if(allFiles){
+ ofstream output;
+ m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
+ currSeq.printSequence(output);
+ output.close();
+
+ if(qFileName != ""){
+ m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
+ currQual.printQScores(output);
+ output.close();
+ }
+
+ if(nameFile != ""){
+ map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+ if (itName != nameMap.end()) {
+ m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
+ output << itName->first << '\t' << itName->second << endl;
+ output.close();
+ }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
+ }
+ }
+ }
}
else{
if(nameFile != ""){ //needs to be before the currSeq name is changed
if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
}
+ if (countfile != "") { //scrapped sequences go to the scrap count file, mirroring the scrap name file handling above
+ map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
+ if (itCount != nameCount.end()) {
+ scrapCountFile << itCount->first << '\t' << itCount->second << endl;
+ }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
+ }
+
currSeq.setName(currSeq.getName() + '|' + trashCode);
currSeq.setUnaligned(origSeq);
currSeq.setAligned(origSeq);
//report progress
if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
-
+ delete trimOligos;
+ if (reorient) { delete rtrimOligos; }
inFASTA.close();
trimFASTAFile.close();
scrapFASTAFile.close();
if (createGroup) { outGroupsFile.close(); }
if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
+ if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
return count;
}
/**************************************************************************************************/
-int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
+int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
try {
int process = 1;
(scrapQualFileName + toString(getpid()) + ".temp"),
(trimNameFileName + toString(getpid()) + ".temp"),
(scrapNameFileName + toString(getpid()) + ".temp"),
+ (trimCountFileName + toString(getpid()) + ".temp"),
+ (scrapCountFileName + toString(getpid()) + ".temp"),
(groupFile + toString(getpid()) + ".temp"),
tempFASTAFileNames,
tempPrimerQualFileNames,
tempNameFileNames,
lines[process],
qLines[process]);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
//pass groupCounts to parent
if(createGroup){
for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
out << it->first << '\t' << it->second << endl;
}
+
+ out << groupMap.size() << endl;
+ for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
+ out << it->first << '\t' << it->second << endl;
+ }
out.close();
}
exit(0);
m->openOutputFile(trimNameFileName, temp); temp.close();
m->openOutputFile(scrapNameFileName, temp); temp.close();
}
+ if (countfile != "") {
+ m->openOutputFile(trimCountFileName, temp); temp.close();
+ m->openOutputFile(scrapCountFileName, temp); temp.close();
+ }
- driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
+ driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
//force parent to wait until all the processes are done
for (int i=0;i<processIDS.size();i++) {
HANDLE hThreadArray[processors-1];
//Create processor worker threads.
- for( int i=0; i<processors-1; i++){
+ for( int h=0; h<processors-1; h++){
string extension = "";
- if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
+ if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
vector<vector<string> > tempFASTAFileNames = fastaFileNames;
vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
vector<vector<string> > tempNameFileNames = nameFileNames;
trimData* tempTrim = new trimData(filename,
- qFileName, nameFile,
+ qFileName, nameFile, countfile,
(trimFASTAFileName+extension),
(scrapFASTAFileName+extension),
(trimQualFileName+extension),
(scrapQualFileName+extension),
(trimNameFileName+extension),
(scrapNameFileName+extension),
+ (trimCountFileName+extension),
+ (scrapCountFileName+extension),
(groupFile+extension),
tempFASTAFileNames,
tempPrimerQualFileNames,
tempNameFileNames,
- lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
- pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
+ lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m,
+ pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, pairedBarcodes, pairedPrimers, pairedOligos,
primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
- minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
+ minLength, maxAmbig, maxHomoP, maxLength, flip, reorient, nameMap, nameCount);
pDataArray.push_back(tempTrim);
- hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
}
//parent do my part
m->openOutputFile(trimNameFileName, temp); temp.close();
m->openOutputFile(scrapNameFileName, temp); temp.close();
}
+ vector<vector<string> > tempFASTAFileNames = fastaFileNames;
+ vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
+ vector<vector<string> > tempNameFileNames = nameFileNames;
+ if(allFiles){
+ ofstream temp;
+ string extension = toString(processors-1) + ".temp";
+ for(int i=0;i<tempFASTAFileNames.size();i++){
+ for(int j=0;j<tempFASTAFileNames[i].size();j++){
+ if (tempFASTAFileNames[i][j] != "") {
+ tempFASTAFileNames[i][j] += extension;
+ m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
+
+ if(qFileName != ""){
+ tempPrimerQualFileNames[i][j] += extension;
+ m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
+ }
+ if(nameFile != ""){
+ tempNameFileNames[i][j] += extension;
+ m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
+ }
+ }
+ }
+ }
+ }
- driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
+ driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]);
processIDS.push_back(processors-1);
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
+ if (pDataArray[i]->count != pDataArray[i]->lineEnd) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->lineEnd) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
map<string, int>::iterator it2 = groupCounts.find(it->first);
if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
else { groupCounts[it->first] += it->second; }
}
+ for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
+ map<string, string>::iterator it2 = groupMap.find(it->first);
+ if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
+ else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
+ }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
}
+
+ if(countfile != ""){
+ m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName);
+ m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp"));
+ m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName);
+ m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp"));
+ }
- if(createGroup){
+ if((createGroup)&&(countfile == "")){
m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
}
in >> tempNum; m->gobble(in);
if (tempNum != 0) {
- while (!in.eof()) {
- in >> group >> tempNum; m->gobble(in);
+ for (int i = 0; i < tempNum; i++) {
+ int groupNum;
+ in >> group >> groupNum; m->gobble(in);
map<string, int>::iterator it = groupCounts.find(group);
- if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
- else { groupCounts[it->first] += tempNum; }
+ if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
+ else { groupCounts[it->first] += groupNum; }
+ }
+ }
+ in >> tempNum; m->gobble(in);
+ if (tempNum != 0) {
+ for (int i = 0; i < tempNum; i++) {
+ string group, seqName;
+ in >> seqName >> group; m->gobble(in);
+
+ map<string, string>::iterator it = groupMap.find(seqName);
+ if (it == groupMap.end()) { groupMap[seqName] = group; }
+ else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
}
}
+
in.close(); m->mothurRemove(tempFile);
}
#endif
string sname = ""; nameStream >> sname;
sname = sname.substr(1);
+ m->checkName(sname);
+
map<string, int>::iterator it = firstSeqNames.find(sname);
if(it != firstSeqNames.end()) { //this is the start of a new chunk
}
for (int i = 0; i < (fastaFilePos.size()-1); i++) {
+ if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
}
ofstream test;
- string type, oligo, group;
+ string type, oligo, roligo, group;
+ bool hasPrimer = false; bool hasPairedBarcodes = false;
int indexPrimer = 0;
int indexBarcode = 0;
+ int indexPairedPrimer = 0;
+ int indexPairedBarcode = 0;
+ set<string> uniquePrimers;
+ set<string> uniqueBarcodes;
while(!inOligos.eof()){
inOligos >> type;
-
+
+ if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
+
if(type[0] == '#'){
while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
m->gobble(inOligos);
for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
inOligos >> oligo;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
for(int i=0;i<oligo.length();i++){
oligo[i] = toupper(oligo[i]);
// get rest of line in case there is a primer name
while (!inOligos.eof()) {
char c = inOligos.get();
- if (c == 10 || c == 13){ break; }
+ if (c == 10 || c == 13 || c == -1){ break; }
else if (c == 32 || c == 9){;} //space or tab
else { group += c; }
}
map<string, int>::iterator itPrime = primers.find(oligo);
if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
+ if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } }
+
primers[oligo]=indexPrimer; indexPrimer++;
primerNameVector.push_back(group);
}
+ else if (type == "PRIMER"){
+ m->gobble(inOligos);
+
+ inOligos >> roligo;
+
+ for(int i=0;i<roligo.length();i++){
+ roligo[i] = toupper(roligo[i]);
+ if(roligo[i] == 'U') { roligo[i] = 'T'; }
+ }
+ roligo = reverseOligo(roligo);
+
+ group = "";
+
+ // get rest of line in case there is a primer name
+ while (!inOligos.eof()) {
+ char c = inOligos.get();
+ if (c == 10 || c == 13 || c == -1){ break; }
+ else if (c == 32 || c == 9){;} //space or tab
+ else { group += c; }
+ }
+
+ oligosPair newPrimer(oligo, roligo);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); }
+
+ //check for repeat primer pairs: the forward and reverse oligos are concatenated into one lookup key
+ string tempPair = oligo+roligo;
+ if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); }
+ else { uniquePrimers.insert(tempPair); }
+
+ if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); } }
+
+ pairedPrimers[indexPairedPrimer]=newPrimer; indexPairedPrimer++;
+ primerNameVector.push_back(group);
+ hasPrimer = true;
+ }
else if(type == "REVERSE"){
//Sequence oligoRC("reverse", oligo);
//oligoRC.reverseComplement();
//barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
//or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
- string temp = "";
- while (!inOligos.eof()) {
- char c = inOligos.get();
- if (c == 10 || c == 13){ break; }
+
+ string temp = "";
+ while (!inOligos.eof()) {
+ char c = inOligos.get();
+ if (c == 10 || c == 13 || c == -1){ break; }
else if (c == 32 || c == 9){;} //space or tab
else { temp += c; }
- }
+ }
//then this is illumina data with 4 columns
- if (temp != "") {
- string reverseBarcode = reverseOligo(group); //reverse barcode
+ if (temp != "") {
+ hasPairedBarcodes = true;
+ string reverseBarcode = group; //reverseOligo(group); //reverse barcode
group = temp;
+ for(int i=0;i<reverseBarcode.length();i++){
+ reverseBarcode[i] = toupper(reverseBarcode[i]);
+ if(reverseBarcode[i] == 'U') { reverseBarcode[i] = 'T'; }
+ }
+
+ reverseBarcode = reverseOligo(reverseBarcode);
+ oligosPair newPair(oligo, reverseBarcode);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); }
+
//check for repeat barcodes
- map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
- if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); }
-
- rbarcodes[reverseBarcode]=indexBarcode;
- }
+ string tempPair = oligo+reverseBarcode;
+ if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse + " is in your oligos file already, disregarding."); m->mothurOutEndLine(); }
+ else { uniqueBarcodes.insert(tempPair); }
- //check for repeat barcodes
- map<string, int>::iterator itBar = barcodes.find(oligo);
- if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
-
- barcodes[oligo]=indexBarcode; indexBarcode++;
- barcodeNameVector.push_back(group);
+ pairedBarcodes[indexPairedBarcode]=newPair; indexPairedBarcode++;
+ barcodeNameVector.push_back(group);
+ }else {
+ //check for repeat barcodes
+ map<string, int>::iterator itBar = barcodes.find(oligo);
+ if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
+
+ barcodes[oligo]=indexBarcode; indexBarcode++;
+ barcodeNameVector.push_back(group);
+ }
}else if(type == "LINKER"){
linker.push_back(oligo);
}else if(type == "SPACER"){
spacer.push_back(oligo);
}
- else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
+ else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
}
m->gobble(inOligos);
}
inOligos.close();
+ if (hasPairedBarcodes || hasPrimer) {
+ pairedOligos = true;
+ if ((primers.size() != 0) || (barcodes.size() != 0) || (linker.size() != 0) || (spacer.size() != 0) || (revPrimer.size() != 0)) { m->control_pressed = true; m->mothurOut("[ERROR]: cannot mix paired primers and barcodes with non paired or linkers and spacers, quitting."); m->mothurOutEndLine(); return 0; }
+ }else if (reorient) { m->mothurOut("[Warning]: cannot use checkorient without paired barcodes or primers, ignoring.\n"); m->mothurOutEndLine(); reorient = false; }
+
//with no barcode names and no named first primer there is nothing to split the output by;
//primerNameVector can be empty here, so check its size before reading element 0
if(barcodeNameVector.size() == 0 && (primerNameVector.size() == 0 || primerNameVector[0] == "")){ allFiles = 0; }
-
+
//add in potential combos
if(barcodeNameVector.size() == 0){
barcodes[""] = 0;
if(allFiles){
set<string> uniqueNames; //used to cleanup outputFileNames
- for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
- for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
-
- string primerName = primerNameVector[itPrimer->second];
- string barcodeName = barcodeNameVector[itBar->second];
-
- string comboGroupName = "";
- string fastaFileName = "";
- string qualFileName = "";
- string nameFileName = "";
-
- if(primerName == ""){
- comboGroupName = barcodeNameVector[itBar->second];
- }
- else{
- if(barcodeName == ""){
- comboGroupName = primerNameVector[itPrimer->second];
- }
- else{
- comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
- }
- }
-
-
- ofstream temp;
- fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
- if (uniqueNames.count(fastaFileName) == 0) {
- outputNames.push_back(fastaFileName);
- outputTypes["fasta"].push_back(fastaFileName);
- uniqueNames.insert(fastaFileName);
- }
-
- fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
- m->openOutputFile(fastaFileName, temp); temp.close();
-
- if(qFileName != ""){
- qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
- if (uniqueNames.count(qualFileName) == 0) {
- outputNames.push_back(qualFileName);
- outputTypes["qfile"].push_back(qualFileName);
- }
-
- qualFileNames[itBar->second][itPrimer->second] = qualFileName;
- m->openOutputFile(qualFileName, temp); temp.close();
- }
-
- if(nameFile != ""){
- nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
- if (uniqueNames.count(nameFileName) == 0) {
- outputNames.push_back(nameFileName);
- outputTypes["name"].push_back(nameFileName);
- }
-
- nameFileNames[itBar->second][itPrimer->second] = nameFileName;
- m->openOutputFile(nameFileName, temp); temp.close();
- }
-
- }
- }
+ if (pairedOligos) {
+ for(map<int, oligosPair>::iterator itBar = pairedBarcodes.begin();itBar != pairedBarcodes.end();itBar++){
+ for(map<int, oligosPair>::iterator itPrimer = pairedPrimers.begin();itPrimer != pairedPrimers.end(); itPrimer++){
+
+ string primerName = primerNameVector[itPrimer->first];
+ string barcodeName = barcodeNameVector[itBar->first];
+
+ if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing
+ else {
+ string comboGroupName = "";
+ string fastaFileName = "";
+ string qualFileName = "";
+ string nameFileName = "";
+ string countFileName = "";
+
+ if(primerName == ""){
+ comboGroupName = barcodeNameVector[itBar->first];
+ }
+ else{
+ if(barcodeName == ""){
+ comboGroupName = primerNameVector[itPrimer->first];
+ }
+ else{
+ comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first];
+ }
+ }
+
+
+ ofstream temp;
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
+ variables["[tag]"] = comboGroupName;
+ fastaFileName = getOutputFileName("fasta", variables);
+ if (uniqueNames.count(fastaFileName) == 0) {
+ outputNames.push_back(fastaFileName);
+ outputTypes["fasta"].push_back(fastaFileName);
+ uniqueNames.insert(fastaFileName);
+ }
+
+ fastaFileNames[itBar->first][itPrimer->first] = fastaFileName;
+ m->openOutputFile(fastaFileName, temp); temp.close();
+
+ if(qFileName != ""){
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qFileName));
+ qualFileName = getOutputFileName("qfile", variables);
+ //register each qual output file only once, in case several barcode/primer combinations resolve to the same file name
+ if (uniqueNames.count(qualFileName) == 0) {
+ outputNames.push_back(qualFileName);
+ outputTypes["qfile"].push_back(qualFileName);
+ uniqueNames.insert(qualFileName); //record it so a later repeat is actually filtered by the check above
+ }
+
+ qualFileNames[itBar->first][itPrimer->first] = qualFileName;
+ m->openOutputFile(qualFileName, temp); temp.close(); //create (or truncate) the file now
+ }
+
+ if(nameFile != ""){
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
+ nameFileName = getOutputFileName("name", variables);
+ //same once-only registration for the per-group name file
+ if (uniqueNames.count(nameFileName) == 0) {
+ outputNames.push_back(nameFileName);
+ outputTypes["name"].push_back(nameFileName);
+ uniqueNames.insert(nameFileName); //record it so a later repeat is actually filtered by the check above
+ }
+
+ nameFileNames[itBar->first][itPrimer->first] = nameFileName;
+ m->openOutputFile(nameFileName, temp); temp.close(); //create (or truncate) the file now
+ }
+ }
+ }
+ }
+ }else {
+ for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
+ for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
+
+ string primerName = primerNameVector[itPrimer->second];
+ string barcodeName = barcodeNameVector[itBar->second];
+
+ if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing
+ else {
+ string comboGroupName = "";
+ string fastaFileName = "";
+ string qualFileName = "";
+ string nameFileName = "";
+ string countFileName = "";
+
+ if(primerName == ""){
+ comboGroupName = barcodeNameVector[itBar->second];
+ }
+ else{
+ if(barcodeName == ""){
+ comboGroupName = primerNameVector[itPrimer->second];
+ }
+ else{
+ comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
+ }
+ }
+
+
+ ofstream temp;
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
+ variables["[tag]"] = comboGroupName;
+ fastaFileName = getOutputFileName("fasta", variables);
+ if (uniqueNames.count(fastaFileName) == 0) {
+ outputNames.push_back(fastaFileName);
+ outputTypes["fasta"].push_back(fastaFileName);
+ uniqueNames.insert(fastaFileName);
+ }
+
+ fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
+ m->openOutputFile(fastaFileName, temp); temp.close();
+
+ if(qFileName != ""){
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qFileName));
+ qualFileName = getOutputFileName("qfile", variables);
+ //register each qual output file only once, in case several barcode/primer combinations resolve to the same file name
+ if (uniqueNames.count(qualFileName) == 0) {
+ outputNames.push_back(qualFileName);
+ outputTypes["qfile"].push_back(qualFileName);
+ uniqueNames.insert(qualFileName); //record it so a later repeat is actually filtered by the check above
+ }
+
+ qualFileNames[itBar->second][itPrimer->second] = qualFileName;
+ m->openOutputFile(qualFileName, temp); temp.close(); //create (or truncate) the file now
+ }
+
+ if(nameFile != ""){
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
+ nameFileName = getOutputFileName("name", variables);
+ //same once-only registration for the per-group name file
+ if (uniqueNames.count(nameFileName) == 0) {
+ outputNames.push_back(nameFileName);
+ outputTypes["name"].push_back(nameFileName);
+ uniqueNames.insert(nameFileName); //record it so a later repeat is actually filtered by the check above
+ }
+
+ nameFileNames[itBar->second][itPrimer->second] = nameFileName;
+ m->openOutputFile(nameFileName, temp); temp.close(); //create (or truncate) the file now
+ }
+ }
+ }
+ }
+ }
}
numFPrimers = primers.size();
+ if (pairedOligos) { numFPrimers = pairedPrimers.size(); }
numRPrimers = revPrimer.size();
numLinkers = linker.size();
numSpacers = spacer.size();
break;
}
}
-
+
if (allBlank) {
m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
allFiles = false;
if(qscores.getName() != ""){
qscores.trimQScores(-1, keepFirst);
}
+
+// sequence.printSequence(cout);cout << endl;
+
sequence.trim(keepFirst);
+
+// sequence.printSequence(cout);cout << endl << endl;;
+
return success;
}
catch(exception& e) {