//**********************************************************************************************************************
vector<string> PcrSeqsCommand::setParameters(){
try {
- CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
- CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
- CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
- CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
- CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
- CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
- CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
- CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
- CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
- CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
- CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
- CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots);
- CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
- CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+ CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
+ CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos);
+ CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
+ CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
+ CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
+ CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax);
+ CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli);
+ CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart);
+ CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend);
+ CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch);
+ CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
+
+ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer);
+ CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+ CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
string PcrSeqsCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The pcr.seqs command reads a fasta file ...\n";
-
+ helpString += "The pcr.seqs command reads a fasta file.\n";
+ helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n";
+ helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
+ helpString += "The start parameter allows you to provide a starting position to trim to.\n";
+ helpString += "The end parameter allows you to provide a ending position to trim from.\n";
+ helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
+ helpString += "The processors parameter allows you to use multiple processors.\n";
+ helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
+ helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
+ helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
return helpString;
exit(1);
}
}
-
-
+//**********************************************************************************************************************
+string PcrSeqsCommand::getOutputPattern(string type) {
+ try {
+ string pattern = "";
+
+ if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; }
+ else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; }
+ else if (type == "name") { pattern = "[filename],pcr,[extension]"; }
+ else if (type == "group") { pattern = "[filename],pcr,[extension]"; }
+ else if (type == "count") { pattern = "[filename],pcr,[extension]"; }
+ else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
+ else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
+
+ return pattern;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PcrSeqsCommand", "getOutputPattern");
+ exit(1);
+ }
+}
//**********************************************************************************************************************
PcrSeqsCommand::PcrSeqsCommand(){
outputTypes["taxonomy"] = tempOutNames;
outputTypes["group"] = tempOutNames;
outputTypes["name"] = tempOutNames;
+ outputTypes["count"] = tempOutNames;
outputTypes["accnos"] = tempOutNames;
}
catch(exception& e) {
outputTypes["group"] = tempOutNames;
outputTypes["name"] = tempOutNames;
outputTypes["accnos"] = tempOutNames;
+ outputTypes["count"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["group"] = inputDir + it->second; }
}
+
+ it = parameters.find("count");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["count"] = inputDir + it->second; }
+ }
}
- //if the user changes the output directory command factory will send this info to us in the output parameter
- outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
//check for required parameters
fastafile = validParameter.validFile(parameters, "fasta", true);
}else if (fastafile == "not open") { fastafile = ""; abort = true; }
else { m->setFastaFile(fastafile); }
-
+ //if the user changes the output directory command factory will send this info to us in the output parameter
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
+
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
string temp;
else if(groupfile == "not open"){ groupfile = ""; abort = true; }
else { m->setGroupFile(groupfile); }
+ countfile = validParameter.validFile(parameters, "count", true);
+ if (countfile == "not open") { countfile = ""; abort = true; }
+ else if (countfile == "not found") { countfile = ""; }
+ else { m->setCountTableFile(countfile); }
+
+ if ((namefile != "") && (countfile != "")) {
+ m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+ }
+
+ if ((groupfile != "") && (countfile != "")) {
+ m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+ }
+
taxfile = validParameter.validFile(parameters, "taxonomy", true);
if (taxfile == "not found"){ taxfile = ""; }
else if(taxfile == "not open"){ taxfile = ""; abort = true; }
else { m->setTaxonomyFile(taxfile); }
-
- temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
- m->mothurConvert(temp, pdiffs);
-
+
temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
m->mothurConvert(temp, start);
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
m->mothurConvert(temp, processors);
+
+ temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, pdiffs);
nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
}
//check to make sure you didn't forget the name file by mistake
- if (namefile == "") {
- vector<string> files; files.push_back(fastafile);
- parser.getNameFile(files);
- }
+ if (countfile == "") {
+ if (namefile == "") {
+ vector<string> files; files.push_back(fastafile);
+ parser.getNameFile(files);
+ }
+ }
}
}
if (abort == true) { if (calledHelp) { return 0; } return 2; }
int start = time(NULL);
+ fileAligned = true;
string thisOutputDir = outputDir;
if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
- string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta";
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
+ variables["[extension]"] = m->getExtension(fastafile);
+ string trimSeqFile = getOutputFileName("fasta",variables);
outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
-
- string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta";
+ variables["[tag]"] = "scrap";
+ string badSeqFile = getOutputFileName("fasta",variables);
length = 0;
- if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
+ if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(primers.size()) + ", revprimers = " + toString(revPrimer.size()) + ".\n"); } } if (m->control_pressed) { return 0; }
if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
vector<unsigned long long> positions;
if (m->control_pressed) { return 0; }
set<string> badNames;
- if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
- else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
+ numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames);
if (m->control_pressed) { return 0; }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
if (taxfile != "") { readTax(badNames); }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
+ if (countfile != "") { readCount(badNames); }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
}
+ itTypes = outputTypes.find("count");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+ }
+
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
m->mothurOutEndLine();
vector<int> processIDS;
int process = 1;
int num = 0;
+ int pstart = -1; int pend = -1;
+ bool adjustNeeded = false;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
- num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
+ string locationsFile = toString(getpid()) + ".temp";
+ num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", locationsFile, badSeqNames, lines[process], pstart, pend, adjustNeeded);
//pass numSeqs to parent
ofstream out;
string tempFile = filename + toString(getpid()) + ".num.temp";
m->openOutputFile(tempFile, out);
+ out << pstart << '\t' << pend << '\t' << adjustNeeded << endl;
out << num << '\t' << badSeqNames.size() << endl;
for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
out << (*it) << endl;
}
}
- num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
+ string locationsFile = toString(getpid()) + ".temp";
+ num = driverPcr(filename, goodFileName, badFileName, locationsFile, badSeqNames, lines[0], pstart, pend, adjustNeeded);
//force parent to wait until all the processes are done
for (int i=0;i<processIDS.size();i++) {
string tempFile = filename + toString(processIDS[i]) + ".num.temp";
m->openInputFile(tempFile, in);
int numBadNames = 0; string name = "";
- if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
+ int tpstart = -1; int tpend = -1; bool tempAdjust = false;
+
+ if (!in.eof()) {
+ in >> tpstart >> tpend >> tempAdjust; m->gobble(in);
+
+ if (tempAdjust) { adjustNeeded = true; }
+ if (tpstart != -1) {
+ if (tpstart != pstart) { adjustNeeded = true; }
+ if (tpstart < pstart) { pstart = tpstart; } //smallest start
+ }
+ if (tpend != -1) {
+ if (tpend != pend) { adjustNeeded = true; }
+ if (tpend > pend) { pend = tpend; } //largest end
+ }
+ int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in);
+ }
for (int j = 0; j < numBadNames; j++) {
in >> name; m->gobble(in);
badSeqNames.insert(name);
m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
+
+ m->appendFiles((toString(processIDS[i]) + ".temp"), locationsFile);
+ m->mothurRemove((toString(processIDS[i]) + ".temp"));
}
#else
DWORD dwThreadIdArray[processors-1];
HANDLE hThreadArray[processors-1];
+ string locationsFile = "locationsFile.txt";
+ m->mothurRemove(locationsFile);
+ m->mothurRemove(goodFileName);
+ m->mothurRemove(badFileName);
+
//Create processor worker threads.
for( int i=0; i<processors-1; i++ ){
if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
// Allocate memory for thread data.
- pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
+ pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, locationsFile+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, pdiffs, lines[i].start, lines[i].end);
pDataArray.push_back(tempPcr);
//default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
}
//do your part
- num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
+ num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"), (locationsFile+toString(processors-1)+".temp"), badSeqNames, lines[processors-1], pstart, pend, adjustNeeded);
processIDS.push_back(processors-1);
//Wait until all threads have terminated.
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ if (pDataArray[i]->count != pDataArray[i]->fend) {
+ m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
+ }
+ if (pDataArray[i]->adjustNeeded) { adjustNeeded = true; }
+ if (pDataArray[i]->pstart != -1) {
+ if (pDataArray[i]->pstart != pstart) { adjustNeeded = true; }
+ if (pDataArray[i]->pstart < pstart) { pstart = pDataArray[i]->pstart; }
+ } //smallest start
+ if (pDataArray[i]->pend != -1) {
+ if (pDataArray[i]->pend != pend) { adjustNeeded = true; }
+ if (pDataArray[i]->pend > pend) { pend = pDataArray[i]->pend; }
+ } //largest end
+
for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
+
+ m->appendFiles((locationsFile+toString(processIDS[i]) + ".temp"), locationsFile);
+ m->mothurRemove((locationsFile+toString(processIDS[i]) + ".temp"));
}
#endif
+ if (fileAligned && adjustNeeded) { adjustDots(goodFileName, locationsFile, pstart, pend); }
return num;
}
//**********************************************************************************************************************
-int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
+int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, string locationsName, set<string>& badSeqNames, linePair filePos, int& pstart, int& pend, bool& adjustNeeded){
try {
ofstream goodFile;
m->openOutputFile(goodFasta, goodFile);
ofstream badFile;
m->openOutputFile(badFasta, badFile);
+
+ ofstream locationsFile;
+ m->openOutputFile(locationsName, locationsFile);
ifstream inFASTA;
m->openInputFile(filename, inFASTA);
bool done = false;
int count = 0;
set<int> lengths;
+ vector< set<int> > locations; //locations[0] = beginning locations, locations[1] = ending locations
+ locations.resize(2);
+
+ //pdiffs, bdiffs, primers, barcodes, revPrimers
+ map<string, int> faked;
+ TrimOligos trim(pdiffs, 0, primers, faked, revPrimer);
while (!done) {
Sequence currSeq(inFASTA); m->gobble(inFASTA);
+ if (fileAligned) { //assume aligned until proven otherwise
+ lengths.insert(currSeq.getAligned().length());
+ if (lengths.size() > 1) { fileAligned = false; }
+ }
+
string trashCode = "";
+ string locationsString = "";
+ int thisPStart = -1;
+ int thisPEnd = -1;
if (currSeq.getName() != "") {
+ if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); }
+
bool goodSeq = true;
if (oligosfile != "") {
map<int, int> mapAligned;
//process primers
if (primers.size() != 0) {
int primerStart = 0; int primerEnd = 0;
- bool good = findForward(currSeq, primerStart, primerEnd);
+ bool good = trim.findForward(currSeq, primerStart, primerEnd);
if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
else{
//are you aligned
if (aligned) {
- if (!keepprimer) {
- if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
- else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
+ if (!keepprimer) {
+ if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd-1]+1); } //mapAligned[primerEnd-1] is the location of the last base in the primer. we want to trim to the space just after that. The -1 & +1 ensures if the primer is followed by gaps they are not trimmed causing an aligned sequence dataset to become unaligned.
+ else {
+ currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd-1]+1));
+ if (fileAligned) {
+ thisPStart = mapAligned[primerEnd-1]+1; //locations[0].insert(mapAligned[primerEnd-1]+1);
+ locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
+ }
+ }
}
else {
if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
- else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
+ else {
+ currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));
+ if (fileAligned) {
+ thisPStart = mapAligned[primerStart]; //locations[0].insert(mapAligned[primerStart]);
+ locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
+ }
+ }
}
+ isAligned(currSeq.getAligned(), mapAligned);
}else {
if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
//process reverse primers
if (revPrimer.size() != 0) {
int primerStart = 0; int primerEnd = 0;
- bool good = findReverse(currSeq, primerStart, primerEnd);
+ bool good = trim.findReverse(currSeq, primerStart, primerEnd);
if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
- else{
+ else{
//are you aligned
if (aligned) {
if (!keepprimer) {
if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
- else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
+ else {
+ currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));
+ if (fileAligned) {
+ thisPEnd = mapAligned[primerStart]; //locations[1].insert(mapAligned[primerStart]);
+ locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
+ }
+ }
}
else {
- if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
- else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
+ if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd-1]+1); }
+ else {
+ currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd-1]+1));
+ if (fileAligned) {
+ thisPEnd = mapAligned[primerEnd-1]+1; //locations[1].insert(mapAligned[primerEnd-1]+1);
+ locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
+ }
+ }
}
}
else {
}
}else if (ecolifile != "") {
//make sure the seqs are aligned
- lengths.insert(currSeq.getAligned().length());
- if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
+ if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
else if (currSeq.getAligned().length() != length) {
m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
}else {
}
}else{ //using start and end to trim
//make sure the seqs are aligned
- lengths.insert(currSeq.getAligned().length());
- if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
+ if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
else {
if (end != -1) {
if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
//trimming removed all bases
if (currSeq.getUnaligned() == "") { goodSeq = false; }
- if(goodSeq == 1) { currSeq.printSequence(goodFile); }
+ if(goodSeq == 1) {
+ currSeq.printSequence(goodFile);
+ if (m->debug) { m->mothurOut("[DEBUG]: " + locationsString + "\n"); }
+ if (locationsString != "") { locationsFile << locationsString; }
+ if (thisPStart != -1) { locations[0].insert(thisPStart); }
+ if (thisPEnd != -1) { locations[1].insert(thisPEnd); }
+ }
else {
badSeqNames.insert(currSeq.getName());
currSeq.setName(currSeq.getName() + '|' + trashCode);
#endif
//report progress
- if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
+ if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); }
}
//report progress
- if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
+ if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); }
badFile.close();
goodFile.close();
inFASTA.close();
-
+ locationsFile.close();
+
+ if (m->debug) { m->mothurOut("[DEBUG]: fileAligned = " + toString(fileAligned) +'\n'); }
+
+ if (fileAligned && !keepdots) { //print out smallest start value and largest end value
+ if ((locations[0].size() > 1) || (locations[1].size() > 1)) { adjustNeeded = true; }
+ if (primers.size() != 0) { set<int>::iterator it = locations[0].begin(); pstart = *it; }
+ if (revPrimer.size() != 0) { set<int>::reverse_iterator it2 = locations[1].rbegin(); pend = *it2; }
+ }
+
return count;
}
catch(exception& e) {
}
}
//********************************************************************/
-bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
- try {
-
- string rawSequence = seq.getUnaligned();
-
- for(int j=0;j<primers.size();j++){
- string oligo = primers[j];
-
- if(rawSequence.length() < oligo.length()) { break; }
-
- //search for primer
- int olength = oligo.length();
- for (int j = 0; j < rawSequence.length()-olength; j++){
- if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
- string rawChunk = rawSequence.substr(j, olength);
- if(compareDNASeq(oligo, rawChunk)) {
- primerStart = j;
- primerEnd = primerStart + olength;
- return true;
- }
-
- }
- }
-
- primerStart = 0; primerEnd = 0;
- return false;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimOligos", "stripForward");
- exit(1);
- }
-}
-//******************************************************************/
-bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
- try {
-
- string rawSequence = seq.getUnaligned();
-
- for(int i=0;i<revPrimer.size();i++){
- string oligo = revPrimer[i];
- if(rawSequence.length() < oligo.length()) { break; }
-
- //search for primer
- int olength = oligo.length();
- for (int j = rawSequence.length()-olength; j >= 0; j--){
- if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
- string rawChunk = rawSequence.substr(j, olength);
-
- if(compareDNASeq(oligo, rawChunk)) {
- primerStart = j;
- primerEnd = primerStart + olength;
- return true;
- }
-
- }
- }
-
- primerStart = 0; primerEnd = 0;
- return false;
- }
- catch(exception& e) {
- m->errorOut(e, "PcrSeqsCommand", "findReverse");
- exit(1);
- }
-}
-//********************************************************************/
bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
try {
+ aligned.clear();
bool isAligned = false;
int countBases = 0;
exit(1);
}
}
+//**********************************************************************************************************************
+int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, int pend){
+ try {
+ ifstream inFasta;
+ m->openInputFile(goodFasta, inFasta);
+
+ ifstream inLocations;
+ m->openInputFile(locations, inLocations);
+
+ ofstream out;
+ m->openOutputFile(goodFasta+".temp", out);
+
+ set<int> lengths;
+ //cout << pstart << '\t' << pend << endl;
+
+ while(!inFasta.eof()) {
+ if(m->control_pressed) { break; }
+
+ Sequence seq(inFasta); m->gobble(inFasta);
+
+ string name = "";
+ int thisStart = -1; int thisEnd = -1;
+ if (primers.size() != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); }
+ if (revPrimer.size() != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); }
+ //cout << seq.getName() << '\t' << thisStart << '\t' << thisEnd << '\t' << seq.getAligned().length() << endl;
+ //cout << seq.getName() << '\t' << pstart << '\t' << pend << endl;
+
+ if (name != seq.getName()) { m->mothurOut("[ERROR]: name mismatch in pcr.seqs.\n"); }
+ else {
+ if (pstart != -1) {
+ if (thisStart != -1) {
+ if (thisStart != pstart) {
+ string dots = "";
+ for (int i = pstart; i < thisStart; i++) { dots += "."; }
+ thisEnd += dots.length();
+ dots += seq.getAligned();
+ seq.setAligned(dots);
+ }
+ }
+ }
+
+ if (pend != -1) {
+ if (thisEnd != -1) {
+ if (thisEnd != pend) {
+ string dots = seq.getAligned();
+ for (int i = thisEnd; i < pend; i++) { dots += "."; }
+ seq.setAligned(dots);
+ }
+ }
+ }
+ lengths.insert(seq.getAligned().length());
+ }
+
+ seq.printSequence(out);
+ }
+ inFasta.close();
+ inLocations.close();
+ out.close();
+ m->mothurRemove(locations);
+ m->mothurRemove(goodFasta);
+ m->renameFile(goodFasta+".temp", goodFasta);
+
+ //cout << "final lengths = \n";
+ //for (set<int>::iterator it = lengths.begin(); it != lengths.end(); it++) {
+ // cout << *it << endl;
+ // }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PcrSeqsCommand", "adjustDots");
+ exit(1);
+ }
+}
//********************************************************************/
string PcrSeqsCommand::reverseOligo(string oligo){
try {
m->openInputFile(oligosfile, inOligos);
string type, oligo, group;
+ int primerCount = 0;
while(!inOligos.eof()){
// get rest of line in case there is a primer name
while (!inOligos.eof()) {
char c = inOligos.get();
- if (c == 10 || c == 13){ break; }
+ if (c == 10 || c == 13 || c == -1){ break; }
else if (c == 32 || c == 9){;} //space or tab
}
- primers.push_back(oligo);
+ primers[oligo] = primerCount; primerCount++;
+ //cout << "for oligo = " << oligo << endl;
}else if(type == "REVERSE"){
string oligoRC = reverseOligo(oligo);
revPrimer.push_back(oligoRC);
- //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
+ //cout << "rev oligo = " << oligo << " reverse = " << oligoRC << endl;
}else if(type == "BARCODE"){
- inOligos >> group;
+ inOligos >> group;
+ }else if(type == "PRIMER"){
+ m->gobble(inOligos);
+ primers[oligo] = primerCount; primerCount++;
+
+ string roligo="";
+ inOligos >> roligo;
+
+ for(int i=0;i<roligo.length();i++){
+ roligo[i] = toupper(roligo[i]);
+ if(roligo[i] == 'U') { roligo[i] = 'T'; }
+ }
+ revPrimer.push_back(reverseOligo(roligo));
+
+ // get rest of line in case there is a primer name
+ while (!inOligos.eof()) {
+ char c = inOligos.get();
+ if (c == 10 || c == 13 || c == -1){ break; }
+ else if (c == 32 || c == 9){;} //space or tab
+ }
+ //cout << "prim oligo = " << oligo << " reverse = " << roligo << endl;
}else if((type == "LINKER")||(type == "SPACER")) {;}
- else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+ else{ m->mothurOut(type + " is not recognized as a valid type. Choices are primer, forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
}
m->gobble(inOligos);
}
try {
string thisOutputDir = outputDir;
if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
- string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
+ string outputFileName = getOutputFileName("accnos",variables);
outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
ofstream out;
exit(1);
}
-}
-//******************************************************************/
-bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
- try {
- bool success = 1;
- int length = oligo.length();
-
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
-
- if(success == 0) { break; }
- }
- else{
- success = 1;
- }
- }
-
- return success;
- }
- catch(exception& e) {
- m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
- exit(1);
- }
-
}
//***************************************************************************************************************
int PcrSeqsCommand::readName(set<string>& names){
try {
string thisOutputDir = outputDir;
if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
- string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile);
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
+ variables["[extension]"] = m->getExtension(namefile);
+ string outputFileName = getOutputFileName("name", variables);
ofstream out;
m->openOutputFile(outputFileName, out);
try {
string thisOutputDir = outputDir;
if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
- string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile);
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
+ variables["[extension]"] = m->getExtension(groupfile);
+ string outputFileName = getOutputFileName("group", variables);
ofstream out;
m->openOutputFile(outputFileName, out);
try {
string thisOutputDir = outputDir;
if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
- string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile);
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
+ variables["[extension]"] = m->getExtension(taxfile);
+ string outputFileName = getOutputFileName("taxonomy", variables);
+
ofstream out;
m->openOutputFile(outputFileName, out);
exit(1);
}
}
+//***************************************************************************************************************
+int PcrSeqsCommand::readCount(set<string> badSeqNames){
+ try {
+ ifstream in;
+ m->openInputFile(countfile, in);
+ set<string>::iterator it;
+
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
+ variables["[extension]"] = m->getExtension(countfile);
+ string goodCountFile = getOutputFileName("count", variables);
+
+ outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
+ ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
+
+ string headers = m->getline(in); m->gobble(in);
+ goodCountOut << headers << endl;
+
+ string name, rest; int thisTotal, removedCount; removedCount = 0;
+ bool wroteSomething = false;
+ while (!in.eof()) {
+
+ if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
+
+ in >> name; m->gobble(in);
+ in >> thisTotal; m->gobble(in);
+ rest = m->getline(in); m->gobble(in);
+
+ if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
+ else{
+ wroteSomething = true;
+ goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
+ }
+ }
+ in.close();
+ goodCountOut.close();
+
+ if (m->control_pressed) { m->mothurRemove(goodCountFile); }
+
+ if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
+
+ //check for groups that have been eliminated
+ CountTable ct;
+ if (ct.testGroups(goodCountFile)) {
+ ct.readTable(goodCountFile, true, false);
+ ct.printTable(goodCountFile);
+ }
+
+ if (m->control_pressed) { m->mothurRemove(goodCountFile); }
+
+ m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
+
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PcrSeqsCommand", "readCOunt");
+ exit(1);
+ }
+}
/**************************************************************************************/