#include "trimoligos.h"
#include "alignment.hpp"
#include "needlemanoverlap.hpp"
+#include "counttable.h"
class PcrSeqsCommand : public Command {
public:
vector<string> setParameters();
string getCommandName() { return "pcr.seqs"; }
string getCommandCategory() { return "Sequence Processing"; }
+
string getHelpString();
+ string getOutputPattern(string);
string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; }
string getDescription() { return "pcr.seqs"; }
vector<linePair> lines;
bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
- bool abort, keepprimer, keepdots;
- string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch;
- int start, end, pdiffs, processors, length;
+ bool abort, keepprimer, keepdots, fileAligned;
+ string fastafile, oligosfile, taxfile, groupfile, namefile, countfile, ecolifile, outputDir, nomatch;
+ int start, end, processors, length, pdiffs;
vector<string> revPrimer, outputNames;
- vector<string> primers;
+ map<string, int> primers;
int writeAccnos(set<string>);
int readName(set<string>&);
int readGroup(set<string>);
int readTax(set<string>);
+ int readCount(set<string>);
bool readOligos();
bool readEcoli();
- int driverPcr(string, string, string, set<string>&, linePair);
+ int driverPcr(string, string, string, string, set<string>&, linePair, int&, bool&);
int createProcesses(string, string, string, set<string>&);
- bool findForward(Sequence&, int&, int&);
- bool findReverse(Sequence&, int&, int&);
bool isAligned(string, map<int, int>&);
- bool compareDNASeq(string, string);
string reverseOligo(string);
+ int adjustDots(string, string, int, int);
+
};
/**************************************************************************************************/
// that can be passed using a single void pointer (LPVOID).
struct pcrData {
string filename;
- string goodFasta, badFasta, oligosfile, ecolifile, nomatch;
+ string goodFasta, badFasta, oligosfile, ecolifile, nomatch, locationsName;
unsigned long long fstart;
unsigned long long fend;
- int count, start, end, length;
+ int count, start, end, length, pdiffs, pstart, pend;
MothurOut* m;
- vector<string> primers;
+ map<string, int> primers;
vector<string> revPrimer;
set<string> badSeqNames;
- bool keepprimer, keepdots;
+ bool keepprimer, keepdots, fileAligned, adjustNeeded;
pcrData(){}
- pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector<string> pr, vector<string> rpr, string nm, bool kp, bool kd, int st, int en, int l, unsigned long long fst, unsigned long long fen) {
+ pcrData(string f, string gf, string bfn, string loc, MothurOut* mout, string ol, string ec, map<string, int> pr, vector<string> rpr, string nm, bool kp, bool kd, int st, int en, int l, int pd, unsigned long long fst, unsigned long long fen) {
filename = f;
goodFasta = gf;
badFasta = bfn;
nomatch = nm;
keepprimer = kp;
keepdots = kd;
+ end = en;
start = st;
- end = en;
length = l;
fstart = fst;
fend = fen;
+ pdiffs = pd;
+ locationsName = loc;
count = 0;
+ fileAligned = true;
+ adjustNeeded = false;
+ pstart = -1;
+ pend = -1;
}
};
/**************************************************************************************************/
ofstream badFile;
pDataArray->m->openOutputFile(pDataArray->badFasta, badFile);
+
+ ofstream locationsFile;
+ pDataArray->m->openOutputFile(pDataArray->locationsName, locationsFile);
ifstream inFASTA;
pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
}
set<int> lengths;
- pDataArray->count = pDataArray->fend;
+ //pdiffs, bdiffs, primers, barcodes, revPrimers
+ map<string, int> faked;
+ set<int> locations; //locations = beginning locations
+
+ TrimOligos trim(pDataArray->pdiffs, 0, pDataArray->primers, faked, pDataArray->revPrimer);
+
for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
-
+ pDataArray->count++;
if (pDataArray->m->control_pressed) { break; }
Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
-
+
+ if (pDataArray->fileAligned) { //assume aligned until proven otherwise
+ lengths.insert(currSeq.getAligned().length());
+ if (lengths.size() > 1) { pDataArray->fileAligned = false; }
+ }
+
string trashCode = "";
+ string locationsString = "";
+ int thisPStart = -1;
+ int thisPEnd = -1;
if (currSeq.getName() != "") {
bool goodSeq = true;
//process primers
if (pDataArray->primers.size() != 0) {
int primerStart = 0; int primerEnd = 0;
- //bool good = findForward(currSeq, primerStart, primerEnd);
- ///////////////////////////////////////////////////////////////
- bool good = false;
- string rawSequence = currSeq.getUnaligned();
-
- for(int j=0;j<pDataArray->primers.size();j++){
- string oligo = pDataArray->primers[j];
-
- if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
-
- if(rawSequence.length() < oligo.length()) { break; }
-
- //search for primer
- int olength = oligo.length();
- for (int l = 0; l < rawSequence.length()-olength; l++){
- if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
- string rawChunk = rawSequence.substr(l, olength);
- //compareDNASeq(oligo, rawChunk)
- ////////////////////////////////////////////////////////
- bool success = 1;
- for(int k=0;k<olength;k++){
-
- if(oligo[k] != rawChunk[k]){
- if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C') { success = 0; }
- else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N')) { success = 0; }
- else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T')) { success = 0; }
- else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A')) { success = 0; }
- else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A')) { success = 0; }
- else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C')) { success = 0; }
- else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
-
- if(success == 0) { break; }
- }
- else{
- success = 1;
- }
- }
-
- ////////////////////////////////////////////////////////////////////
- if(success) {
- primerStart = j;
- primerEnd = primerStart + olength;
- good = true; break;
- }
- }
- if (good) { break; }
- }
-
- if (!good) { primerStart = 0; primerEnd = 0; }
- ///////////////////////////////////////////////////////////////
-
+ bool good = trim.findForward(currSeq, primerStart, primerEnd);
if(!good){ if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
else{
//are you aligned
if (aligned) {
if (!pDataArray->keepprimer) {
- if (pDataArray->keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
- else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
+ if (pDataArray->keepdots) { currSeq.filterToPos(mapAligned[primerEnd-1]+1); }
+ else {
+ currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd-1]+1));
+ if (pDataArray->fileAligned) {
+ thisPStart = mapAligned[primerEnd-1]+1; //locations.insert(mapAligned[primerEnd-1]+1);
+ locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
+ }
+}
}
else {
if (pDataArray->keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
- else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
+ else {
+ currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));
+ if (pDataArray->fileAligned) {
+ thisPStart = mapAligned[primerStart]; //locations.insert(mapAligned[primerStart]);
+ locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
+ }
+ }
}
+ ///////////////////////////////////////////////////////////////
+ mapAligned.clear();
+ string seq = currSeq.getAligned();
+ int countBases = 0;
+ for (int k = 0; k < seq.length(); k++) {
+ if (!isalpha(seq[k])) { ; }
+ else { mapAligned[countBases] = k; countBases++; }
+ }
+ ///////////////////////////////////////////////////////////////
}else {
if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
//process reverse primers
if (pDataArray->revPrimer.size() != 0) {
int primerStart = 0; int primerEnd = 0;
- bool good = false;
- //findReverse(currSeq, primerStart, primerEnd);
- ///////////////////////////////////////////////////////////////
- string rawSequence = currSeq.getUnaligned();
-
- for(int j=0;j<pDataArray->revPrimer.size();j++){
- string oligo = pDataArray->revPrimer[j];
- if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
- if(rawSequence.length() < oligo.length()) { break; }
-
- //search for primer
- int olength = oligo.length();
- for (int l = rawSequence.length()-olength; l >= 0; l--){
-
- string rawChunk = rawSequence.substr(l, olength);
- //compareDNASeq(oligo, rawChunk)
- ////////////////////////////////////////////////////////
- bool success = 1;
- for(int k=0;k<olength;k++){
-
- if(oligo[k] != rawChunk[k]){
- if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C') { success = 0; }
- else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N')) { success = 0; }
- else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T')) { success = 0; }
- else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A')) { success = 0; }
- else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A')) { success = 0; }
- else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
- else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C')) { success = 0; }
- else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
-
- if(success == 0) { break; }
- }
- else{
- success = 1;
- }
- }
-
- ////////////////////////////////////////////////////////////////////
- if(success) {
- primerStart = j;
- primerEnd = primerStart + olength;
- good = true; break;
- }
- }
- if (good) { break; }
- }
-
- if (!good) { primerStart = 0; primerEnd = 0; }
-
- ///////////////////////////////////////////////////////////////
+ bool good = trim.findReverse(currSeq, primerStart, primerEnd);
+
if(!good){ if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
else{
//are you aligned
if (aligned) {
if (!pDataArray->keepprimer) {
if (pDataArray->keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
- else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
+ else {
+ currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));
+ if (pDataArray->fileAligned) {
+ thisPEnd = mapAligned[primerStart]; //locations.insert(mapAligned[primerStart]);
+ locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
+ }
+
+ }
}
else {
- if (pDataArray->keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
- else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
+ if (pDataArray->keepdots) { currSeq.filterFromPos(mapAligned[primerEnd-1]+1); }
+ else {
+ currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd-1]+1));
+ if (pDataArray->fileAligned) {
+ thisPEnd = mapAligned[primerEnd-1]+1; //locations.insert(mapAligned[primerEnd-1]+1);
+ locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
+ }
+
+ }
} }
else {
if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
}
}else if (pDataArray->ecolifile != "") {
//make sure the seqs are aligned
- lengths.insert(currSeq.getAligned().length());
- if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
+ if (!pDataArray->fileAligned) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
else if (currSeq.getAligned().length() != pDataArray->length) {
pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break;
}else {
}
}else{ //using start and end to trim
//make sure the seqs are aligned
- lengths.insert(currSeq.getAligned().length());
- if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
+ if (!pDataArray->fileAligned) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
else {
if (pDataArray->end != -1) {
if (pDataArray->end > currSeq.getAligned().length()) { pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; }
}
}
- if(goodSeq == 1) { currSeq.printSequence(goodFile); }
+ if(goodSeq == 1) {
+ currSeq.printSequence(goodFile);
+ if (locationsString != "") { locationsFile << locationsString; }
+ if (thisPStart != -1) { locations.insert(thisPStart); }
+ }
else {
pDataArray->badSeqNames.insert(currSeq.getName());
currSeq.setName(currSeq.getName() + '|' + trashCode);
}
//report progress
- if((i+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine(); }
+ if((i+1) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(i+1)+"\n"); }
}
//report progress
- if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Thread Processing sequence: " + toString(pDataArray->count)+"\n"); }
goodFile.close();
inFASTA.close();
badFile.close();
+ locationsFile.close();
+
+ if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: fileAligned = " + toString(pDataArray->fileAligned) +'\n'); }
+
+ if (pDataArray->fileAligned && !pDataArray->keepdots) { //print out smallest start value and largest end value
+ if (locations.size() > 1) { pDataArray->adjustNeeded = true; }
+ if (pDataArray->primers.size() != 0) { set<int>::iterator it = locations.begin(); pDataArray->pstart = *it; }
+ }
return 0;
-
}
catch(exception& e) {
pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction");