#include "needlemanoverlap.hpp"
+/********************************************************************/
+//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, map<string, int> rbr, vector<string> r, vector<string> lk, vector<string> sp){
+ try {
+ m = MothurOut::getInstance();
+
+ pdiffs = p;
+ bdiffs = b;
+ ldiffs = l;
+ sdiffs = s;
+
+ barcodes = br;
+ rbarcodes = rbr;
+ primers = pr;
+ revPrimer = r;
+ linker = lk;
+ spacer = sp;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "TrimOligos");
+ exit(1);
+ }
+}
/********************************************************************/
//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
}
oligo = oligo.substr(0,alnLength);
temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
+ int numDiff = countDiffs(oligo, temp);
if(numDiff < minDiff){
minDiff = numDiff;
alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
oligo = alignment->getSeqAAln();
string temp = alignment->getSeqBAln();
-
+
int alnLength = oligo.length();
for(int i=oligo.length()-1;i>=0;i--){
}
oligo = oligo.substr(0,alnLength);
temp = temp.substr(0,alnLength);
-
+
int numDiff = countDiffs(oligo, temp);
if(numDiff < minDiff){
exit(1);
}
+}
+//*******************************************************************/
+int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ int success = bdiffs + 1; //guilty until proven innocent
+
+ //can you find the barcode
+ for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ string oligo = it->first;
+ if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+ group = it->second;
+ seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+
+ if(qual.getName() != ""){
+ qual.trimQScores(-1, rawSequence.length()-oligo.length());
+ }
+
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((bdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (rbarcodes.size() > 0) {
+ map<string,int>::iterator it;
+
+ for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ if(it->first.length() > maxLength){
+ maxLength = it->first.length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minGroup = -1;
+ int minPos = 0;
+
+ for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ string oligo = it->first;
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=0;i<alnLength;i++){
+ if(oligo[i] != '-'){ alnLength = i; break; }
+ }
+ oligo = oligo.substr(alnLength);
+ temp = temp.substr(alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minGroup = it->second;
+ minPos = 0;
+ for(int i=alnLength-1;i>=0;i--){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > bdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
+ else{ //use the best match
+ group = minGroup;
+ seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+
+ if(qual.getName() != ""){
+ qual.trimQScores(-1, (rawSequence.length()-minPos));
+ }
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripRBarcode");
+ exit(1);
+ }
+
+}
+//*******************************************************************/
+int TrimOligos::stripRBarcode(Sequence& seq, int& group){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ int success = bdiffs + 1; //guilty until proven innocent
+
+ //can you find the barcode
+ for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ string oligo = it->first;
+ if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+ group = it->second;
+ seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((bdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (rbarcodes.size() > 0) {
+ map<string,int>::iterator it;
+
+ for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ if(it->first.length() > maxLength){
+ maxLength = it->first.length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minGroup = -1;
+ int minPos = 0;
+
+ for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ string oligo = it->first;
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=0;i<alnLength;i++){
+ if(oligo[i] != '-'){ alnLength = i; break; }
+ }
+ oligo = oligo.substr(alnLength);
+ temp = temp.substr(alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minGroup = it->second;
+ minPos = 0;
+ for(int i=alnLength-1;i>=0;i--){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > bdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
+ else{ //use the best match
+ group = minGroup;
+ seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripRBarcode");
+ exit(1);
+ }
+
}
//********************************************************************/
int TrimOligos::stripForward(Sequence& seq, int& group){
public:
TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
- TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
+ TrimOligos(int,int, int, int, map<string, int>, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
+ TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
~TrimOligos();
int stripBarcode(Sequence&, int&);
int stripBarcode(Sequence&, QualityScores&, int&);
+
+ int stripRBarcode(Sequence&, int&);
+ int stripRBarcode(Sequence&, QualityScores&, int&);
int stripForward(Sequence&, int&);
int stripForward(Sequence&, QualityScores&, int&, bool);
int pdiffs, bdiffs, ldiffs, sdiffs;
map<string, int> barcodes;
+ map<string, int> rbarcodes;
map<string, int> primers;
vector<string> revPrimer;
vector<string> linker;
int count = 0;
bool moreSeqs = 1;
- TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
+ TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
while (moreSeqs) {
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
+
+ if(rbarcodes.size() != 0){
+ success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+ if(success > bdiffs) { trashCode += 'b'; }
+ else{ currentSeqsDiffs += success; }
+ }
if(numSpacers != 0){
success = trimOligos.stripSpacer(currSeq, currQual);
tempPrimerQualFileNames,
tempNameFileNames,
lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
- pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
+ pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
int startIndex = i * numSeqsPerProcessor;
if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
- cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
}
}
}
else if(type == "BARCODE"){
inOligos >> group;
+
+ //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
+ //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
+ string temp = "";
+ while (!inOligos.eof()) {
+ char c = inOligos.get();
+ if (c == 10 || c == 13){ break; }
+ else if (c == 32 || c == 9){;} //space or tab
+ else { temp += c; }
+ }
+ //then this is illumina data with 4 columns
+ if (temp != "") {
+ string reverseBarcode = reverseOligo(group); //reverse barcode
+ group = temp;
+
+ //check for repeat barcodes
+ map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
+ if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); }
+
+ rbarcodes[reverseBarcode]=indexBarcode;
+ }
+
//check for repeat barcodes
map<string, int>::iterator itBar = barcodes.find(oligo);
if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
vector<string> revPrimer, outputNames;
set<string> filesToRemove;
map<string, int> barcodes;
+ map<string, int> rbarcodes;
vector<string> groupVector;
map<string, int> primers;
vector<string> linker;
double qRollAverage, qThreshold, qWindowAverage, qAverage;
vector<string> revPrimer;
map<string, int> barcodes;
+ map<string, int> rbarcodes;
map<string, int> primers;
vector<string> linker;
vector<string> spacer;
trimData(){}
trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
- int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
+ int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa,
vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
sdiffs = sd;
tdiffs = td;
barcodes = bar;
+ rbarcodes = rbar;
primers = pri; numFPrimers = primers.size();
revPrimer = revP; numRPrimers = revPrimer.size();
linker = li; numLinkers = linker.size();
}
- TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
+ TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
pDataArray->count = pDataArray->lineEnd;
for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
if(success > pDataArray->bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
-
+
+ if(pDataArray->rbarcodes.size() != 0){
+ success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+ if(success > pDataArray->bdiffs) { trashCode += 'b'; }
+ else{ currentSeqsDiffs += success; }
+ }
+
if(pDataArray->numSpacers != 0){
success = trimOligos.stripSpacer(currSeq, currQual);
if(success > pDataArray->sdiffs) { trashCode += 's'; }