From e0ce7cbc93d7d2fbb753ca694182db092a0ea0e7 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 5 Jun 2012 13:08:53 -0400 Subject: [PATCH] added forward and reverse barcodes to trim.seqs to process illumina seqs --- trimoligos.cpp | 263 +++++++++++++++++++++++++++++++++++++++++++- trimoligos.h | 7 +- trimseqscommand.cpp | 33 +++++- trimseqscommand.h | 15 ++- 4 files changed, 307 insertions(+), 11 deletions(-) diff --git a/trimoligos.cpp b/trimoligos.cpp index 8c523ce..2f92cc8 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -12,6 +12,29 @@ #include "needlemanoverlap.hpp" +/********************************************************************/ +//strip, pdiffs, bdiffs, primers, barcodes, revPrimers +TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, map rbr, vector r, vector lk, vector sp){ + try { + m = MothurOut::getInstance(); + + pdiffs = p; + bdiffs = b; + ldiffs = l; + sdiffs = s; + + barcodes = br; + rbarcodes = rbr; + primers = pr; + revPrimer = r; + linker = lk; + spacer = sp; + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "TrimOligos"); + exit(1); + } +} /********************************************************************/ //strip, pdiffs, bdiffs, primers, barcodes, revPrimers TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, vector r, vector lk, vector sp){ @@ -129,8 +152,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); + int numDiff = countDiffs(oligo, temp); if(numDiff < minDiff){ minDiff = numDiff; @@ -237,7 +259,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); - + int alnLength = oligo.length(); for(int i=oligo.length()-1;i>=0;i--){ @@ -245,7 +267,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); - + int numDiff = countDiffs(oligo, temp); if(numDiff < minDiff){ @@ -285,6 +307,239 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ exit(1); } +} +//*******************************************************************/ +int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length()))); + + if(qual.getName() != ""){ + qual.trimQScores(-1, rawSequence.length()-oligo.length()); + } + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (rbarcodes.size() > 0) { + map::iterator it; + + for(it=rbarcodes.begin();it!=rbarcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=0;isecond; + minPos = 0; + for(int i=alnLength-1;i>=0;i--){ + if(temp[i] != '-'){ + minPos++; + } + } + } + else if(numDiff == minDiff){ + minCount++; + } + + } + + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos))); + + if(qual.getName() != ""){ + qual.trimQScores(-1, (rawSequence.length()-minPos)); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripRBarcode"); + exit(1); + } + +} +//*******************************************************************/ +int TrimOligos::stripRBarcode(Sequence& seq, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length()))); + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (rbarcodes.size() > 0) { + map::iterator it; + + for(it=rbarcodes.begin();it!=rbarcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=0;isecond; + minPos = 0; + for(int i=alnLength-1;i>=0;i--){ + if(temp[i] != '-'){ + minPos++; + } + } + } + else if(numDiff == minDiff){ + minCount++; + } + + } + + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos))); + + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripRBarcode"); + exit(1); + } + } //********************************************************************/ int TrimOligos::stripForward(Sequence& seq, int& group){ diff --git a/trimoligos.h b/trimoligos.h index e3ea7d5..a32b3d8 100644 --- a/trimoligos.h +++ b/trimoligos.h @@ -20,11 +20,15 @@ class TrimOligos { public: TrimOligos(int,int, map, map, vector); //pdiffs, bdiffs, primers, barcodes, revPrimers - TrimOligos(int,int, int, int, map, map, vector, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer + TrimOligos(int,int, int, int, map, map, map, vector, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer + TrimOligos(int,int, int, int, map, map, vector, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer ~TrimOligos(); int stripBarcode(Sequence&, int&); int stripBarcode(Sequence&, QualityScores&, int&); + + int stripRBarcode(Sequence&, int&); + int stripRBarcode(Sequence&, QualityScores&, int&); int stripForward(Sequence&, int&); int stripForward(Sequence&, QualityScores&, int&, bool); @@ -43,6 +47,7 @@ class TrimOligos { int pdiffs, bdiffs, ldiffs, sdiffs; map barcodes; + map rbarcodes; map primers; vector revPrimer; vector linker; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 9f8fafb..c019a70 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -562,7 +562,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int count = 0; bool moreSeqs = 1; - TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer); while (moreSeqs) { @@ -607,6 +607,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } + + if(rbarcodes.size() != 0){ + success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } if(numSpacers != 0){ success = trimOligos.stripSpacer(currSeq, currQual); @@ -946,7 +952,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName tempPrimerQualFileNames, tempNameFileNames, lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m, - pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, + pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap); @@ -1187,7 +1193,6 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) { int startIndex = i * numSeqsPerProcessor; if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor)); - cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl; if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); } } } @@ -1262,7 +1267,29 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< } else if(type == "BARCODE"){ inOligos >> group; + + //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs + //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info + string temp = ""; + while (!inOligos.eof()) { + char c = inOligos.get(); + if (c == 10 || c == 13){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { temp += c; } + } + //then this is illumina data with 4 columns + if (temp != "") { + string reverseBarcode = reverseOligo(group); //reverse barcode + group = temp; + + //check for repeat barcodes + map::iterator itBar = rbarcodes.find(reverseBarcode); + if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); } + + rbarcodes[reverseBarcode]=indexBarcode; + } + //check for repeat barcodes map::iterator itBar = barcodes.find(oligo); if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } diff --git a/trimseqscommand.h b/trimseqscommand.h index 9ba64c4..ba4e614 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -63,6 +63,7 @@ private: vector revPrimer, outputNames; set filesToRemove; map barcodes; + map rbarcodes; vector groupVector; map primers; vector linker; @@ -101,6 +102,7 @@ struct trimData { double qRollAverage, qThreshold, qWindowAverage, qAverage; vector revPrimer; map barcodes; + map rbarcodes; map primers; vector linker; vector spacer; @@ -112,7 +114,7 @@ struct trimData { trimData(){} trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, - int pd, int bd, int ld, int sd, int td, map pri, map bar, vector revP, vector li, vector spa, + int pd, int bd, int ld, int sd, int td, map pri, map bar, map rbar, vector revP, vector li, vector spa, vector priNameVector, vector barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL, int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, int minL, int maxA, int maxH, int maxL, bool fli, map nm) { @@ -141,6 +143,7 @@ struct trimData { sdiffs = sd; tdiffs = td; barcodes = bar; + rbarcodes = rbar; primers = pri; numFPrimers = primers.size(); revPrimer = revP; numRPrimers = revPrimer.size(); linker = li; numLinkers = linker.size(); @@ -237,7 +240,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } - TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); + TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); pDataArray->count = pDataArray->lineEnd; for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process @@ -277,7 +280,13 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ if(success > pDataArray->bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } - + + if(pDataArray->rbarcodes.size() != 0){ + success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex); + if(success > pDataArray->bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + if(pDataArray->numSpacers != 0){ success = trimOligos.stripSpacer(currSeq, currQual); if(success > pDataArray->sdiffs) { trashCode += 's'; } -- 2.39.2