From d9c3ceb4147b827719046c7d742df14288a07722 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Wed, 22 Feb 2012 09:53:19 -0500 Subject: [PATCH] trim.seqs linker and spacer mods --- trimoligos.cpp | 334 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 321 insertions(+), 13 deletions(-) diff --git a/trimoligos.cpp b/trimoligos.cpp index 245cfdc..cf81870 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -154,7 +154,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ else{ //use the best match group = minGroup; seq.setUnaligned(rawSequence.substr(minPos)); - + if(qual.getName() != ""){ qual.trimQScores(minPos, -1); } @@ -586,15 +586,95 @@ bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){ break; } - if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){ - seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length())); + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + seq.setUnaligned(rawSequence.substr(oligo.length())); if(qual.getName() != ""){ - qual.trimQScores(-1, rawSequence.length()-oligo.length()); + qual.trimQScores(oligo.length(), -1); } success = 1; break; } - } + } + + //if you found the linker or if you don't want to allow for diffs + if ((ldiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (linker.size() > 0) { + for(int i = 0; i < linker.size(); i++){ + if(linker[i].length() > maxLength){ + maxLength = linker[i].length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < linker.size(); i++){ + string oligo = linker[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = ldiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i ldiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; } @@ -618,13 +698,87 @@ bool TrimOligos::stripLinker(Sequence& seq){ break; } - if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){ - seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length())); + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + seq.setUnaligned(rawSequence.substr(oligo.length())); success = 1; break; } } + //if you found the linker or if you don't want to allow for diffs + if ((ldiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (linker.size() > 0) { + for(int i = 0; i < linker.size(); i++){ + if(linker[i].length() > maxLength){ + maxLength = linker[i].length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < linker.size(); i++){ + string oligo = linker[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = ldiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i ldiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + return success; } @@ -648,15 +802,95 @@ bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){ break; } - if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){ - seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length())); + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + seq.setUnaligned(rawSequence.substr(oligo.length())); if(qual.getName() != ""){ - qual.trimQScores(-1, rawSequence.length()-oligo.length()); + qual.trimQScores(oligo.length(), -1); } success = 1; break; } - } + } + + //if you found the spacer or if you don't want to allow for diffs + if ((sdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (spacer.size() > 0) { + for(int i = 0; i < spacer.size(); i++){ + if(spacer[i].length() > maxLength){ + maxLength = spacer[i].length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < spacer.size(); i++){ + string oligo = spacer[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = sdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i sdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; } @@ -680,13 +914,87 @@ bool TrimOligos::stripSpacer(Sequence& seq){ break; } - if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){ - seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length())); + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + seq.setUnaligned(rawSequence.substr(oligo.length())); success = 1; break; } } + //if you found the spacer or if you don't want to allow for diffs + if ((sdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (spacer.size() > 0) { + for(int i = 0; i < spacer.size(); i++){ + if(spacer[i].length() > maxLength){ + maxLength = spacer[i].length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < spacer.size(); i++){ + string oligo = spacer[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = sdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i sdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + return success; } -- 2.39.2