#include "trimseqscommand.h"
#include "needlemanoverlap.hpp"
+#include "trimoligos.h"
//**********************************************************************************************************************
vector<string> TrimSeqsCommand::setParameters(){
}
}
- vector<unsigned long int> fastaFilePos;
- vector<unsigned long int> qFilePos;
+ vector<unsigned long long> fastaFilePos;
+ vector<unsigned long long> qFilePos;
setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
int count = 0;
bool moreSeqs = 1;
+ TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
while (moreSeqs) {
int primerIndex = 0;
if(barcodes.size() != 0){
- success = stripBarcode(currSeq, currQual, barcodeIndex);
+ success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
if(numFPrimers != 0){
- success = stripForward(currSeq, currQual, primerIndex);
+ success = trimOligos.stripForward(currSeq, currQual, primerIndex);
if(success > pdiffs) { trashCode += 'f'; }
else{ currentSeqsDiffs += success; }
}
if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
if(numRPrimers != 0){
- success = stripReverse(currSeq, currQual);
+ success = trimOligos.stripReverse(currSeq, currQual);
if(!success) { trashCode += 'r'; }
}
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- unsigned long int pos = inFASTA.tellg();
+ unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= line->end)) { break; }
#else
/**************************************************************************************************/
-int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
+int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
//set file positions for fasta file
map<string, int>::iterator it = firstSeqNames.find(sname);
if(it != firstSeqNames.end()) { //this is the start of a new chunk
- unsigned long int pos = inQual.tellg();
+ unsigned long long pos = inQual.tellg();
qfileFilePos.push_back(pos - input.length() - 1);
firstSeqNames.erase(it);
}
//get last file position of qfile
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (qfilename.c_str(),"rb");
fastaFilePos.push_back(0); qfileFilePos.push_back(0);
//get last file position of fastafile
FILE * pFile;
- unsigned long int size;
+ unsigned long long size;
//get num bytes in file
pFile = fopen (filename.c_str(),"rb");
exit(1);
}
}
-
-//***************************************************************************************************************
-// Strips a leading barcode from seq (and from qual when qual has a non-empty name); on a match, group is set to the int mapped to that barcode.
-int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
- try {
- // Returns 0 for an exact match, the diff count (at most bdiffs) for an accepted aligned match, otherwise a value greater than bdiffs.
- string rawSequence = seq.getUnaligned();
- int success = bdiffs + 1; //guilty until proven innocent
-
- //pass 1: look for a barcode that matches the start of the sequence without any alignment
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
- break;
- }
- //compareDNASeq lets ambiguity codes in the barcode stand for the bases they permit
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length())); //keep only what follows the barcode
-
- if(qual.getName() != ""){ //qual is only trimmed when its name is non-empty
- qual.trimQScores(oligo.length(), -1); //presumably drops the same leading positions from the scores - confirm against QualityScores
- }
-
- success = 0;
- break;
- }
- }
-
- //if you found the barcode or if you don't want to allow for diffs
- if ((bdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0; //length of the longest barcode
-
- Alignment* alignment;
- if (barcodes.size() > 0) {
- map<string,int>::iterator it=barcodes.begin();
-
- for(it;it!=barcodes.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); //last argument is derived from the longest barcode plus the allowed diffs; deleted at the end of this branch
-
- }else{ alignment = NULL; }
-
- //pass 2: align every barcode to the start of the sequence and keep the one with the fewest diffs
- int minDiff = 1e6; //fewest diffs seen so far; 1e6 stands for "nothing aligned yet"
- int minCount = 1; //number of barcodes that share minDiff
- int minGroup = -1;
- int minPos = 0; //number of sequence characters covered by the best barcode alignment
-
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
-// int length = oligo.length();
-
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10; //NOTE(review): replaced after the loop by minDiff whenever minDiff > bdiffs
- break;
- }
-
- //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
-
- int alnLength = oligo.length();
- //find the last non-gap column of the aligned barcode and cut both aligned strings there
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
- oligo = oligo.substr(0,alnLength);
- temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=0;i<alnLength;i++){ //count the non-gap sequence characters inside the aligned span
- if(temp[i] != '-'){
- minPos++;
- }
- }
- }
- else if(numDiff == minDiff){ //tie with the current best
- minCount++;
- }
-
- }
-
- if(minDiff > bdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
-
- if(qual.getName() != ""){
- qual.trimQScores(minPos, -1);
- }
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
- exit(1);
- }
-
-}
-
-//***************************************************************************************************************
-// Strips a leading forward primer from seq (and from qual when qual has a non-empty name); on a match, group is set to the int mapped to that primer.
-int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
- try {
- string rawSequence = seq.getUnaligned();
- int success = pdiffs + 1; //guilty until proven innocent
- // Returns 0 for an exact match, the diff count (at most pdiffs) for an accepted aligned match, otherwise a value greater than pdiffs.
- //can you find the primer
- for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
- success = pdiffs + 10; //if the sequence is shorter than the primer then bail out
- break;
- }
- //compareDNASeq lets ambiguity codes in the primer stand for the bases they permit
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length())); //keep only what follows the primer
- if(qual.getName() != ""){ //qual is only trimmed when its name is non-empty
- qual.trimQScores(oligo.length(), -1); //presumably drops the same leading positions from the scores - confirm against QualityScores
- }
- success = 0;
- break;
- }
- }
-
- //if you found the primer or if you don't want to allow for diffs
- if ((pdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0; //length of the longest primer
-
- Alignment* alignment;
- if (primers.size() > 0) {
- map<string,int>::iterator it=primers.begin();
-
- for(it;it!=primers.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); //last argument is derived from the longest primer plus the allowed diffs; deleted at the end of this branch
-
- }else{ alignment = NULL; }
-
- //align every primer to the start of the sequence and keep the one with the fewest diffs
- int minDiff = 1e6; //fewest diffs seen so far; 1e6 stands for "nothing aligned yet"
- int minCount = 1; //number of primers that share minDiff
- int minGroup = -1;
- int minPos = 0; //number of sequence characters covered by the best primer alignment
-
- for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
- string oligo = it->first;
-// int length = oligo.length();
-
- if(rawSequence.length() < maxLength){
- success = pdiffs + 100; //NOTE(review): replaced after the loop by minDiff whenever minDiff > pdiffs
- break;
- }
-
- //use needleman to align first primer.length()+numdiffs of sequence to each primer
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
-
- int alnLength = oligo.length();
- //find the last non-gap column of the aligned primer and cut both aligned strings there
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
- oligo = oligo.substr(0,alnLength);
- temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=0;i<alnLength;i++){ //count the non-gap sequence characters inside the aligned span
- if(temp[i] != '-'){
- minPos++;
- }
- }
- }
- else if(numDiff == minDiff){ //tie with the current best
- minCount++;
- }
-
- }
-
- if(minDiff > pdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
- if(qual.getName() != ""){
- qual.trimQScores(minPos, -1);
- }
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripForward");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-// Removes a reverse primer from the end of seq (and trims qual when qual has a non-empty name); returns 1 if one of the revPrimer entries matched, 0 otherwise.
-bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
- try {
- string rawSequence = seq.getUnaligned();
- bool success = 0; //guilty until proven innocent
- //try each reverse primer in order and stop at the first one that matches; no alignment or diffs are used here
- for(int i=0;i<numRPrimers;i++){
- string oligo = revPrimer[i];
-
- if(rawSequence.length() < oligo.length()){ //sequence too short to hold this primer: give up on the remaining primers as well
- success = 0;
- break;
- }
- //compare the primer against the last oligo.length() characters of the sequence
- if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
- seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length())); //keep only what precedes the primer
- if(qual.getName() != ""){ //qual is only trimmed when its name is non-empty
- qual.trimQScores(-1, rawSequence.length()-oligo.length()); //presumably keeps only the scores before the primer - confirm against QualityScores
- }
- success = 1;
- break;
- }
- }
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "stripReverse");
- exit(1);
- }
-}
-
//***************************************************************************************************************
bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
}
}
-
-//***************************************************************************************************************
-// Returns 1 when, at every index of oligo, seq holds the same character or one that oligo's ambiguity code permits; returns 0 at the first position that fails.
-bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
- try {
- bool success = 1;
- int length = oligo.length();
- //seq is read at every index below oligo.length(); no length check is done here
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; } //plain base: any mismatch fails
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; } //I fails against N; otherwise N and I accept any character
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
- //an oligo character not listed above sets nothing, so it is treated as a match
- if(success == 0) { break; }
- }
- else{ //characters are identical
- success = 1;
- }
- }
-
- return success;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
- exit(1);
- }
-
-}
-
-//***************************************************************************************************************
-// Counts the positions at which oligo and seq disagree, letting ambiguity codes in oligo stand for the bases they permit.
-int TrimSeqsCommand::countDiffs(string oligo, string seq){
- try {
- // Returns the number of differing positions; seq is read at every index below oligo.length() with no length check.
- int length = oligo.length();
- int countDiffs = 0;
-
- for(int i=0;i<length;i++){
-
- if(oligo[i] != seq[i]){
- if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; } //plain base or gap character in oligo: any mismatch counts
- else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; } //I against N counts; otherwise N and I never add a difference
- else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
- else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
- else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
- else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
- else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
- else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
- }
- //an oligo character not listed above is never counted as a difference
- }
-
- return countDiffs;
- }
- catch(exception& e) {
- m->errorOut(e, "TrimSeqsCommand", "countDiffs");
- exit(1);
- }
-
-}
-
//***************************************************************************************************************