From 6e63c5ff52bd2830b689417df8ba3db831e63a96 Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 6 Jul 2009 12:41:02 +0000 Subject: [PATCH] fixed bug in sequencedb read for windows users and substr error in trim.seqs --- chimeraseqscommand.cpp | 87 ++++++++- chimeraseqscommand.h | 9 +- sequencedb.cpp | 18 +- trimseqscommand.cpp | 426 +++++++++++++++++++++++------------------ 4 files changed, 331 insertions(+), 209 deletions(-) diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index b810b9f..182c25e 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -107,9 +107,54 @@ int ChimeraSeqsCommand::execute(){ //find average midpoint of seqs midpoint = findAverageMidPoint(); + + //create 2 vectors of sequences, 1 for left side and one for right side + vector left; vector right; + + for (int i = 0; i < seqs.size(); i++) { + //save left side + string seqLeft = seqs[i].getAligned(); + seqLeft = seqLeft.substr(0, midpoint); + Sequence tempLeft(seqs[i].getName(), seqLeft); + left.push_back(tempLeft); + + //save right side + string seqRight = seqs[i].getAligned(); + seqRight = seqRight.substr(midpoint+1, (seqRight.length()-midpoint-1)); + Sequence tempRight(seqs[i].getName(), seqRight); + right.push_back(tempRight); + } //this should be parallelized - //generatePreferences(); + //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) + //create a matrix containing the distance from left to left and right to right + //calculate distances + SparseMatrix* SparseLeft = new SparseMatrix(); + SparseMatrix* SparseRight = new SparseMatrix(); + + createSparseMatrix(0, left.size(), SparseLeft, left); + createSparseMatrix(0, right.size(), SparseRight, right); + + + //vector distMapRight; + //vector distMapLeft; + + // Create a data structure to quickly access the distance information. + // It consists of a vector of distance maps, where each map contains + // all distances of a certain sequence. Vector and maps are accessed + // via the index of a sequence in the distance matrix + //distMapRight = vector(globaldata->gListVector->size()); + //distMapLeft = vector(globaldata->gListVector->size()); + for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { + //distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; + } + for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { + //distMapRight[currentCell->row][currentCell->column] = currentCell->dist; + } + + + //fill preference structure + //generatePreferences(distMapLeft, distMapRight); //output results to screen @@ -203,20 +248,22 @@ int ChimeraSeqsCommand::findAverageMidPoint(){ } } -/*************************************************************************************************************** -int ChimeraSeqsCommand::createSparseMatrix(int startLine, int endLine, SparseMatrix* sparse){ +/***************************************************************************************************************/ +int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector s){ try { - for(int i=startLine; icalcDist(seqs.get(i), seqs.get(j)); - double dist = distCalculator->getDist(); - + //distCalculator->calcDist(s.get(i), s.get(j)); + float dist = distCalculator->getDist(); + PCell temp(i, j, dist); + sparse->addCell(temp); } + } return 1; @@ -226,5 +273,31 @@ int ChimeraSeqsCommand::createSparseMatrix(int startLine, int endLine, SparseMat exit(1); } } +/*************************************************************************************************************** +void ChimeraSeqsCommand::generatePreferences(vector left, vector right){ + try { + + for (int i = 0; i < left.size(); i++) { + + int iscore = 0; + float closestLeft = 100000.0; + float closestRight = 100000.0; + + for (int j = 0; j < left.size(); j++) { + + //iscore += abs(left + + } + + } + + } + catch(exception& e) { + errorOut(e, "ChimeraSeqsCommand", "generatePreferences"); + exit(1); + } +} +/**************************************************************************************************/ + /**************************************************************************************************/ diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 455ac92..5b55120 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -14,8 +14,11 @@ #include "command.hpp" #include "filterseqscommand.h" #include "sequence.hpp" +#include "sparsematrix.hpp" +#include "dist.h" - +typedef list::iterator MatData; +typedef map SeqMap; //maps sequence to all distance for that seqeunce /***********************************************************/ @@ -36,7 +39,7 @@ private: }; - + Dist* distCalculator; bool abort; string method, fastafile; bool filter, correction; @@ -47,6 +50,8 @@ private: int findAverageMidPoint(); void readSeqs(); + void generatePreferences(SparseMatrix*, SparseMatrix*); + int createSparseMatrix(int, int, SparseMatrix*, vector); }; diff --git a/sequencedb.cpp b/sequencedb.cpp index ca201d5..32fc146 100644 --- a/sequencedb.cpp +++ b/sequencedb.cpp @@ -30,23 +30,11 @@ SequenceDB::SequenceDB(int newSize) { SequenceDB::SequenceDB(ifstream& filehandle) { try{ - string name, sequence, line; - sequence = ""; - int c; - string temp; - - + //read through file - while ((c = filehandle.get()) != EOF) { - name = ""; sequence = ""; - //is this a name - if (c == '>') { - name = readName(filehandle); - sequence = readSequence(filehandle); - }else { mothurOut("Error fasta in your file. Please correct."); mothurOutEndLine(); } - + while (!filehandle.eof()) { //input sequence info into sequencedb - Sequence newSequence(name, sequence); + Sequence newSequence(filehandle); data.push_back(newSequence); //takes care of white space diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index ef0c066..937b130 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -256,263 +256,319 @@ int TrimSeqsCommand::execute(){ //*************************************************************************************************************** void TrimSeqsCommand::getOligos(vector& outFASTAVec){ - - ifstream inOligos; - openInputFile(oligoFile, inOligos); - - ofstream test; - - string type, oligo, group; - int index=0; - - while(!inOligos.eof()){ - inOligos >> type; - - if(type[0] == '#'){ - while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there - } - else{ - inOligos >> oligo; - - for(int i=0;i> type; - if(type == "forward"){ - forPrimer.push_back(oligo); - } - else if(type == "reverse"){ - revPrimer.push_back(oligo); + if(type[0] == '#'){ + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there } - else if(type == "barcode"){ - inOligos >> group; - barcodes[oligo]=index++; - groupVector.push_back(group); + else{ + inOligos >> oligo; + + for(int i=0;i> group; + barcodes[oligo]=index++; + groupVector.push_back(group); - if(allFiles){ - outFASTAVec.push_back(new ofstream((getRootName(fastaFile) + group + ".fasta").c_str(), ios::ate)); + if(allFiles){ + outFASTAVec.push_back(new ofstream((getRootName(fastaFile) + group + ".fasta").c_str(), ios::ate)); + } } } } + + inOligos.close(); + + numFPrimers = forPrimer.size(); + numRPrimers = revPrimer.size(); + } - - inOligos.close(); - - numFPrimers = forPrimer.size(); - numRPrimers = revPrimer.size(); + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "getOligos"); + exit(1); + } + } //*************************************************************************************************************** bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = 0; - break; - } + try { + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 1; - break; + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = 0; + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + success = 1; + break; + } } + return success; + } - return success; - + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "stripBarcode"); + exit(1); + } + } //*************************************************************************************************************** bool TrimSeqsCommand::stripForward(Sequence& seq){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;i= minLength && maxLength == 0) { success = 1; } - else if(length >= minLength && length <= maxLength) { success = 1; } - else { success = 0; } + int length = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + + if(length >= minLength && maxLength == 0) { success = 1; } + else if(length >= minLength && length <= maxLength) { success = 1; } + else { success = 0; } + + return success; - return success; + } + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "cullLength"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullHomoP(Sequence& seq){ - - int longHomoP = seq.getLongHomoPolymer(); - bool success = 0; //guilty until proven innocent - - if(longHomoP <= maxHomoP){ success = 1; } - else { success = 0; } - - return success; + try { + int longHomoP = seq.getLongHomoPolymer(); + bool success = 0; //guilty until proven innocent + + if(longHomoP <= maxHomoP){ success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "cullHomoP"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullAmbigs(Sequence& seq){ - - int numNs = seq.getAmbigBases(); - bool success = 0; //guilty until proven innocent - - if(numNs <= maxAmbig) { success = 1; } - else { success = 0; } - - return success; + try { + int numNs = seq.getAmbigBases(); + bool success = 0; //guilty until proven innocent + + if(numNs <= maxAmbig) { success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "cullAmbigs"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ - - bool success = 1; - int length = oligo.length(); - - for(int i=0;i> name; - if(name.substr(1) != seq.getName()) { mothurOut("sequence name mismatch btwn fasta and qual file"); mothurOutEndLine(); } - while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } - - int score; - int end = seqLength; - - for(int i=0;i> score; - - if(score <= qThreshold){ - end = i; - break; + try { + string rawSequence = seq.getUnaligned(); + int seqLength = rawSequence.length(); + string name; + + qFile >> name; + if (name.length() != 0) { if(name.substr(1) != seq.getName()) { mothurOut("sequence name mismatch btwn fasta and qual file"); mothurOutEndLine(); } } + while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } + + int score; + int end = seqLength; + + for(int i=0;i> score; + + if(score <= qThreshold){ + end = i; + break; + } + } + for(int i=end+1;i> score; } + + seq.setUnaligned(rawSequence.substr(0,end)); + + return 1; } - for(int i=end+1;i> score; + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); + exit(1); } - - seq.setUnaligned(rawSequence.substr(0,end)); - - return 1; } //*************************************************************************************************************** bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ - - string rawSequence = seq.getUnaligned(); - int seqLength = seq.getNumBases(); - bool success = 0; //guilty until proven innocent - string name; - - qFile >> name; - if(name.substr(1) != seq.getName()) { mothurOut("sequence name mismatch btwn fasta and qual file"); mothurOutEndLine(); } - while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } - - float score; - float average = 0; - - for(int i=0;i> score; - average += score; + try { + string rawSequence = seq.getUnaligned(); + int seqLength = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + string name; + + qFile >> name; + if (name.length() != 0) { if(name.substr(1) != seq.getName()) { mothurOut("sequence name mismatch btwn fasta and qual file"); mothurOutEndLine(); } } + while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } + + float score; + float average = 0; + + for(int i=0;i> score; + average += score; + } + average /= seqLength; + + if(average >= qAverage) { success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "cullQualAverage"); + exit(1); } - average /= seqLength; - - if(average >= qAverage) { success = 1; } - else { success = 0; } - - return success; } //*************************************************************************************************************** -- 2.39.5