5 * Created by Pat Schloss on 2/6/10.
6 * Copyright 2010 Patrick D. Schloss. All rights reserved.
10 #include "parsesffcommand.h"
11 #include "sequence.hpp"
13 //**********************************************************************************************************************
14 vector<string> ParseSFFCommand::getValidParameters(){
16 string Array[] = {"sff", "oligos", "minlength", "outputdir", "inputdir"};
17 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
21 m->errorOut(e, "ParseSFFCommand", "getValidParameters");
25 //**********************************************************************************************************************
26 ParseSFFCommand::ParseSFFCommand(){
29 //initialize outputTypes
30 vector<string> tempOutNames;
31 outputTypes["flow"] = tempOutNames;
34 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
38 //**********************************************************************************************************************
39 vector<string> ParseSFFCommand::getRequiredParameters(){
41 string Array[] = {"sff"};
42 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
46 m->errorOut(e, "ParseSFFCommand", "getRequiredParameters");
50 //**********************************************************************************************************************
51 vector<string> ParseSFFCommand::getRequiredFiles(){
53 vector<string> myArray;
57 m->errorOut(e, "ParseSFFCommand", "getRequiredFiles");
61 //**********************************************************************************************************************
63 ParseSFFCommand::ParseSFFCommand(string option){
67 if(option == "help") {
72 //valid paramters for this command
73 string Array[] = {"sff", "oligos", "minlength", "outputdir", "inputdir"};
74 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
76 OptionParser parser(option);
77 map<string,string> parameters = parser.getParameters();
79 ValidParameters validParameter;
80 map<string,string>::iterator it;
82 //check to make sure all parameters are valid for command
83 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
84 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
87 //initialize outputTypes
88 vector<string> tempOutNames;
89 outputTypes["flow"] = tempOutNames;
91 //if the user changes the input directory command factory will send this info to us in the output parameter
92 string inputDir = validParameter.validFile(parameters, "inputdir", false);
93 if (inputDir == "not found"){ inputDir = ""; }
96 it = parameters.find("sff");
97 //user has given a template file
98 if(it != parameters.end()){
99 path = m->hasPath(it->second);
100 //if the user has not given a path then, add inputdir. else leave path alone.
101 if (path == "") { parameters["sff"] = inputDir + it->second; }
104 it = parameters.find("oligos");
105 //user has given an oligos file
106 if(it != parameters.end()){
107 path = m->hasPath(it->second);
108 //if the user has not given a path then, add inputdir. else leave path alone.
109 if (path == "") { parameters["oligos"] = inputDir + it->second; }
114 //check for required parameters
115 sffFile = validParameter.validFile(parameters, "sff", true);
116 if (sffFile == "not found"){
117 m->mothurOut("sff is a required parameter for the parse.sff command.");
118 m->mothurOutEndLine();
121 else if (sffFile == "not open") { abort = true; }
123 //if the user changes the output directory command factory will send this info to us in the output parameter
124 outputDir = validParameter.validFile(parameters, "outputdir", false);
125 if (outputDir == "not found"){
127 outputDir += m->hasPath(sffFile); //if user entered a file with a path then preserve it
130 //check for optional parameter and set defaults
131 // ...at some point should added some additional type checking...
132 oligoFile = validParameter.validFile(parameters, "oligos", true);
133 if (oligoFile == "not found") { oligoFile = ""; }
134 else if(oligoFile == "not open"){ abort = true; }
136 string temp = validParameter.validFile(parameters, "minlength", false);
137 if (temp == "not found") { temp = "0"; }
138 convert(temp, minLength);
141 catch(exception& e) {
142 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
147 //**********************************************************************************************************************
149 ParseSFFCommand::~ParseSFFCommand() { /* do nothing */ }
151 //**********************************************************************************************************************
153 int ParseSFFCommand::execute(){
155 if (abort == true) { return 0; }
158 m->openInputFile(sffFile, inSFF);
160 cout.setf(ios::fixed, ios::floatfield);
161 cout.setf(ios::showpoint);
162 cout << setprecision(2);
164 vector<ofstream*> flowFileNames;
166 getOligos(flowFileNames);
169 flowFileNames.push_back(new ofstream((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow").c_str(), ios::ate));
170 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow")); outputTypes["flow"].push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow"));
173 for(int i=0;i<flowFileNames.size();i++){
174 flowFileNames[i]->setf(ios::fixed, ios::floatfield);
175 flowFileNames[i]->setf(ios::showpoint);
176 *flowFileNames[i] << setprecision(2);
179 if (m->control_pressed) { outputTypes.clear(); for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close(); } return 0; }
181 // ofstream fastaFile;
182 // m->openOutputFile(m->getRootName(sffFile) + "fasta", fastaFile);
184 // ofstream qualFile;
185 // m->openOutputFile(m->getRootName(sffFile) + "qual", qualFile);
187 string commonHeader = m->getline(inSFF);
188 string magicNumber = m->getline(inSFF);
189 string version = m->getline(inSFF);
190 string indexOffset = m->getline(inSFF);
191 string indexLength = m->getline(inSFF);
192 int numReads = parseHeaderLineToInt(inSFF);
193 string headerLength = m->getline(inSFF);
194 string keyLength = m->getline(inSFF);
195 int numFlows = parseHeaderLineToInt(inSFF);
196 string flowgramCode = m->getline(inSFF);
197 string flowChars = m->getline(inSFF);
198 string keySequence = m->getline(inSFF);
204 for(int i=0;i<numReads;i++){
206 if (m->control_pressed) { outputTypes.clear(); for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close(); } return 0; }
209 seqName = seqName.substr(1);
212 string runPrefix = parseHeaderLineToString(inSFF);
213 string regionNumber = parseHeaderLineToString(inSFF);
214 string xyLocation = parseHeaderLineToString(inSFF);
217 string runName = parseHeaderLineToString(inSFF);
218 string analysisName = parseHeaderLineToString(inSFF);
219 string fullPath = parseHeaderLineToString(inSFF);
222 string readHeaderLen = parseHeaderLineToString(inSFF);
223 string nameLength = parseHeaderLineToString(inSFF);
224 int numBases = parseHeaderLineToInt(inSFF);
225 string clipQualLeft = parseHeaderLineToString(inSFF);
226 int clipQualRight = parseHeaderLineToInt(inSFF);
227 string clipAdapLeft = parseHeaderLineToString(inSFF);
228 string clipAdapRight = parseHeaderLineToString(inSFF);
231 vector<float> flowVector = parseHeaderLineToFloatVector(inSFF, numFlows);
232 vector<int> flowIndices = parseHeaderLineToIntVector(inSFF, numBases);
233 string bases = parseHeaderLineToString(inSFF);
234 string qualityScores = parseHeaderLineToString(inSFF);
239 int flowLength = flowIndices[clipQualRight-1];
241 screenFlow(flowVector, flowLength);
242 string sequence = flow2seq(flowVector, flowLength);
246 if(minLength != 0 || numFPrimers != 0 || numBarcodes != 0 || numRPrimers != 0){
247 good = screenSeq(sequence, group);
251 *flowFileNames[group] << seqName << ' ' << flowLength;
252 for(int i=0;i<numFlows;i++){
253 *flowFileNames[group] << ' ' << flowVector[i];
255 *flowFileNames[group] << endl;
258 // string fastaHeader = '>' + seqName + "\tregion=" + regionNumber + " xy=" + xyLocation;
259 // fastaFile << fastaHeader << endl;
260 // fastaFile << stripSeqQual(bases, clipQualLeft, clipQualRight) << endl;
262 // qualFile << fastaHeader << endl;
263 // qualFile << stripQualQual(qualityScores, clipQualLeft, clipQualRight) << endl;
266 for(int i=0;i<flowFileNames.size();i++){
267 flowFileNames[i]->close();
270 m->mothurOutEndLine();
271 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
272 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
273 m->mothurOutEndLine();
275 // fastaFile.close();
280 catch(exception& e) {
281 m->errorOut(e, "ParseSFFCommand", "execute");
286 //**********************************************************************************************************************
288 void ParseSFFCommand::help(){
290 m->mothurOut("The parse.sff command...");
291 m->mothurOutEndLine();
293 catch(exception& e) {
294 m->errorOut(e, "ParseSFFCommand", "help");
299 //**********************************************************************************************************************
301 void ParseSFFCommand::getOligos(vector<ofstream*>& outSFFFlowVec){
305 m->openInputFile(oligoFile, inOligos);
307 string type, oligo, group;
311 while(!inOligos.eof()){
314 if(type[0] == '#'){ m->getline(inOligos); } // get rest of line if there's any crap there
318 for(int i=0;i<oligo.length();i++){
319 oligo[i] = toupper(oligo[i]);
320 if(oligo[i] == 'U') { oligo[i] = 'T'; }
322 if(type == "forward"){
323 forPrimer.push_back(oligo);
325 else if(type == "reverse"){
326 Sequence oligoRC("reverse", oligo);
327 oligoRC.reverseComplement();
328 revPrimer.push_back(oligoRC.getUnaligned());
330 else if(type == "barcode"){
332 barcodes[oligo]=index++;
333 groupVector.push_back(group);
335 outSFFFlowVec.push_back(new ofstream((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + ".flow").c_str(), ios::ate));
336 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + "flow"));
337 outputTypes["flow"].push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + "flow"));
345 numFPrimers = forPrimer.size();
346 numRPrimers = revPrimer.size();
347 numBarcodes = barcodes.size();
349 catch(exception& e) {
350 m->errorOut(e, "ParseSFFCommand", "getOligos");
356 //**********************************************************************************************************************
358 int ParseSFFCommand::parseHeaderLineToInt(ifstream& file){
362 while (!file.eof()) {
375 //**********************************************************************************************************************
377 string ParseSFFCommand::parseHeaderLineToString(ifstream& file){
381 while (!file.eof()) {
386 text = m->getline(file);
395 //**********************************************************************************************************************
397 vector<float> ParseSFFCommand::parseHeaderLineToFloatVector(ifstream& file, int length){
399 vector<float> floatVector(length);
401 while (!file.eof()) {
404 for(int i=0;i<length;i++){
405 file >> floatVector[i];
414 //**********************************************************************************************************************
416 vector<int> ParseSFFCommand::parseHeaderLineToIntVector(ifstream& file, int length){
418 vector<int> intVector(length);
420 while (!file.eof()) {
423 for(int i=0;i<length;i++){
424 file >> intVector[i];
433 //**********************************************************************************************************************
436 void ParseSFFCommand::screenFlow(vector<float> flowgram, int& length){
441 while(newLength * 4 < length){
445 for(int i=0;i<4;i++){
446 float flow = flowgram[i + 4 * newLength];
450 if(flow <= 0.69){ // not sure why, but if i make it <0.70 it doesn't work...
455 if(noise > 0 || signal == 0){
460 length = newLength * 4;
463 catch(exception& e) {
464 m->errorOut(e, "ParseSFFCommand", "screenFlow");
469 //**********************************************************************************************************************
471 string ParseSFFCommand::flow2seq(vector<float> flowgram, int length){
473 string flow = "TACG";
474 string sequence = "";
475 for(int i=8;i<length;i++){
476 int signal = int(flowgram[i] + 0.5);
477 char base = flow[ i % 4 ];
478 for(int j=0;j<signal;j++){
485 //**********************************************************************************************************************
487 bool ParseSFFCommand::screenSeq(string& sequence, int& group){
492 if(sequence.length() < minLength){ length = 0; }
495 int barcodeLength = 0;
497 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
498 if(compareDNASeq(it->first, sequence.substr(0,(it->first).length()))){
500 barcodeLength = (it->first).size();
510 for(int i=0;i<numFPrimers;i++){
511 if(compareDNASeq(forPrimer[i], sequence.substr(barcodeLength,forPrimer[i].length()))){
521 for(int i=0;i<numRPrimers;i++){
522 if(compareDNASeq(revPrimer[i], sequence.substr(sequence.length()-revPrimer[i].length(),revPrimer[i].length()))){
531 return fPrimer * rPrimer * length * barcode;
535 //**********************************************************************************************************************
537 bool ParseSFFCommand::compareDNASeq(string oligo, string seq){
540 int length = oligo.length();
542 for(int i=0;i<length;i++){
544 if(oligo[i] != seq[i]){
545 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
546 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
547 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
548 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
549 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
550 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
551 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
552 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
553 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
554 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
555 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
556 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
558 if(success == 0) { break; }
567 catch(exception& e) {
568 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
573 //**********************************************************************************************************************
575 //string ParseSFFCommand::stripSeqQual(string qScores, int start, int end){
578 // return qScores.substr(start-1, end-start+1);
582 //**********************************************************************************************************************
584 //string ParseSFFCommand::stripQualQual(string qScores, int start, int end){
588 // int startCount = 0;
589 // int startIndex = 0;
591 // while(startCount < start && startIndex < qScores.length()){
592 // if(isspace(qScores[startIndex])){
598 // int endCount = startCount;
599 // int endIndex = startIndex;
601 // while(endCount < end && endIndex < qScores.length()){
602 // if(isspace(qScores[endIndex])){
608 // return qScores.substr(startIndex, endIndex-startIndex-1);//, endCount-startCount);
612 //**********************************************************************************************************************