5 * Created by Pat Schloss on 2/6/10.
6 * Copyright 2010 Patrick D. Schloss. All rights reserved.
10 #include "parsesffcommand.h"
11 #include "sequence.hpp"
13 //**********************************************************************************************************************
14 vector<string> ParseSFFCommand::getValidParameters(){
16 string Array[] = {"sff", "oligos", "minlength", "outputdir", "inputdir"};
17 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
21 m->errorOut(e, "ParseSFFCommand", "getValidParameters");
25 //**********************************************************************************************************************
26 ParseSFFCommand::ParseSFFCommand(){
28 //initialize outputTypes
29 vector<string> tempOutNames;
30 outputTypes["flow"] = tempOutNames;
33 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
37 //**********************************************************************************************************************
38 vector<string> ParseSFFCommand::getRequiredParameters(){
40 string Array[] = {"sff"};
41 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
45 m->errorOut(e, "ParseSFFCommand", "getRequiredParameters");
49 //**********************************************************************************************************************
50 vector<string> ParseSFFCommand::getRequiredFiles(){
52 vector<string> myArray;
56 m->errorOut(e, "ParseSFFCommand", "getRequiredFiles");
60 //**********************************************************************************************************************
62 ParseSFFCommand::ParseSFFCommand(string option){
66 if(option == "help") {
71 //valid paramters for this command
72 string Array[] = {"sff", "oligos", "minlength", "outputdir", "inputdir"};
73 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
75 OptionParser parser(option);
76 map<string,string> parameters = parser.getParameters();
78 ValidParameters validParameter;
79 map<string,string>::iterator it;
81 //check to make sure all parameters are valid for command
82 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
83 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
86 //initialize outputTypes
87 vector<string> tempOutNames;
88 outputTypes["flow"] = tempOutNames;
90 //if the user changes the input directory command factory will send this info to us in the output parameter
91 string inputDir = validParameter.validFile(parameters, "inputdir", false);
92 if (inputDir == "not found"){ inputDir = ""; }
95 it = parameters.find("sff");
96 //user has given a template file
97 if(it != parameters.end()){
98 path = m->hasPath(it->second);
99 //if the user has not given a path then, add inputdir. else leave path alone.
100 if (path == "") { parameters["sff"] = inputDir + it->second; }
103 it = parameters.find("oligos");
104 //user has given an oligos file
105 if(it != parameters.end()){
106 path = m->hasPath(it->second);
107 //if the user has not given a path then, add inputdir. else leave path alone.
108 if (path == "") { parameters["oligos"] = inputDir + it->second; }
113 //check for required parameters
114 sffFile = validParameter.validFile(parameters, "sff", true);
115 if (sffFile == "not found"){
116 m->mothurOut("sff is a required parameter for the parse.sff command.");
117 m->mothurOutEndLine();
120 else if (sffFile == "not open") { abort = true; }
122 //if the user changes the output directory command factory will send this info to us in the output parameter
123 outputDir = validParameter.validFile(parameters, "outputdir", false);
124 if (outputDir == "not found"){
126 outputDir += m->hasPath(sffFile); //if user entered a file with a path then preserve it
129 //check for optional parameter and set defaults
130 // ...at some point should added some additional type checking...
131 oligoFile = validParameter.validFile(parameters, "oligos", true);
132 if (oligoFile == "not found") { oligoFile = ""; }
133 else if(oligoFile == "not open"){ abort = true; }
135 string temp = validParameter.validFile(parameters, "minlength", false);
136 if (temp == "not found") { temp = "0"; }
137 convert(temp, minLength);
140 catch(exception& e) {
141 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
146 //**********************************************************************************************************************
148 ParseSFFCommand::~ParseSFFCommand() { /* do nothing */ }
150 //**********************************************************************************************************************
152 int ParseSFFCommand::execute(){
154 if (abort == true) { return 0; }
157 m->openInputFile(sffFile, inSFF);
159 cout.setf(ios::fixed, ios::floatfield);
160 cout.setf(ios::showpoint);
161 cout << setprecision(2);
163 vector<ofstream*> flowFileNames;
165 getOligos(flowFileNames);
168 flowFileNames.push_back(new ofstream((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow").c_str(), ios::ate));
169 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow")); outputTypes["flow"].push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow"));
172 for(int i=0;i<flowFileNames.size();i++){
173 flowFileNames[i]->setf(ios::fixed, ios::floatfield);
174 flowFileNames[i]->setf(ios::showpoint);
175 *flowFileNames[i] << setprecision(2);
178 if (m->control_pressed) { outputTypes.clear(); for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close(); } return 0; }
180 // ofstream fastaFile;
181 // m->openOutputFile(m->getRootName(sffFile) + "fasta", fastaFile);
183 // ofstream qualFile;
184 // m->openOutputFile(m->getRootName(sffFile) + "qual", qualFile);
186 string commonHeader = m->getline(inSFF);
187 string magicNumber = m->getline(inSFF);
188 string version = m->getline(inSFF);
189 string indexOffset = m->getline(inSFF);
190 string indexLength = m->getline(inSFF);
191 int numReads = parseHeaderLineToInt(inSFF);
192 string headerLength = m->getline(inSFF);
193 string keyLength = m->getline(inSFF);
194 int numFlows = parseHeaderLineToInt(inSFF);
195 string flowgramCode = m->getline(inSFF);
196 string flowChars = m->getline(inSFF);
197 string keySequence = m->getline(inSFF);
203 for(int i=0;i<numReads;i++){
205 if (m->control_pressed) { outputTypes.clear(); for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close(); } return 0; }
208 seqName = seqName.substr(1);
211 string runPrefix = parseHeaderLineToString(inSFF);
212 string regionNumber = parseHeaderLineToString(inSFF);
213 string xyLocation = parseHeaderLineToString(inSFF);
216 string runName = parseHeaderLineToString(inSFF);
217 string analysisName = parseHeaderLineToString(inSFF);
218 string fullPath = parseHeaderLineToString(inSFF);
221 string readHeaderLen = parseHeaderLineToString(inSFF);
222 string nameLength = parseHeaderLineToString(inSFF);
223 int numBases = parseHeaderLineToInt(inSFF);
224 string clipQualLeft = parseHeaderLineToString(inSFF);
225 int clipQualRight = parseHeaderLineToInt(inSFF);
226 string clipAdapLeft = parseHeaderLineToString(inSFF);
227 string clipAdapRight = parseHeaderLineToString(inSFF);
230 vector<float> flowVector = parseHeaderLineToFloatVector(inSFF, numFlows);
231 vector<int> flowIndices = parseHeaderLineToIntVector(inSFF, numBases);
232 string bases = parseHeaderLineToString(inSFF);
233 string qualityScores = parseHeaderLineToString(inSFF);
238 int flowLength = flowIndices[clipQualRight-1];
240 screenFlow(flowVector, flowLength);
241 string sequence = flow2seq(flowVector, flowLength);
245 if(minLength != 0 || numFPrimers != 0 || numBarcodes != 0 || numRPrimers != 0){
246 good = screenSeq(sequence, group);
250 *flowFileNames[group] << seqName << ' ' << flowLength;
251 for(int i=0;i<numFlows;i++){
252 *flowFileNames[group] << ' ' << flowVector[i];
254 *flowFileNames[group] << endl;
257 // string fastaHeader = '>' + seqName + "\tregion=" + regionNumber + " xy=" + xyLocation;
258 // fastaFile << fastaHeader << endl;
259 // fastaFile << stripSeqQual(bases, clipQualLeft, clipQualRight) << endl;
261 // qualFile << fastaHeader << endl;
262 // qualFile << stripQualQual(qualityScores, clipQualLeft, clipQualRight) << endl;
265 for(int i=0;i<flowFileNames.size();i++){
266 flowFileNames[i]->close();
269 m->mothurOutEndLine();
270 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
271 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
272 m->mothurOutEndLine();
274 // fastaFile.close();
279 catch(exception& e) {
280 m->errorOut(e, "ParseSFFCommand", "execute");
285 //**********************************************************************************************************************
287 void ParseSFFCommand::help(){
289 m->mothurOut("The parse.sff command...");
290 m->mothurOutEndLine();
292 catch(exception& e) {
293 m->errorOut(e, "ParseSFFCommand", "help");
298 //**********************************************************************************************************************
300 void ParseSFFCommand::getOligos(vector<ofstream*>& outSFFFlowVec){
304 m->openInputFile(oligoFile, inOligos);
306 string type, oligo, group;
310 while(!inOligos.eof()){
313 if(type[0] == '#'){ m->getline(inOligos); } // get rest of line if there's any crap there
317 for(int i=0;i<oligo.length();i++){
318 oligo[i] = toupper(oligo[i]);
319 if(oligo[i] == 'U') { oligo[i] = 'T'; }
321 if(type == "forward"){
322 forPrimer.push_back(oligo);
324 else if(type == "reverse"){
325 Sequence oligoRC("reverse", oligo);
326 oligoRC.reverseComplement();
327 revPrimer.push_back(oligoRC.getUnaligned());
329 else if(type == "barcode"){
331 barcodes[oligo]=index++;
332 groupVector.push_back(group);
334 outSFFFlowVec.push_back(new ofstream((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + ".flow").c_str(), ios::ate));
335 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + "flow"));
336 outputTypes["flow"].push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + "flow"));
344 numFPrimers = forPrimer.size();
345 numRPrimers = revPrimer.size();
346 numBarcodes = barcodes.size();
348 catch(exception& e) {
349 m->errorOut(e, "ParseSFFCommand", "getOligos");
355 //**********************************************************************************************************************
357 int ParseSFFCommand::parseHeaderLineToInt(ifstream& file){
361 while (!file.eof()) {
374 //**********************************************************************************************************************
376 string ParseSFFCommand::parseHeaderLineToString(ifstream& file){
380 while (!file.eof()) {
385 text = m->getline(file);
394 //**********************************************************************************************************************
396 vector<float> ParseSFFCommand::parseHeaderLineToFloatVector(ifstream& file, int length){
398 vector<float> floatVector(length);
400 while (!file.eof()) {
403 for(int i=0;i<length;i++){
404 file >> floatVector[i];
413 //**********************************************************************************************************************
415 vector<int> ParseSFFCommand::parseHeaderLineToIntVector(ifstream& file, int length){
417 vector<int> intVector(length);
419 while (!file.eof()) {
422 for(int i=0;i<length;i++){
423 file >> intVector[i];
432 //**********************************************************************************************************************
435 void ParseSFFCommand::screenFlow(vector<float> flowgram, int& length){
440 while(newLength * 4 < length){
444 for(int i=0;i<4;i++){
445 float flow = flowgram[i + 4 * newLength];
449 if(flow <= 0.69){ // not sure why, but if i make it <0.70 it doesn't work...
454 if(noise > 0 || signal == 0){
459 length = newLength * 4;
462 catch(exception& e) {
463 m->errorOut(e, "ParseSFFCommand", "screenFlow");
468 //**********************************************************************************************************************
470 string ParseSFFCommand::flow2seq(vector<float> flowgram, int length){
472 string flow = "TACG";
473 string sequence = "";
474 for(int i=8;i<length;i++){
475 int signal = int(flowgram[i] + 0.5);
476 char base = flow[ i % 4 ];
477 for(int j=0;j<signal;j++){
484 //**********************************************************************************************************************
486 bool ParseSFFCommand::screenSeq(string& sequence, int& group){
491 if(sequence.length() < minLength){ length = 0; }
494 int barcodeLength = 0;
496 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
497 if(compareDNASeq(it->first, sequence.substr(0,(it->first).length()))){
499 barcodeLength = (it->first).size();
509 for(int i=0;i<numFPrimers;i++){
510 if(compareDNASeq(forPrimer[i], sequence.substr(barcodeLength,forPrimer[i].length()))){
520 for(int i=0;i<numRPrimers;i++){
521 if(compareDNASeq(revPrimer[i], sequence.substr(sequence.length()-revPrimer[i].length(),revPrimer[i].length()))){
530 return fPrimer * rPrimer * length * barcode;
534 //**********************************************************************************************************************
536 bool ParseSFFCommand::compareDNASeq(string oligo, string seq){
539 int length = oligo.length();
541 for(int i=0;i<length;i++){
543 if(oligo[i] != seq[i]){
544 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
545 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
546 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
547 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
548 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
549 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
550 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
551 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
552 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
553 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
554 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
555 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
557 if(success == 0) { break; }
566 catch(exception& e) {
567 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
572 //**********************************************************************************************************************
574 //string ParseSFFCommand::stripSeqQual(string qScores, int start, int end){
577 // return qScores.substr(start-1, end-start+1);
581 //**********************************************************************************************************************
583 //string ParseSFFCommand::stripQualQual(string qScores, int start, int end){
587 // int startCount = 0;
588 // int startIndex = 0;
590 // while(startCount < start && startIndex < qScores.length()){
591 // if(isspace(qScores[startIndex])){
597 // int endCount = startCount;
598 // int endIndex = startIndex;
600 // while(endCount < end && endIndex < qScores.length()){
601 // if(isspace(qScores[endIndex])){
607 // return qScores.substr(startIndex, endIndex-startIndex-1);//, endCount-startCount);
611 //**********************************************************************************************************************