]> git.donarmstrong.com Git - mothur.git/blob - parsesffcommand.cpp
c636ec99221d874ec6b497e2ae3965fd353c8dc0
[mothur.git] / parsesffcommand.cpp
1 /*
2  *  parsesffcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 2/6/10.
6  *  Copyright 2010 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "parsesffcommand.h"
11 #include "sequence.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> ParseSFFCommand::getValidParameters(){   
15         try {
16                 string Array[] =  {"sff", "oligos", "minlength", "outputdir", "inputdir"};
17                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
18                 return myArray;
19         }
20         catch(exception& e) {
21                 m->errorOut(e, "ParseSFFCommand", "getValidParameters");
22                 exit(1);
23         }
24 }
25 //**********************************************************************************************************************
26 ParseSFFCommand::ParseSFFCommand(){     
27         try {
28                 //initialize outputTypes
29                 vector<string> tempOutNames;
30                 outputTypes["flow"] = tempOutNames;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 vector<string> ParseSFFCommand::getRequiredParameters(){        
39         try {
40                 string Array[] =  {"sff"};
41                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
42                 return myArray;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ParseSFFCommand", "getRequiredParameters");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50 vector<string> ParseSFFCommand::getRequiredFiles(){     
51         try {
52                 vector<string> myArray;
53                 return myArray;
54         }
55         catch(exception& e) {
56                 m->errorOut(e, "ParseSFFCommand", "getRequiredFiles");
57                 exit(1);
58         }
59 }
60 //**********************************************************************************************************************
61
62 ParseSFFCommand::ParseSFFCommand(string option){
63         try {
64                 abort = false;
65                 
66                 if(option == "help") {
67                         help();
68                         abort = true; 
69                 }
70                 else {
71                         //valid paramters for this command
72                         string Array[] =  {"sff", "oligos", "minlength", "outputdir", "inputdir"};
73                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
74                         
75                         OptionParser parser(option);
76                         map<string,string> parameters = parser.getParameters();
77                         
78                         ValidParameters validParameter;
79                         map<string,string>::iterator it;
80
81                         //check to make sure all parameters are valid for command
82                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
83                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
84                         }
85                         
86                         //initialize outputTypes
87                         vector<string> tempOutNames;
88                         outputTypes["flow"] = tempOutNames;
89                         
90                         //if the user changes the input directory command factory will send this info to us in the output parameter 
91                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
92                         if (inputDir == "not found"){   inputDir = "";          }
93                         else {
94                                 string path;
95                                 it = parameters.find("sff");
96                                 //user has given a template file
97                                 if(it != parameters.end()){ 
98                                         path = m->hasPath(it->second);
99                                         //if the user has not given a path then, add inputdir. else leave path alone.
100                                         if (path == "") {       parameters["sff"] = inputDir + it->second;              }
101                                 }
102                                 
103                                 it = parameters.find("oligos");
104                                 //user has given an oligos file
105                                 if(it != parameters.end()){ 
106                                         path = m->hasPath(it->second);
107                                         //if the user has not given a path then, add inputdir. else leave path alone.
108                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
109                                 }
110                         }
111                         
112                         
113                         //check for required parameters
114                         sffFile = validParameter.validFile(parameters, "sff", true);
115                         if (sffFile == "not found"){
116                                 m->mothurOut("sff is a required parameter for the parse.sff command.");
117                                 m->mothurOutEndLine();
118                                 abort = true;
119                         }
120                         else if (sffFile == "not open")         {       abort = true;   }       
121                         
122                         //if the user changes the output directory command factory will send this info to us in the output parameter 
123                         outputDir = validParameter.validFile(parameters, "outputdir", false);
124                         if (outputDir == "not found"){  
125                                 outputDir = ""; 
126                                 outputDir += m->hasPath(sffFile); //if user entered a file with a path then preserve it 
127                         }
128
129                         //check for optional parameter and set defaults
130                         // ...at some point should added some additional type checking...                       
131                         oligoFile = validParameter.validFile(parameters, "oligos", true);
132                         if (oligoFile == "not found")   {       oligoFile = "";         }
133                         else if(oligoFile == "not open"){       abort = true;           } 
134                         
135                         string temp = validParameter.validFile(parameters, "minlength", false);
136                         if (temp == "not found") { temp = "0"; }
137                         convert(temp, minLength); 
138                 }               
139         }
140         catch(exception& e) {
141                 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
142                 exit(1);
143         }
144 }
145
146 //**********************************************************************************************************************
147
148 ParseSFFCommand::~ParseSFFCommand()     {       /*      do nothing      */      }
149
150 //**********************************************************************************************************************
151
152 int ParseSFFCommand::execute(){
153         try {
154                 if (abort == true) {    return 0;       }
155
156                 ifstream inSFF;
157                 m->openInputFile(sffFile, inSFF);
158                 
159                 cout.setf(ios::fixed, ios::floatfield);
160                 cout.setf(ios::showpoint);
161                 cout << setprecision(2);
162                         
163                 vector<ofstream*> flowFileNames;
164                 if(oligoFile != ""){
165                         getOligos(flowFileNames);
166                 }
167                 else{
168                         flowFileNames.push_back(new ofstream((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow").c_str(), ios::ate));
169                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow")); outputTypes["flow"].push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow"));
170                 }
171                 
172                 for(int i=0;i<flowFileNames.size();i++){
173                         flowFileNames[i]->setf(ios::fixed, ios::floatfield);
174                         flowFileNames[i]->setf(ios::showpoint);
175                         *flowFileNames[i] << setprecision(2);
176                 }                       
177                 
178                 if (m->control_pressed) { outputTypes.clear(); for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close();  } return 0; }
179                 
180 //              ofstream fastaFile;
181 //              m->openOutputFile(m->getRootName(sffFile) + "fasta", fastaFile);
182
183 //              ofstream qualFile;
184 //              m->openOutputFile(m->getRootName(sffFile) + "qual", qualFile);
185                 
186                 string commonHeader = m->getline(inSFF);
187                 string magicNumber = m->getline(inSFF);         
188                 string version = m->getline(inSFF);
189                 string indexOffset = m->getline(inSFF);
190                 string indexLength = m->getline(inSFF);
191                 int numReads = parseHeaderLineToInt(inSFF);
192                 string headerLength = m->getline(inSFF);
193                 string keyLength = m->getline(inSFF);
194                 int numFlows = parseHeaderLineToInt(inSFF);
195                 string flowgramCode = m->getline(inSFF);
196                 string flowChars = m->getline(inSFF);
197                 string keySequence = m->getline(inSFF);
198                 m->gobble(inSFF);
199
200                 string seqName;
201                 bool good = 0;
202                 
203                 for(int i=0;i<numReads;i++){
204                         
205                         if (m->control_pressed) { outputTypes.clear(); for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close();  } return 0; }
206                         
207                         inSFF >> seqName;
208                         seqName = seqName.substr(1);
209                         m->gobble(inSFF);
210                         
211                         string runPrefix = parseHeaderLineToString(inSFF);
212                         string regionNumber = parseHeaderLineToString(inSFF);
213                         string xyLocation = parseHeaderLineToString(inSFF);
214                         m->gobble(inSFF);
215                         
216                         string runName = parseHeaderLineToString(inSFF);
217                         string analysisName = parseHeaderLineToString(inSFF);
218                         string fullPath = parseHeaderLineToString(inSFF);
219                         m->gobble(inSFF);
220                         
221                         string readHeaderLen = parseHeaderLineToString(inSFF);
222                         string nameLength = parseHeaderLineToString(inSFF);
223                         int numBases = parseHeaderLineToInt(inSFF);
224                         string clipQualLeft = parseHeaderLineToString(inSFF);
225                         int clipQualRight = parseHeaderLineToInt(inSFF);
226                         string clipAdapLeft = parseHeaderLineToString(inSFF);
227                         string clipAdapRight = parseHeaderLineToString(inSFF);
228                         m->gobble(inSFF);
229                         
230                         vector<float> flowVector = parseHeaderLineToFloatVector(inSFF, numFlows);
231                         vector<int> flowIndices = parseHeaderLineToIntVector(inSFF, numBases);
232                         string bases = parseHeaderLineToString(inSFF);
233                         string qualityScores = parseHeaderLineToString(inSFF);
234                         m->gobble(inSFF);
235                         
236
237                         
238                         int flowLength = flowIndices[clipQualRight-1];
239                                                 
240                         screenFlow(flowVector, flowLength);
241                         string sequence = flow2seq(flowVector, flowLength);
242                         
243                         int group = 0;
244         
245                         if(minLength != 0 || numFPrimers != 0  || numBarcodes != 0 || numRPrimers != 0){                
246                                 good = screenSeq(sequence, group);
247                         }
248
249                         if(good){
250                                 *flowFileNames[group] << seqName << ' ' << flowLength;
251                                 for(int i=0;i<numFlows;i++){
252                                         *flowFileNames[group] << ' ' << flowVector[i];
253                                 }
254                                 *flowFileNames[group] << endl;                          
255                         }
256                         
257 //                      string fastaHeader = '>' + seqName + "\tregion=" + regionNumber + " xy=" + xyLocation;
258 //                      fastaFile << fastaHeader << endl;
259 //                      fastaFile << stripSeqQual(bases, clipQualLeft, clipQualRight) << endl;
260 //
261 //                      qualFile << fastaHeader << endl;
262 //                      qualFile << stripQualQual(qualityScores, clipQualLeft, clipQualRight) << endl;
263
264                 }
265                 for(int i=0;i<flowFileNames.size();i++){
266                         flowFileNames[i]->close();
267                 }
268
269                 m->mothurOutEndLine();
270                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
271                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
272                 m->mothurOutEndLine();
273
274 //              fastaFile.close();
275 //              qualFile.close();
276                 
277                 return 0;
278         }
279         catch(exception& e) {
280                 m->errorOut(e, "ParseSFFCommand", "execute");
281                 exit(1);
282         }
283 }
284
285 //**********************************************************************************************************************
286
287 void ParseSFFCommand::help(){
288         try {
289                 m->mothurOut("The parse.sff command...");
290                 m->mothurOutEndLine();
291         }
292         catch(exception& e) {
293                 m->errorOut(e, "ParseSFFCommand", "help");
294                 exit(1);
295         }
296 }
297
298 //**********************************************************************************************************************
299
300 void ParseSFFCommand::getOligos(vector<ofstream*>& outSFFFlowVec){
301         try {
302
303                 ifstream inOligos;
304                 m->openInputFile(oligoFile, inOligos);
305                 
306                 string type, oligo, group;
307                 
308                 int index = 0;
309                 
310                 while(!inOligos.eof()){
311                         inOligos >> type;
312
313                         if(type[0] == '#'){     m->getline(inOligos);   } // get rest of line if there's any crap there
314                         else{
315                                 inOligos >> oligo;
316                                 
317                                 for(int i=0;i<oligo.length();i++){
318                                         oligo[i] = toupper(oligo[i]);
319                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
320                                 }
321                                 if(type == "forward"){
322                                         forPrimer.push_back(oligo);
323                                 }
324                                 else if(type == "reverse"){
325                                         Sequence oligoRC("reverse", oligo);
326                                         oligoRC.reverseComplement();
327                                         revPrimer.push_back(oligoRC.getUnaligned());
328                                 }
329                                 else if(type == "barcode"){
330                                         inOligos >> group;
331                                         barcodes[oligo]=index++;
332                                         groupVector.push_back(group);
333                                         
334                                         outSFFFlowVec.push_back(new ofstream((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + ".flow").c_str(), ios::ate));
335                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + "flow"));
336                                         outputTypes["flow"].push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + "flow"));
337                                 }
338                         }
339                         m->gobble(inOligos);
340                 }
341                 
342                 inOligos.close();
343                 
344                 numFPrimers = forPrimer.size();
345                 numRPrimers = revPrimer.size();
346                 numBarcodes = barcodes.size();
347         }
348         catch(exception& e) {
349                 m->errorOut(e, "ParseSFFCommand", "getOligos");
350                 exit(1);
351         }
352         
353 }
354
355 //**********************************************************************************************************************
356
357 int ParseSFFCommand::parseHeaderLineToInt(ifstream& file){
358         
359         int number;
360
361         while (!file.eof())     {
362
363                 char c = file.get(); 
364                 if (c == ':'){
365                         file >> number;
366                         break;
367                 }
368                 
369         }
370         m->gobble(file);
371         return number;
372 }
373
374 //**********************************************************************************************************************
375
376 string ParseSFFCommand::parseHeaderLineToString(ifstream& file){
377         
378         string text;
379         
380         while (!file.eof())     {
381                 char c = file.get(); 
382                 
383                 if (c == ':'){
384                         m->gobble(file);
385                         text = m->getline(file);                        
386                         break;
387                 }
388         }
389         m->gobble(file);
390
391         return text;
392 }
393
394 //**********************************************************************************************************************
395
396 vector<float> ParseSFFCommand::parseHeaderLineToFloatVector(ifstream& file, int length){
397         
398         vector<float> floatVector(length);
399         
400         while (!file.eof())     {
401                 char c = file.get(); 
402                 if (c == ':'){
403                         for(int i=0;i<length;i++){
404                                 file >> floatVector[i];
405                         }
406                         break;
407                 }
408         }
409         m->gobble(file);        
410         return floatVector;
411 }
412
413 //**********************************************************************************************************************
414
415 vector<int> ParseSFFCommand::parseHeaderLineToIntVector(ifstream& file, int length){
416         
417         vector<int> intVector(length);
418         
419         while (!file.eof())     {
420                 char c = file.get(); 
421                 if (c == ':'){
422                         for(int i=0;i<length;i++){
423                                 file >> intVector[i];
424                         }
425                         break;
426                 }
427         }
428         m->gobble(file);        
429         return intVector;
430 }
431
432 //**********************************************************************************************************************
433
434
435 void ParseSFFCommand::screenFlow(vector<float> flowgram, int& length){
436         try{
437
438                 int newLength = 0;
439
440                 while(newLength * 4 < length){
441                         
442                         int signal = 0;
443                         int noise = 0;
444                         for(int i=0;i<4;i++){
445                                 float flow = flowgram[i + 4 * newLength];
446
447                                 if(flow > 0.50){
448                                         signal++;
449                                         if(flow <= 0.69){ // not sure why, but if i make it <0.70 it doesn't work...
450                                                 noise++;
451                                         }
452                                 }
453                         }
454                         if(noise > 0 || signal == 0){
455                                 break;
456                         }                       
457                         newLength++;
458                 }
459                 length = newLength * 4;
460         }
461         
462         catch(exception& e) {
463                 m->errorOut(e, "ParseSFFCommand", "screenFlow");
464                 exit(1);
465         }
466 }
467
468 //**********************************************************************************************************************
469
470 string ParseSFFCommand::flow2seq(vector<float> flowgram, int length){
471
472         string flow = "TACG";
473         string sequence = "";
474         for(int i=8;i<length;i++){
475                 int signal = int(flowgram[i] + 0.5);
476                 char base = flow[ i % 4 ];
477                 for(int j=0;j<signal;j++){
478                         sequence += base;
479                 }
480         }
481         return sequence;
482 }
483
484 //**********************************************************************************************************************
485
486 bool ParseSFFCommand::screenSeq(string& sequence, int& group){
487
488         int length = 1;
489         group = -1;
490         
491         if(sequence.length() < minLength){      length = 0;     }
492         
493         int barcode = 1;
494         int barcodeLength = 0;
495
496         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
497                 if(compareDNASeq(it->first, sequence.substr(0,(it->first).length()))){
498                         barcode = 1;
499                         barcodeLength = (it->first).size();
500                         group = it->second;
501                         break;
502                 }
503                 else{
504                         barcode = 0;
505                 }
506         }
507         
508         int fPrimer = 1;
509         for(int i=0;i<numFPrimers;i++){
510                 if(compareDNASeq(forPrimer[i], sequence.substr(barcodeLength,forPrimer[i].length()))){
511                         fPrimer = 1;
512                         break;
513                 }
514                 else{
515                         fPrimer = 0;
516                 }
517         }
518
519         int rPrimer = 1;
520         for(int i=0;i<numRPrimers;i++){
521                 if(compareDNASeq(revPrimer[i], sequence.substr(sequence.length()-revPrimer[i].length(),revPrimer[i].length()))){
522                         rPrimer = 1;
523                         break;
524                 }
525                 else{
526                         rPrimer = 0;
527                 }
528         }
529
530         return fPrimer * rPrimer * length * barcode;
531                 
532 }
533
534 //**********************************************************************************************************************
535            
536 bool ParseSFFCommand::compareDNASeq(string oligo, string seq){
537    try {
538            bool success = 1;
539            int length = oligo.length();
540            
541            for(int i=0;i<length;i++){
542                    
543                    if(oligo[i] != seq[i]){
544                            if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')         {       success = 0;    }
545                            else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                     {       success = 0;    }
546                            else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                         {       success = 0;    }
547                            else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                         {       success = 0;    }
548                            else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                         {       success = 0;    }
549                            else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                         {       success = 0;    }
550                            else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                         {       success = 0;    }
551                            else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                         {       success = 0;    }
552                            else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))        {       success = 0;    }
553                            else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))        {       success = 0;    }
554                            else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))        {       success = 0;    }
555                            else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))        {       success = 0;    }                       
556                            
557                            if(success == 0)     {       break;  }
558                    }
559                    else{
560                            success = 1;
561                    }
562            }
563            
564            return success;
565    }
566    catch(exception& e) {
567            m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
568            exit(1);
569    }
570 }
571            
572 //**********************************************************************************************************************
573
574 //string ParseSFFCommand::stripSeqQual(string qScores, int start, int end){
575 //      
576 //      
577 //      return qScores.substr(start-1, end-start+1);
578 //
579 //}
580
581 //**********************************************************************************************************************
582
583 //string ParseSFFCommand::stripQualQual(string qScores, int start, int end){
584 //      
585 //      start--;
586 //      
587 //      int startCount = 0;
588 //      int startIndex = 0;
589 //      
590 //      while(startCount < start && startIndex < qScores.length()){
591 //              if(isspace(qScores[startIndex])){
592 //                      startCount++;
593 //              }
594 //         startIndex++;
595 //      }
596 //      
597 //      int endCount = startCount;
598 //      int endIndex = startIndex;
599 //      
600 //      while(endCount < end && endIndex < qScores.length()){
601 //              if(isspace(qScores[endIndex])){
602 //                      endCount++;
603 //              }
604 //              endIndex++;
605 //      }
606 //      
607 //   return qScores.substr(startIndex, endIndex-startIndex-1);//, endCount-startCount);
608 //      
609 //}
610
611 //**********************************************************************************************************************
612
613