]> git.donarmstrong.com Git - mothur.git/blob - parsesffcommand.cpp
fixes while testing
[mothur.git] / parsesffcommand.cpp
1 /*
2  *  parsesffcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 2/6/10.
6  *  Copyright 2010 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "parsesffcommand.h"
11 #include "sequence.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> ParseSFFCommand::getValidParameters(){   
15         try {
16                 string Array[] =  {"sff", "oligos", "minlength", "outputdir", "inputdir"};
17                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
18                 return myArray;
19         }
20         catch(exception& e) {
21                 m->errorOut(e, "ParseSFFCommand", "getValidParameters");
22                 exit(1);
23         }
24 }
25 //**********************************************************************************************************************
26 ParseSFFCommand::ParseSFFCommand(){     
27         try {
28                 abort = true;
29                 //initialize outputTypes
30                 vector<string> tempOutNames;
31                 outputTypes["flow"] = tempOutNames;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 vector<string> ParseSFFCommand::getRequiredParameters(){        
40         try {
41                 string Array[] =  {"sff"};
42                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "ParseSFFCommand", "getRequiredParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 vector<string> ParseSFFCommand::getRequiredFiles(){     
52         try {
53                 vector<string> myArray;
54                 return myArray;
55         }
56         catch(exception& e) {
57                 m->errorOut(e, "ParseSFFCommand", "getRequiredFiles");
58                 exit(1);
59         }
60 }
61 //**********************************************************************************************************************
62
63 ParseSFFCommand::ParseSFFCommand(string option){
64         try {
65                 abort = false;
66                 
67                 if(option == "help") {
68                         help();
69                         abort = true; 
70                 }
71                 else {
72                         //valid paramters for this command
73                         string Array[] =  {"sff", "oligos", "minlength", "outputdir", "inputdir"};
74                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
75                         
76                         OptionParser parser(option);
77                         map<string,string> parameters = parser.getParameters();
78                         
79                         ValidParameters validParameter;
80                         map<string,string>::iterator it;
81
82                         //check to make sure all parameters are valid for command
83                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
84                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
85                         }
86                         
87                         //initialize outputTypes
88                         vector<string> tempOutNames;
89                         outputTypes["flow"] = tempOutNames;
90                         
91                         //if the user changes the input directory command factory will send this info to us in the output parameter 
92                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
93                         if (inputDir == "not found"){   inputDir = "";          }
94                         else {
95                                 string path;
96                                 it = parameters.find("sff");
97                                 //user has given a template file
98                                 if(it != parameters.end()){ 
99                                         path = m->hasPath(it->second);
100                                         //if the user has not given a path then, add inputdir. else leave path alone.
101                                         if (path == "") {       parameters["sff"] = inputDir + it->second;              }
102                                 }
103                                 
104                                 it = parameters.find("oligos");
105                                 //user has given an oligos file
106                                 if(it != parameters.end()){ 
107                                         path = m->hasPath(it->second);
108                                         //if the user has not given a path then, add inputdir. else leave path alone.
109                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
110                                 }
111                         }
112                         
113                         
114                         //check for required parameters
115                         sffFile = validParameter.validFile(parameters, "sff", true);
116                         if (sffFile == "not found"){
117                                 m->mothurOut("sff is a required parameter for the parse.sff command.");
118                                 m->mothurOutEndLine();
119                                 abort = true;
120                         }
121                         else if (sffFile == "not open")         {       abort = true;   }       
122                         
123                         //if the user changes the output directory command factory will send this info to us in the output parameter 
124                         outputDir = validParameter.validFile(parameters, "outputdir", false);
125                         if (outputDir == "not found"){  
126                                 outputDir = ""; 
127                                 outputDir += m->hasPath(sffFile); //if user entered a file with a path then preserve it 
128                         }
129
130                         //check for optional parameter and set defaults
131                         // ...at some point should added some additional type checking...                       
132                         oligoFile = validParameter.validFile(parameters, "oligos", true);
133                         if (oligoFile == "not found")   {       oligoFile = "";         }
134                         else if(oligoFile == "not open"){       abort = true;           } 
135                         
136                         string temp = validParameter.validFile(parameters, "minlength", false);
137                         if (temp == "not found") { temp = "0"; }
138                         convert(temp, minLength); 
139                 }               
140         }
141         catch(exception& e) {
142                 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
143                 exit(1);
144         }
145 }
146
147 //**********************************************************************************************************************
148
149 ParseSFFCommand::~ParseSFFCommand()     {       /*      do nothing      */      }
150
151 //**********************************************************************************************************************
152
153 int ParseSFFCommand::execute(){
154         try {
155                 if (abort == true) {    return 0;       }
156
157                 ifstream inSFF;
158                 m->openInputFile(sffFile, inSFF);
159                 
160                 cout.setf(ios::fixed, ios::floatfield);
161                 cout.setf(ios::showpoint);
162                 cout << setprecision(2);
163                         
164                 vector<ofstream*> flowFileNames;
165                 if(oligoFile != ""){
166                         getOligos(flowFileNames);
167                 }
168                 else{
169                         flowFileNames.push_back(new ofstream((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow").c_str(), ios::ate));
170                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow")); outputTypes["flow"].push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + "flow"));
171                 }
172                 
173                 for(int i=0;i<flowFileNames.size();i++){
174                         flowFileNames[i]->setf(ios::fixed, ios::floatfield);
175                         flowFileNames[i]->setf(ios::showpoint);
176                         *flowFileNames[i] << setprecision(2);
177                 }                       
178                 
179                 if (m->control_pressed) { outputTypes.clear(); for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close();  } return 0; }
180                 
181 //              ofstream fastaFile;
182 //              m->openOutputFile(m->getRootName(sffFile) + "fasta", fastaFile);
183
184 //              ofstream qualFile;
185 //              m->openOutputFile(m->getRootName(sffFile) + "qual", qualFile);
186                 
187                 string commonHeader = m->getline(inSFF);
188                 string magicNumber = m->getline(inSFF);         
189                 string version = m->getline(inSFF);
190                 string indexOffset = m->getline(inSFF);
191                 string indexLength = m->getline(inSFF);
192                 int numReads = parseHeaderLineToInt(inSFF);
193                 string headerLength = m->getline(inSFF);
194                 string keyLength = m->getline(inSFF);
195                 int numFlows = parseHeaderLineToInt(inSFF);
196                 string flowgramCode = m->getline(inSFF);
197                 string flowChars = m->getline(inSFF);
198                 string keySequence = m->getline(inSFF);
199                 m->gobble(inSFF);
200
201                 string seqName;
202                 bool good = 0;
203                 
204                 for(int i=0;i<numReads;i++){
205                         
206                         if (m->control_pressed) { outputTypes.clear(); for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close();  } return 0; }
207                         
208                         inSFF >> seqName;
209                         seqName = seqName.substr(1);
210                         m->gobble(inSFF);
211                         
212                         string runPrefix = parseHeaderLineToString(inSFF);
213                         string regionNumber = parseHeaderLineToString(inSFF);
214                         string xyLocation = parseHeaderLineToString(inSFF);
215                         m->gobble(inSFF);
216                         
217                         string runName = parseHeaderLineToString(inSFF);
218                         string analysisName = parseHeaderLineToString(inSFF);
219                         string fullPath = parseHeaderLineToString(inSFF);
220                         m->gobble(inSFF);
221                         
222                         string readHeaderLen = parseHeaderLineToString(inSFF);
223                         string nameLength = parseHeaderLineToString(inSFF);
224                         int numBases = parseHeaderLineToInt(inSFF);
225                         string clipQualLeft = parseHeaderLineToString(inSFF);
226                         int clipQualRight = parseHeaderLineToInt(inSFF);
227                         string clipAdapLeft = parseHeaderLineToString(inSFF);
228                         string clipAdapRight = parseHeaderLineToString(inSFF);
229                         m->gobble(inSFF);
230                         
231                         vector<float> flowVector = parseHeaderLineToFloatVector(inSFF, numFlows);
232                         vector<int> flowIndices = parseHeaderLineToIntVector(inSFF, numBases);
233                         string bases = parseHeaderLineToString(inSFF);
234                         string qualityScores = parseHeaderLineToString(inSFF);
235                         m->gobble(inSFF);
236                         
237
238                         
239                         int flowLength = flowIndices[clipQualRight-1];
240                                                 
241                         screenFlow(flowVector, flowLength);
242                         string sequence = flow2seq(flowVector, flowLength);
243                         
244                         int group = 0;
245         
246                         if(minLength != 0 || numFPrimers != 0  || numBarcodes != 0 || numRPrimers != 0){                
247                                 good = screenSeq(sequence, group);
248                         }
249
250                         if(good){
251                                 *flowFileNames[group] << seqName << ' ' << flowLength;
252                                 for(int i=0;i<numFlows;i++){
253                                         *flowFileNames[group] << ' ' << flowVector[i];
254                                 }
255                                 *flowFileNames[group] << endl;                          
256                         }
257                         
258 //                      string fastaHeader = '>' + seqName + "\tregion=" + regionNumber + " xy=" + xyLocation;
259 //                      fastaFile << fastaHeader << endl;
260 //                      fastaFile << stripSeqQual(bases, clipQualLeft, clipQualRight) << endl;
261 //
262 //                      qualFile << fastaHeader << endl;
263 //                      qualFile << stripQualQual(qualityScores, clipQualLeft, clipQualRight) << endl;
264
265                 }
266                 for(int i=0;i<flowFileNames.size();i++){
267                         flowFileNames[i]->close();
268                 }
269
270                 m->mothurOutEndLine();
271                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
272                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
273                 m->mothurOutEndLine();
274
275 //              fastaFile.close();
276 //              qualFile.close();
277                 
278                 return 0;
279         }
280         catch(exception& e) {
281                 m->errorOut(e, "ParseSFFCommand", "execute");
282                 exit(1);
283         }
284 }
285
286 //**********************************************************************************************************************
287
288 void ParseSFFCommand::help(){
289         try {
290                 m->mothurOut("The parse.sff command...");
291                 m->mothurOutEndLine();
292         }
293         catch(exception& e) {
294                 m->errorOut(e, "ParseSFFCommand", "help");
295                 exit(1);
296         }
297 }
298
299 //**********************************************************************************************************************
300
301 void ParseSFFCommand::getOligos(vector<ofstream*>& outSFFFlowVec){
302         try {
303
304                 ifstream inOligos;
305                 m->openInputFile(oligoFile, inOligos);
306                 
307                 string type, oligo, group;
308                 
309                 int index = 0;
310                 
311                 while(!inOligos.eof()){
312                         inOligos >> type;
313
314                         if(type[0] == '#'){     m->getline(inOligos);   } // get rest of line if there's any crap there
315                         else{
316                                 inOligos >> oligo;
317                                 
318                                 for(int i=0;i<oligo.length();i++){
319                                         oligo[i] = toupper(oligo[i]);
320                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
321                                 }
322                                 if(type == "forward"){
323                                         forPrimer.push_back(oligo);
324                                 }
325                                 else if(type == "reverse"){
326                                         Sequence oligoRC("reverse", oligo);
327                                         oligoRC.reverseComplement();
328                                         revPrimer.push_back(oligoRC.getUnaligned());
329                                 }
330                                 else if(type == "barcode"){
331                                         inOligos >> group;
332                                         barcodes[oligo]=index++;
333                                         groupVector.push_back(group);
334                                         
335                                         outSFFFlowVec.push_back(new ofstream((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + ".flow").c_str(), ios::ate));
336                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + "flow"));
337                                         outputTypes["flow"].push_back((outputDir + m->getRootName(m->getSimpleName(sffFile)) + group + "flow"));
338                                 }
339                         }
340                         m->gobble(inOligos);
341                 }
342                 
343                 inOligos.close();
344                 
345                 numFPrimers = forPrimer.size();
346                 numRPrimers = revPrimer.size();
347                 numBarcodes = barcodes.size();
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "ParseSFFCommand", "getOligos");
351                 exit(1);
352         }
353         
354 }
355
356 //**********************************************************************************************************************
357
358 int ParseSFFCommand::parseHeaderLineToInt(ifstream& file){
359         
360         int number;
361
362         while (!file.eof())     {
363
364                 char c = file.get(); 
365                 if (c == ':'){
366                         file >> number;
367                         break;
368                 }
369                 
370         }
371         m->gobble(file);
372         return number;
373 }
374
375 //**********************************************************************************************************************
376
377 string ParseSFFCommand::parseHeaderLineToString(ifstream& file){
378         
379         string text;
380         
381         while (!file.eof())     {
382                 char c = file.get(); 
383                 
384                 if (c == ':'){
385                         m->gobble(file);
386                         text = m->getline(file);                        
387                         break;
388                 }
389         }
390         m->gobble(file);
391
392         return text;
393 }
394
395 //**********************************************************************************************************************
396
397 vector<float> ParseSFFCommand::parseHeaderLineToFloatVector(ifstream& file, int length){
398         
399         vector<float> floatVector(length);
400         
401         while (!file.eof())     {
402                 char c = file.get(); 
403                 if (c == ':'){
404                         for(int i=0;i<length;i++){
405                                 file >> floatVector[i];
406                         }
407                         break;
408                 }
409         }
410         m->gobble(file);        
411         return floatVector;
412 }
413
414 //**********************************************************************************************************************
415
416 vector<int> ParseSFFCommand::parseHeaderLineToIntVector(ifstream& file, int length){
417         
418         vector<int> intVector(length);
419         
420         while (!file.eof())     {
421                 char c = file.get(); 
422                 if (c == ':'){
423                         for(int i=0;i<length;i++){
424                                 file >> intVector[i];
425                         }
426                         break;
427                 }
428         }
429         m->gobble(file);        
430         return intVector;
431 }
432
433 //**********************************************************************************************************************
434
435
436 void ParseSFFCommand::screenFlow(vector<float> flowgram, int& length){
437         try{
438
439                 int newLength = 0;
440
441                 while(newLength * 4 < length){
442                         
443                         int signal = 0;
444                         int noise = 0;
445                         for(int i=0;i<4;i++){
446                                 float flow = flowgram[i + 4 * newLength];
447
448                                 if(flow > 0.50){
449                                         signal++;
450                                         if(flow <= 0.69){ // not sure why, but if i make it <0.70 it doesn't work...
451                                                 noise++;
452                                         }
453                                 }
454                         }
455                         if(noise > 0 || signal == 0){
456                                 break;
457                         }                       
458                         newLength++;
459                 }
460                 length = newLength * 4;
461         }
462         
463         catch(exception& e) {
464                 m->errorOut(e, "ParseSFFCommand", "screenFlow");
465                 exit(1);
466         }
467 }
468
469 //**********************************************************************************************************************
470
471 string ParseSFFCommand::flow2seq(vector<float> flowgram, int length){
472
473         string flow = "TACG";
474         string sequence = "";
475         for(int i=8;i<length;i++){
476                 int signal = int(flowgram[i] + 0.5);
477                 char base = flow[ i % 4 ];
478                 for(int j=0;j<signal;j++){
479                         sequence += base;
480                 }
481         }
482         return sequence;
483 }
484
485 //**********************************************************************************************************************
486
487 bool ParseSFFCommand::screenSeq(string& sequence, int& group){
488
489         int length = 1;
490         group = -1;
491         
492         if(sequence.length() < minLength){      length = 0;     }
493         
494         int barcode = 1;
495         int barcodeLength = 0;
496
497         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
498                 if(compareDNASeq(it->first, sequence.substr(0,(it->first).length()))){
499                         barcode = 1;
500                         barcodeLength = (it->first).size();
501                         group = it->second;
502                         break;
503                 }
504                 else{
505                         barcode = 0;
506                 }
507         }
508         
509         int fPrimer = 1;
510         for(int i=0;i<numFPrimers;i++){
511                 if(compareDNASeq(forPrimer[i], sequence.substr(barcodeLength,forPrimer[i].length()))){
512                         fPrimer = 1;
513                         break;
514                 }
515                 else{
516                         fPrimer = 0;
517                 }
518         }
519
520         int rPrimer = 1;
521         for(int i=0;i<numRPrimers;i++){
522                 if(compareDNASeq(revPrimer[i], sequence.substr(sequence.length()-revPrimer[i].length(),revPrimer[i].length()))){
523                         rPrimer = 1;
524                         break;
525                 }
526                 else{
527                         rPrimer = 0;
528                 }
529         }
530
531         return fPrimer * rPrimer * length * barcode;
532                 
533 }
534
535 //**********************************************************************************************************************
536            
537 bool ParseSFFCommand::compareDNASeq(string oligo, string seq){
538    try {
539            bool success = 1;
540            int length = oligo.length();
541            
542            for(int i=0;i<length;i++){
543                    
544                    if(oligo[i] != seq[i]){
545                            if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')         {       success = 0;    }
546                            else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                     {       success = 0;    }
547                            else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                         {       success = 0;    }
548                            else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                         {       success = 0;    }
549                            else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                         {       success = 0;    }
550                            else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                         {       success = 0;    }
551                            else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                         {       success = 0;    }
552                            else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                         {       success = 0;    }
553                            else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))        {       success = 0;    }
554                            else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))        {       success = 0;    }
555                            else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))        {       success = 0;    }
556                            else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))        {       success = 0;    }                       
557                            
558                            if(success == 0)     {       break;  }
559                    }
560                    else{
561                            success = 1;
562                    }
563            }
564            
565            return success;
566    }
567    catch(exception& e) {
568            m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
569            exit(1);
570    }
571 }
572            
573 //**********************************************************************************************************************
574
575 //string ParseSFFCommand::stripSeqQual(string qScores, int start, int end){
576 //      
577 //      
578 //      return qScores.substr(start-1, end-start+1);
579 //
580 //}
581
582 //**********************************************************************************************************************
583
584 //string ParseSFFCommand::stripQualQual(string qScores, int start, int end){
585 //      
586 //      start--;
587 //      
588 //      int startCount = 0;
589 //      int startIndex = 0;
590 //      
591 //      while(startCount < start && startIndex < qScores.length()){
592 //              if(isspace(qScores[startIndex])){
593 //                      startCount++;
594 //              }
595 //         startIndex++;
596 //      }
597 //      
598 //      int endCount = startCount;
599 //      int endIndex = startIndex;
600 //      
601 //      while(endCount < end && endIndex < qScores.length()){
602 //              if(isspace(qScores[endIndex])){
603 //                      endCount++;
604 //              }
605 //              endIndex++;
606 //      }
607 //      
608 //   return qScores.substr(startIndex, endIndex-startIndex-1);//, endCount-startCount);
609 //      
610 //}
611
612 //**********************************************************************************************************************
613
614