]> git.donarmstrong.com Git - mothur.git/blob - parsesffcommand.cpp
some changes while testing 1.9
[mothur.git] / parsesffcommand.cpp
1 /*
2  *  parsesffcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 2/6/10.
6  *  Copyright 2010 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "parsesffcommand.h"
11 #include "sequence.hpp"
12
13 //**********************************************************************************************************************
14
15 ParseSFFCommand::ParseSFFCommand(string option){
16         try {
17                 abort = false;
18                 
19                 if(option == "help") {
20                         help();
21                         abort = true; 
22                 }
23                 else {
24                         //valid paramters for this command
25                         string Array[] =  {"sff", "oligos", "minlength", "outputdir", "inputdir"};
26                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string,string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter;
32                         map<string,string>::iterator it;
33
34                         //check to make sure all parameters are valid for command
35                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //if the user changes the input directory command factory will send this info to us in the output parameter 
40                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
41                         if (inputDir == "not found"){   inputDir = "";          }
42                         else {
43                                 string path;
44                                 it = parameters.find("sff");
45                                 //user has given a template file
46                                 if(it != parameters.end()){ 
47                                         path = hasPath(it->second);
48                                         //if the user has not given a path then, add inputdir. else leave path alone.
49                                         if (path == "") {       parameters["sff"] = inputDir + it->second;              }
50                                 }
51                                 
52                                 it = parameters.find("oligos");
53                                 //user has given an oligos file
54                                 if(it != parameters.end()){ 
55                                         path = hasPath(it->second);
56                                         //if the user has not given a path then, add inputdir. else leave path alone.
57                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
58                                 }
59                         }
60                         
61                         
62                         //check for required parameters
63                         sffFile = validParameter.validFile(parameters, "sff", true);
64                         if (sffFile == "not found"){
65                                 m->mothurOut("sff is a required parameter for the parse.sff command.");
66                                 m->mothurOutEndLine();
67                                 abort = true;
68                         }
69                         else if (sffFile == "not open")         {       abort = true;   }       
70                         
71                         //if the user changes the output directory command factory will send this info to us in the output parameter 
72                         outputDir = validParameter.validFile(parameters, "outputdir", false);
73                         if (outputDir == "not found"){  
74                                 outputDir = ""; 
75                                 outputDir += hasPath(sffFile); //if user entered a file with a path then preserve it    
76                         }
77
78                         //check for optional parameter and set defaults
79                         // ...at some point should added some additional type checking...                       
80                         oligoFile = validParameter.validFile(parameters, "oligos", true);
81                         if (oligoFile == "not found")   {       oligoFile = "";         }
82                         else if(oligoFile == "not open"){       abort = true;           } 
83                         
84                         string temp = validParameter.validFile(parameters, "minlength", false);
85                         if (temp == "not found") { temp = "0"; }
86                         convert(temp, minLength); 
87                 }               
88         }
89         catch(exception& e) {
90                 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
91                 exit(1);
92         }
93 }
94
95 //**********************************************************************************************************************
96
97 ParseSFFCommand::~ParseSFFCommand()     {       /*      do nothing      */      }
98
99 //**********************************************************************************************************************
100
101 int ParseSFFCommand::execute(){
102         try {
103                 if (abort == true) {    return 0;       }
104
105                 ifstream inSFF;
106                 openInputFile(sffFile, inSFF);
107                 
108                 cout.setf(ios::fixed, ios::floatfield);
109                 cout.setf(ios::showpoint);
110                 cout << setprecision(2);
111                         
112                 vector<ofstream*> flowFileNames;
113                 if(oligoFile != ""){
114                         getOligos(flowFileNames);
115                 }
116                 else{
117                         flowFileNames.push_back(new ofstream((outputDir + getRootName(getSimpleName(sffFile)) + "flow").c_str(), ios::ate));
118                         outputNames.push_back((outputDir + getRootName(getSimpleName(sffFile)) + "flow"));
119                 }
120                 
121                 for(int i=0;i<flowFileNames.size();i++){
122                         flowFileNames[i]->setf(ios::fixed, ios::floatfield);
123                         flowFileNames[i]->setf(ios::showpoint);
124                         *flowFileNames[i] << setprecision(2);
125                 }                       
126                 
127                 if (m->control_pressed) { for(int i=0;i<flowFileNames.size();i++){      flowFileNames[i]->close();  } return 0; }
128                 
129 //              ofstream fastaFile;
130 //              openOutputFile(getRootName(sffFile) + "fasta", fastaFile);
131
132 //              ofstream qualFile;
133 //              openOutputFile(getRootName(sffFile) + "qual", qualFile);
134                 
135                 string commonHeader = getline(inSFF);
136                 string magicNumber = getline(inSFF);            
137                 string version = getline(inSFF);
138                 string indexOffset = getline(inSFF);
139                 string indexLength = getline(inSFF);
140                 int numReads = parseHeaderLineToInt(inSFF);
141                 string headerLength = getline(inSFF);
142                 string keyLength = getline(inSFF);
143                 int numFlows = parseHeaderLineToInt(inSFF);
144                 string flowgramCode = getline(inSFF);
145                 string flowChars = getline(inSFF);
146                 string keySequence = getline(inSFF);
147                 gobble(inSFF);
148
149                 string seqName;
150                 bool good = 0;
151                 
152                 for(int i=0;i<numReads;i++){
153                         
154                         if (m->control_pressed) { for(int i=0;i<flowFileNames.size();i++){      flowFileNames[i]->close();  } return 0; }
155                         
156                         inSFF >> seqName;
157                         seqName = seqName.substr(1);
158                         gobble(inSFF);
159                         
160                         string runPrefix = parseHeaderLineToString(inSFF);
161                         string regionNumber = parseHeaderLineToString(inSFF);
162                         string xyLocation = parseHeaderLineToString(inSFF);
163                         gobble(inSFF);
164                         
165                         string runName = parseHeaderLineToString(inSFF);
166                         string analysisName = parseHeaderLineToString(inSFF);
167                         string fullPath = parseHeaderLineToString(inSFF);
168                         gobble(inSFF);
169                         
170                         string readHeaderLen = parseHeaderLineToString(inSFF);
171                         string nameLength = parseHeaderLineToString(inSFF);
172                         int numBases = parseHeaderLineToInt(inSFF);
173                         string clipQualLeft = parseHeaderLineToString(inSFF);
174                         int clipQualRight = parseHeaderLineToInt(inSFF);
175                         string clipAdapLeft = parseHeaderLineToString(inSFF);
176                         string clipAdapRight = parseHeaderLineToString(inSFF);
177                         gobble(inSFF);
178                         
179                         vector<float> flowVector = parseHeaderLineToFloatVector(inSFF, numFlows);
180                         vector<int> flowIndices = parseHeaderLineToIntVector(inSFF, numBases);
181                         string bases = parseHeaderLineToString(inSFF);
182                         string qualityScores = parseHeaderLineToString(inSFF);
183                         gobble(inSFF);
184                         
185
186                         
187                         int flowLength = flowIndices[clipQualRight-1];
188                                                 
189                         screenFlow(flowVector, flowLength);
190                         string sequence = flow2seq(flowVector, flowLength);
191                         
192                         int group = 0;
193         
194                         if(minLength != 0 || numFPrimers != 0  || numBarcodes != 0 || numRPrimers != 0){                
195                                 good = screenSeq(sequence, group);
196                         }
197
198                         if(good){
199                                 *flowFileNames[group] << seqName << ' ' << flowLength;
200                                 for(int i=0;i<numFlows;i++){
201                                         *flowFileNames[group] << ' ' << flowVector[i];
202                                 }
203                                 *flowFileNames[group] << endl;                          
204                         }
205                         
206 //                      string fastaHeader = '>' + seqName + "\tregion=" + regionNumber + " xy=" + xyLocation;
207 //                      fastaFile << fastaHeader << endl;
208 //                      fastaFile << stripSeqQual(bases, clipQualLeft, clipQualRight) << endl;
209 //
210 //                      qualFile << fastaHeader << endl;
211 //                      qualFile << stripQualQual(qualityScores, clipQualLeft, clipQualRight) << endl;
212
213                 }
214                 for(int i=0;i<flowFileNames.size();i++){
215                         flowFileNames[i]->close();
216                 }
217
218                 m->mothurOutEndLine();
219                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
220                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
221                 m->mothurOutEndLine();
222
223 //              fastaFile.close();
224 //              qualFile.close();
225                 
226                 return 0;
227         }
228         catch(exception& e) {
229                 m->errorOut(e, "ParseSFFCommand", "execute");
230                 exit(1);
231         }
232 }
233
234 //**********************************************************************************************************************
235
236 void ParseSFFCommand::help(){
237         try {
238                 m->mothurOut("The parse.sff command...");
239                 m->mothurOutEndLine();
240         }
241         catch(exception& e) {
242                 m->errorOut(e, "ParseSFFCommand", "help");
243                 exit(1);
244         }
245 }
246
247 //**********************************************************************************************************************
248
249 void ParseSFFCommand::getOligos(vector<ofstream*>& outSFFFlowVec){
250         try {
251
252                 ifstream inOligos;
253                 openInputFile(oligoFile, inOligos);
254                 
255                 string type, oligo, group;
256                 
257                 int index = 0;
258                 
259                 while(!inOligos.eof()){
260                         inOligos >> type;
261
262                         if(type[0] == '#'){     getline(inOligos);      } // get rest of line if there's any crap there
263                         else{
264                                 inOligos >> oligo;
265                                 
266                                 for(int i=0;i<oligo.length();i++){
267                                         oligo[i] = toupper(oligo[i]);
268                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
269                                 }
270                                 if(type == "forward"){
271                                         forPrimer.push_back(oligo);
272                                 }
273                                 else if(type == "reverse"){
274                                         Sequence oligoRC("reverse", oligo);
275                                         oligoRC.reverseComplement();
276                                         revPrimer.push_back(oligoRC.getUnaligned());
277                                 }
278                                 else if(type == "barcode"){
279                                         inOligos >> group;
280                                         barcodes[oligo]=index++;
281                                         groupVector.push_back(group);
282                                         
283                                         outSFFFlowVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(sffFile)) + group + ".flow").c_str(), ios::ate));
284                                         outputNames.push_back((outputDir + getRootName(getSimpleName(sffFile)) + group + "flow"));
285                                 }
286                         }
287                         gobble(inOligos);
288                 }
289                 
290                 inOligos.close();
291                 
292                 numFPrimers = forPrimer.size();
293                 numRPrimers = revPrimer.size();
294                 numBarcodes = barcodes.size();
295         }
296         catch(exception& e) {
297                 m->errorOut(e, "ParseSFFCommand", "getOligos");
298                 exit(1);
299         }
300         
301 }
302
303 //**********************************************************************************************************************
304
305 int ParseSFFCommand::parseHeaderLineToInt(ifstream& file){
306         
307         int number;
308
309         while (!file.eof())     {
310
311                 char c = file.get(); 
312                 if (c == ':'){
313                         file >> number;
314                         break;
315                 }
316                 
317         }
318         gobble(file);
319         return number;
320 }
321
322 //**********************************************************************************************************************
323
324 string ParseSFFCommand::parseHeaderLineToString(ifstream& file){
325         
326         string text;
327         
328         while (!file.eof())     {
329                 char c = file.get(); 
330                 
331                 if (c == ':'){
332                         gobble(file);
333                         text = getline(file);                   
334                         break;
335                 }
336         }
337         gobble(file);
338
339         return text;
340 }
341
342 //**********************************************************************************************************************
343
344 vector<float> ParseSFFCommand::parseHeaderLineToFloatVector(ifstream& file, int length){
345         
346         vector<float> floatVector(length);
347         
348         while (!file.eof())     {
349                 char c = file.get(); 
350                 if (c == ':'){
351                         for(int i=0;i<length;i++){
352                                 file >> floatVector[i];
353                         }
354                         break;
355                 }
356         }
357         gobble(file);   
358         return floatVector;
359 }
360
361 //**********************************************************************************************************************
362
363 vector<int> ParseSFFCommand::parseHeaderLineToIntVector(ifstream& file, int length){
364         
365         vector<int> intVector(length);
366         
367         while (!file.eof())     {
368                 char c = file.get(); 
369                 if (c == ':'){
370                         for(int i=0;i<length;i++){
371                                 file >> intVector[i];
372                         }
373                         break;
374                 }
375         }
376         gobble(file);   
377         return intVector;
378 }
379
380 //**********************************************************************************************************************
381
382
383 void ParseSFFCommand::screenFlow(vector<float> flowgram, int& length){
384         try{
385
386                 int newLength = 0;
387
388                 while(newLength * 4 < length){
389                         
390                         int signal = 0;
391                         int noise = 0;
392                         for(int i=0;i<4;i++){
393                                 float flow = flowgram[i + 4 * newLength];
394
395                                 if(flow > 0.50){
396                                         signal++;
397                                         if(flow <= 0.69){ // not sure why, but if i make it <0.70 it doesn't work...
398                                                 noise++;
399                                         }
400                                 }
401                         }
402                         if(noise > 0 || signal == 0){
403                                 break;
404                         }                       
405                         newLength++;
406                 }
407                 length = newLength * 4;
408         }
409         
410         catch(exception& e) {
411                 m->errorOut(e, "ParseSFFCommand", "screenFlow");
412                 exit(1);
413         }
414 }
415
416 //**********************************************************************************************************************
417
418 string ParseSFFCommand::flow2seq(vector<float> flowgram, int length){
419
420         string flow = "TACG";
421         string sequence = "";
422         for(int i=8;i<length;i++){
423                 int signal = int(flowgram[i] + 0.5);
424                 char base = flow[ i % 4 ];
425                 for(int j=0;j<signal;j++){
426                         sequence += base;
427                 }
428         }
429         return sequence;
430 }
431
432 //**********************************************************************************************************************
433
434 bool ParseSFFCommand::screenSeq(string& sequence, int& group){
435
436         int length = 1;
437         group = -1;
438         
439         if(sequence.length() < minLength){      length = 0;     }
440         
441         int barcode = 1;
442         int barcodeLength = 0;
443
444         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
445                 if(compareDNASeq(it->first, sequence.substr(0,(it->first).length()))){
446                         barcode = 1;
447                         barcodeLength = (it->first).size();
448                         group = it->second;
449                         break;
450                 }
451                 else{
452                         barcode = 0;
453                 }
454         }
455         
456         int fPrimer = 1;
457         for(int i=0;i<numFPrimers;i++){
458                 if(compareDNASeq(forPrimer[i], sequence.substr(barcodeLength,forPrimer[i].length()))){
459                         fPrimer = 1;
460                         break;
461                 }
462                 else{
463                         fPrimer = 0;
464                 }
465         }
466
467         int rPrimer = 1;
468         for(int i=0;i<numRPrimers;i++){
469                 if(compareDNASeq(revPrimer[i], sequence.substr(sequence.length()-revPrimer[i].length(),revPrimer[i].length()))){
470                         rPrimer = 1;
471                         break;
472                 }
473                 else{
474                         rPrimer = 0;
475                 }
476         }
477
478         return fPrimer * rPrimer * length * barcode;
479                 
480 }
481
482 //**********************************************************************************************************************
483            
484 bool ParseSFFCommand::compareDNASeq(string oligo, string seq){
485    try {
486            bool success = 1;
487            int length = oligo.length();
488            
489            for(int i=0;i<length;i++){
490                    
491                    if(oligo[i] != seq[i]){
492                            if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')         {       success = 0;    }
493                            else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                     {       success = 0;    }
494                            else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                         {       success = 0;    }
495                            else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                         {       success = 0;    }
496                            else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                         {       success = 0;    }
497                            else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                         {       success = 0;    }
498                            else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                         {       success = 0;    }
499                            else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                         {       success = 0;    }
500                            else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))        {       success = 0;    }
501                            else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))        {       success = 0;    }
502                            else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))        {       success = 0;    }
503                            else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))        {       success = 0;    }                       
504                            
505                            if(success == 0)     {       break;  }
506                    }
507                    else{
508                            success = 1;
509                    }
510            }
511            
512            return success;
513    }
514    catch(exception& e) {
515            m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
516            exit(1);
517    }
518 }
519            
520 //**********************************************************************************************************************
521
522 //string ParseSFFCommand::stripSeqQual(string qScores, int start, int end){
523 //      
524 //      
525 //      return qScores.substr(start-1, end-start+1);
526 //
527 //}
528
529 //**********************************************************************************************************************
530
531 //string ParseSFFCommand::stripQualQual(string qScores, int start, int end){
532 //      
533 //      start--;
534 //      
535 //      int startCount = 0;
536 //      int startIndex = 0;
537 //      
538 //      while(startCount < start && startIndex < qScores.length()){
539 //              if(isspace(qScores[startIndex])){
540 //                      startCount++;
541 //              }
542 //         startIndex++;
543 //      }
544 //      
545 //      int endCount = startCount;
546 //      int endIndex = startIndex;
547 //      
548 //      while(endCount < end && endIndex < qScores.length()){
549 //              if(isspace(qScores[endIndex])){
550 //                      endCount++;
551 //              }
552 //              endIndex++;
553 //      }
554 //      
555 //   return qScores.substr(startIndex, endIndex-startIndex-1);//, endCount-startCount);
556 //      
557 //}
558
559 //**********************************************************************************************************************
560
561