]> git.donarmstrong.com Git - mothur.git/blob - sffinfocommand.cpp
fixed problem with sffinfo that caused a file mismatch when the trimming trimmed...
[mothur.git] / sffinfocommand.cpp
1 /*
2  *  sffinfocommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 7/7/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sffinfocommand.h"
11 #include "endiannessmacros.h"
12
13 //**********************************************************************************************************************
14 vector<string> SffInfoCommand::getValidParameters(){    
15         try {
16                 string Array[] =  {"sff","qfile","fasta","flow","trim","accnos","sfftxt","outputdir","inputdir", "outputdir"};
17                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
18                 return myArray;
19         }
20         catch(exception& e) {
21                 m->errorOut(e, "SffInfoCommand", "getValidParameters");
22                 exit(1);
23         }
24 }
25 //**********************************************************************************************************************
26 SffInfoCommand::SffInfoCommand(){       
27         try {
28                 abort = true; calledHelp = true; 
29                 vector<string> tempOutNames;
30                 outputTypes["fasta"] = tempOutNames;
31                 outputTypes["flow"] = tempOutNames;
32                 outputTypes["sfftxt"] = tempOutNames;
33                 outputTypes["qfile"] = tempOutNames;
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "SffInfoCommand", "SffInfoCommand");
37                 exit(1);
38         }
39 }
40 //**********************************************************************************************************************
41 vector<string> SffInfoCommand::getRequiredParameters(){ 
42         try {
43                 string Array[] =  {"sff", "sfftxt", "or"};
44                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
45                 return myArray;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "SffInfoCommand", "getRequiredParameters");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 vector<string> SffInfoCommand::getRequiredFiles(){      
54         try {
55                 vector<string> myArray;
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "SffInfoCommand", "getRequiredFiles");
60                 exit(1);
61         }
62 }
63 //**********************************************************************************************************************
64
65 SffInfoCommand::SffInfoCommand(string option)  {
66         try {
67                 abort = false; calledHelp = false;   
68                 hasAccnos = false;
69                 
70                 //allow user to run help
71                 if(option == "help") { help(); abort = true; calledHelp = true; }
72                 
73                 else {
74                         //valid paramters for this command
75                         string Array[] =  {"sff","qfile","fasta","flow","trim","accnos","sfftxt","outputdir","inputdir", "outputdir"};
76                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
77                         
78                         OptionParser parser(option);
79                         map<string, string> parameters = parser.getParameters();
80                         
81                         ValidParameters validParameter;
82                         //check to make sure all parameters are valid for command
83                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
84                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
85                         }
86                         
87                         //initialize outputTypes
88                         vector<string> tempOutNames;
89                         outputTypes["fasta"] = tempOutNames;
90                         outputTypes["flow"] = tempOutNames;
91                         outputTypes["sfftxt"] = tempOutNames;
92                         outputTypes["qfile"] = tempOutNames;
93                         
94                         //if the user changes the output directory command factory will send this info to us in the output parameter 
95                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
96                         
97                         //if the user changes the input directory command factory will send this info to us in the output parameter 
98                         string inputDir = validParameter.validFile(parameters, "inputdir", false);        if (inputDir == "not found"){ inputDir = "";          }
99
100                         sffFilename = validParameter.validFile(parameters, "sff", false);
101                         if (sffFilename == "not found") { sffFilename = "";  }
102                         else { 
103                                 m->splitAtDash(sffFilename, filenames);
104                                 
105                                 //go through files and make sure they are good, if not, then disregard them
106                                 for (int i = 0; i < filenames.size(); i++) {
107                                         if (inputDir != "") {
108                                                 string path = m->hasPath(filenames[i]);
109                                                 //if the user has not given a path then, add inputdir. else leave path alone.
110                                                 if (path == "") {       filenames[i] = inputDir + filenames[i];         }
111                                         }
112         
113                                         ifstream in;
114                                         int ableToOpen = m->openInputFile(filenames[i], in, "noerror");
115                                 
116                                         //if you can't open it, try default location
117                                         if (ableToOpen == 1) {
118                                                 if (m->getDefaultPath() != "") { //default path is set
119                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(filenames[i]);
120                                                         m->mothurOut("Unable to open " + filenames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
121                                                         ifstream in2;
122                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
123                                                         in2.close();
124                                                         filenames[i] = tryPath;
125                                                 }
126                                         }
127                                         
128                                         //if you can't open it, try default location
129                                         if (ableToOpen == 1) {
130                                                 if (m->getOutputDir() != "") { //default path is set
131                                                         string tryPath = m->getOutputDir() + m->getSimpleName(filenames[i]);
132                                                         m->mothurOut("Unable to open " + filenames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
133                                                         ifstream in2;
134                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
135                                                         in2.close();
136                                                         filenames[i] = tryPath;
137                                                 }
138                                         }
139                                         
140                                         in.close();
141                                         
142                                         if (ableToOpen == 1) { 
143                                                 m->mothurOut("Unable to open " + filenames[i] + ". It will be disregarded."); m->mothurOutEndLine();
144                                                 //erase from file list
145                                                 filenames.erase(filenames.begin()+i);
146                                                 i--;
147                                         }
148                                 }
149                                 
150                                 //make sure there is at least one valid file left
151                                 if (filenames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
152                         }
153                         
154                         accnosName = validParameter.validFile(parameters, "accnos", false);
155                         if (accnosName == "not found") { accnosName = "";  }
156                         else { 
157                                 hasAccnos = true;
158                                 m->splitAtDash(accnosName, accnosFileNames);
159                                 
160                                 //go through files and make sure they are good, if not, then disregard them
161                                 for (int i = 0; i < accnosFileNames.size(); i++) {
162                                         if (inputDir != "") {
163                                                 string path = m->hasPath(accnosFileNames[i]);
164                                                 //if the user has not given a path then, add inputdir. else leave path alone.
165                                                 if (path == "") {       accnosFileNames[i] = inputDir + accnosFileNames[i];             }
166                                         }
167         
168                                         ifstream in;
169                                         int ableToOpen = m->openInputFile(accnosFileNames[i], in, "noerror");
170                                 
171                                         //if you can't open it, try default location
172                                         if (ableToOpen == 1) {
173                                                 if (m->getDefaultPath() != "") { //default path is set
174                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(accnosFileNames[i]);
175                                                         m->mothurOut("Unable to open " + accnosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
176                                                         ifstream in2;
177                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
178                                                         in2.close();
179                                                         accnosFileNames[i] = tryPath;
180                                                 }
181                                         }
182                                         //if you can't open it, try default location
183                                         if (ableToOpen == 1) {
184                                                 if (m->getOutputDir() != "") { //default path is set
185                                                         string tryPath = m->getOutputDir() + m->getSimpleName(accnosFileNames[i]);
186                                                         m->mothurOut("Unable to open " + accnosFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
187                                                         ifstream in2;
188                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
189                                                         in2.close();
190                                                         accnosFileNames[i] = tryPath;
191                                                 }
192                                         }
193                                         in.close();
194                                         
195                                         if (ableToOpen == 1) { 
196                                                 m->mothurOut("Unable to open " + accnosFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
197                                                 //erase from file list
198                                                 accnosFileNames.erase(accnosFileNames.begin()+i);
199                                                 i--;
200                                         }
201                                 }
202                                 
203                                 //make sure there is at least one valid file left
204                                 if (accnosFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
205                         }
206                         
207                         if (hasAccnos) {
208                                 if (accnosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a accnos file, you must have one for each sff file."); m->mothurOutEndLine(); }
209                         }
210                         
211                         string temp = validParameter.validFile(parameters, "qfile", false);                     if (temp == "not found"){       temp = "T";                             }
212                         qual = m->isTrue(temp); 
213                         
214                         temp = validParameter.validFile(parameters, "fasta", false);                            if (temp == "not found"){       temp = "T";                             }
215                         fasta = m->isTrue(temp); 
216                         
217                         temp = validParameter.validFile(parameters, "flow", false);                                     if (temp == "not found"){       temp = "F";                             }
218                         flow = m->isTrue(temp); 
219                         
220                         temp = validParameter.validFile(parameters, "trim", false);                                     if (temp == "not found"){       temp = "T";                             }
221                         trim = m->isTrue(temp); 
222                         
223                         temp = validParameter.validFile(parameters, "sfftxt", false);                           
224                         if (temp == "not found")        {       temp = "F";      sfftxt = false; sfftxtFilename = "";           }
225                         else if (m->isTrue(temp))       {       sfftxt = true;          sfftxtFilename = "";                            }
226                         else {
227                                 //you are a filename
228                                 if (inputDir != "") {
229                                         map<string,string>::iterator it = parameters.find("sfftxt");
230                                         //user has given a template file
231                                         if(it != parameters.end()){ 
232                                                 string path = m->hasPath(it->second);
233                                                 //if the user has not given a path then, add inputdir. else leave path alone.
234                                                 if (path == "") {       parameters["sfftxt"] = inputDir + it->second;           }
235                                         }
236                                 }
237                                 
238                                 sfftxtFilename = validParameter.validFile(parameters, "sfftxt", true);
239                                 if (sfftxtFilename == "not found") { sfftxtFilename = "";  }
240                                 else if (sfftxtFilename == "not open") { sfftxtFilename = "";  }
241                         }
242                         
243                         if ((sfftxtFilename == "") && (filenames.size() == 0)) {  m->mothurOut("[ERROR]: you must provide a valid sff or sfftxt file."); m->mothurOutEndLine(); abort=true; }
244                 }
245         }
246         catch(exception& e) {
247                 m->errorOut(e, "SffInfoCommand", "SffInfoCommand");
248                 exit(1);
249         }
250 }
251 //**********************************************************************************************************************
252
253 void SffInfoCommand::help(){
254         try {
255                 m->mothurOut("The sffinfo command reads a sff file and extracts the sequence data, or you can use it to parse a sfftxt file..\n");
256                 m->mothurOut("The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, and trim. sff is required. \n");
257                 m->mothurOut("The sff parameter allows you to enter the sff file you would like to extract data from.  You may enter multiple files by separating them by -'s.\n");
258                 m->mothurOut("The fasta parameter allows you to indicate if you would like a fasta formatted file generated.  Default=True. \n");
259                 m->mothurOut("The qfile parameter allows you to indicate if you would like a quality file generated.  Default=True. \n");
260                 m->mothurOut("The flow parameter allows you to indicate if you would like a flowgram file generated.  Default=False. \n");
261                 m->mothurOut("The sfftxt parameter allows you to indicate if you would like a sff.txt file generated.  Default=False. \n");
262                 m->mothurOut("If you want to parse an existing sfftxt file into flow, fasta and quality file, enter the file name using the sfftxt parameter. \n");
263                 m->mothurOut("The trim parameter allows you to indicate if you would like a sequences and quality scores trimmed to the clipQualLeft and clipQualRight values.  Default=True. \n");
264                 m->mothurOut("The accnos parameter allows you to provide a accnos file containing the names of the sequences you would like extracted. You may enter multiple files by separating them by -'s. \n");
265                 m->mothurOut("Example sffinfo(sff=mySffFile.sff, trim=F).\n");
266                 m->mothurOut("Note: No spaces between parameter labels (i.e. sff), '=' and parameters (i.e.yourSffFileName).\n\n");
267         }
268         catch(exception& e) {
269                 m->errorOut(e, "SffInfoCommand", "help");
270                 exit(1);
271         }
272 }
273 //**********************************************************************************************************************
274
275 SffInfoCommand::~SffInfoCommand(){}
276
277 //**********************************************************************************************************************
278 int SffInfoCommand::execute(){
279         try {
280                 
281                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
282                 
283                 for (int s = 0; s < filenames.size(); s++) {
284                         
285                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str());         } return 0; }
286                         
287                         int start = time(NULL);
288                         
289                         m->mothurOut("Extracting info from " + filenames[s] + " ..." ); m->mothurOutEndLine();
290                         
291                         string accnos = "";
292                         if (hasAccnos) { accnos = accnosFileNames[s]; }
293                         
294                         int numReads = extractSffInfo(filenames[s], accnos);
295
296                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to extract " + toString(numReads) + ".");
297                 }
298                 
299                 if (sfftxtFilename != "") {  parseSffTxt(); }
300                 
301                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str());         } return 0; }
302                 
303                 //set fasta file as new current fastafile
304                 string current = "";
305                 itTypes = outputTypes.find("fasta");
306                 if (itTypes != outputTypes.end()) {
307                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
308                 }
309                 
310                 itTypes = outputTypes.find("qfile");
311                 if (itTypes != outputTypes.end()) {
312                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
313                 }       
314                 
315                 //report output filenames
316                 m->mothurOutEndLine();
317                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
318                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
319                 m->mothurOutEndLine();
320
321                 return 0;
322         }
323         catch(exception& e) {
324                 m->errorOut(e, "SffInfoCommand", "execute");
325                 exit(1);
326         }
327 }
328 //**********************************************************************************************************************
329 int SffInfoCommand::extractSffInfo(string input, string accnos){
330         try {
331                 
332                 if (outputDir == "") {  outputDir += m->hasPath(input); }
333                 
334                 if (accnos != "")       {  readAccnosFile(accnos);  }
335                 else                            {       seqNames.clear();               }
336
337                 ofstream outSfftxt, outFasta, outQual, outFlow;
338                 string outFastaFileName, outQualFileName;
339                 string sfftxtFileName = outputDir + m->getRootName(m->getSimpleName(input)) + "sff.txt";
340                 string outFlowFileName = outputDir + m->getRootName(m->getSimpleName(input)) + "flow";
341                 if (trim) {
342                         outFastaFileName = outputDir + m->getRootName(m->getSimpleName(input)) + "fasta";
343                         outQualFileName = outputDir + m->getRootName(m->getSimpleName(input)) + "qual";
344                 }else{
345                         outFastaFileName = outputDir + m->getRootName(m->getSimpleName(input)) + "raw.fasta";
346                         outQualFileName = outputDir + m->getRootName(m->getSimpleName(input)) + "raw.qual";
347                 }
348                 
349                 if (sfftxt) { m->openOutputFile(sfftxtFileName, outSfftxt); outSfftxt.setf(ios::fixed, ios::floatfield); outSfftxt.setf(ios::showpoint);  outputNames.push_back(sfftxtFileName);  outputTypes["sfftxt"].push_back(sfftxtFileName); }
350                 if (fasta)      { m->openOutputFile(outFastaFileName, outFasta);        outputNames.push_back(outFastaFileName); outputTypes["fasta"].push_back(outFastaFileName); }
351                 if (qual)       { m->openOutputFile(outQualFileName, outQual);          outputNames.push_back(outQualFileName); outputTypes["qfile"].push_back(outQualFileName);  }
352                 if (flow)       { m->openOutputFile(outFlowFileName, outFlow);          outputNames.push_back(outFlowFileName);  outFlow.setf(ios::fixed, ios::floatfield); outFlow.setf(ios::showpoint); outputTypes["flow"].push_back(outFlowFileName);  }
353                 
354                 ifstream in;
355                 in.open(input.c_str(), ios::binary);
356                 
357                 CommonHeader header; 
358                 readCommonHeader(in, header);
359         
360                 int count = 0;
361                 mycount = 0;
362                 
363                 //check magic number and version
364                 if (header.magicNumber != 779314790) { m->mothurOut("Magic Number is not correct, not a valid .sff file"); m->mothurOutEndLine(); return count; }
365                 if (header.version != "0001") { m->mothurOut("Version is not supported, only support version 0001."); m->mothurOutEndLine(); return count; }
366         
367                 //print common header
368                 if (sfftxt) {   printCommonHeader(outSfftxt, header);           }
369                 if (flow)       {       outFlow << header.numFlowsPerRead << endl;      }
370                         
371                 //read through the sff file
372                 while (!in.eof()) {
373                         
374                         bool print = true;
375                         
376                         //read header
377                         Header readheader;
378                         readHeader(in, readheader);
379                         
380                         //read data
381                         seqRead read; 
382                         readSeqData(in, read, header.numFlowsPerRead, readheader.numBases);
383                                 
384                         //if you have provided an accosfile and this seq is not in it, then dont print
385                         if (seqNames.size() != 0) {   if (seqNames.count(readheader.name) == 0) { print = false; }  }
386                         
387                         //print 
388                         if (print) {
389                                 if (sfftxt) { printHeader(outSfftxt, readheader); printSffTxtSeqData(outSfftxt, read, readheader); }
390                                 if (fasta)      {       printFastaSeqData(outFasta, read, readheader);  }
391                                 if (qual)       {       printQualSeqData(outQual, read, readheader);    }
392                                 if (flow)       {       printFlowSeqData(outFlow, read, readheader);    }
393                         }
394                         
395                         count++;
396                         mycount++;
397                 
398                         //report progress
399                         if((count+1) % 10000 == 0){     m->mothurOut(toString(count+1)); m->mothurOutEndLine();         }
400                 
401                         if (m->control_pressed) { count = 0; break;   }
402                         
403                         if (count >= header.numReads) { break; }
404                 }
405                 
406                 //report progress
407                 if (!m->control_pressed) {   if((count) % 10000 != 0){  m->mothurOut(toString(count)); m->mothurOutEndLine();           }  }
408                 
409                 in.close();
410                 
411                 if (sfftxt) {  outSfftxt.close();       }
412                 if (fasta)      {  outFasta.close();    }
413                 if (qual)       {  outQual.close();             }
414                 if (flow)       {  outFlow.close();             }
415                 
416                 return count;
417         }
418         catch(exception& e) {
419                 m->errorOut(e, "SffInfoCommand", "extractSffInfo");
420                 exit(1);
421         }
422 }
423 //**********************************************************************************************************************
424 int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){
425         try {
426
427                 if (!in.eof()) {
428
429                         //read magic number
430                         char buffer[4];
431                         in.read(buffer, 4);
432                         header.magicNumber = be_int4(*(unsigned int *)(&buffer));
433                 
434                         //read version
435                         char buffer9[4];
436                         in.read(buffer9, 4);
437                         header.version = "";
438                         for (int i = 0; i < 4; i++) {  header.version += toString((int)(buffer9[i])); }
439                                 
440                         //read offset
441                         char buffer2 [8];
442                         in.read(buffer2, 8);
443                         header.indexOffset =  be_int8(*(unsigned long int *)(&buffer2));
444                         
445                         //read index length
446                         char buffer3 [4];
447                         in.read(buffer3, 4);
448                         header.indexLength =  be_int4(*(unsigned int *)(&buffer3));
449                         
450                         //read num reads
451                         char buffer4 [4];
452                         in.read(buffer4, 4);
453                         header.numReads =  be_int4(*(unsigned int *)(&buffer4));
454                                 
455                         //read header length
456                         char buffer5 [2];
457                         in.read(buffer5, 2);
458                         header.headerLength =  be_int2(*(unsigned short *)(&buffer5));
459                                         
460                         //read key length
461                         char buffer6 [2];
462                         in.read(buffer6, 2);
463                         header.keyLength = be_int2(*(unsigned short *)(&buffer6));
464                         
465                         //read number of flow reads
466                         char buffer7 [2];
467                         in.read(buffer7, 2);
468                         header.numFlowsPerRead =  be_int2(*(unsigned short *)(&buffer7));
469                                 
470                         //read format code
471                         char buffer8 [1];
472                         in.read(buffer8, 1);
473                         header.flogramFormatCode = (int)(buffer8[0]);
474                         
475                         //read flow chars
476                         char* tempBuffer = new char[header.numFlowsPerRead];
477                         in.read(&(*tempBuffer), header.numFlowsPerRead); 
478                         header.flowChars = tempBuffer;
479                         if (header.flowChars.length() > header.numFlowsPerRead) { header.flowChars = header.flowChars.substr(0, header.numFlowsPerRead);  }
480                         delete[] tempBuffer;
481                         
482                         //read key
483                         char* tempBuffer2 = new char[header.keyLength];
484                         in.read(&(*tempBuffer2), header.keyLength);
485                         header.keySequence = tempBuffer2;
486                         if (header.keySequence.length() > header.keyLength) { header.keySequence = header.keySequence.substr(0, header.keyLength);  }
487                         delete[] tempBuffer2;
488                                 
489                         /* Pad to 8 chars */
490                         unsigned long int spotInFile = in.tellg();
491                         unsigned long int spot = (spotInFile + 7)& ~7;  // ~ inverts
492                         in.seekg(spot);
493                         
494                 }else{
495                         m->mothurOut("Error reading sff common header."); m->mothurOutEndLine();
496                 }
497
498                 return 0;
499         }
500         catch(exception& e) {
501                 m->errorOut(e, "SffInfoCommand", "readCommonHeader");
502                 exit(1);
503         }
504 }
505 //**********************************************************************************************************************
506 int SffInfoCommand::readHeader(ifstream& in, Header& header){
507         try {
508         
509                 if (!in.eof()) {
510                         
511                         //read header length
512                         char buffer [2];
513                         in.read(buffer, 2);
514                         header.headerLength = be_int2(*(unsigned short *)(&buffer));
515                                                 
516                         //read name length
517                         char buffer2 [2];
518                         in.read(buffer2, 2);
519                         header.nameLength = be_int2(*(unsigned short *)(&buffer2));
520
521                         //read num bases
522                         char buffer3 [4];
523                         in.read(buffer3, 4);
524                         header.numBases =  be_int4(*(unsigned int *)(&buffer3));
525                         
526                         //read clip qual left
527                         char buffer4 [2];
528                         in.read(buffer4, 2);
529                         header.clipQualLeft =  be_int2(*(unsigned short *)(&buffer4));
530                         header.clipQualLeft = 5; 
531                         
532                         //read clip qual right
533                         char buffer5 [2];
534                         in.read(buffer5, 2);
535                         header.clipQualRight =  be_int2(*(unsigned short *)(&buffer5));
536                         
537                         //read clipAdapterLeft
538                         char buffer6 [2];
539                         in.read(buffer6, 2);
540                         header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6));
541
542                         //read clipAdapterRight
543                         char buffer7 [2];
544                         in.read(buffer7, 2);
545                         header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7));
546                 
547                         //read name
548                         char* tempBuffer = new char[header.nameLength];
549                         in.read(&(*tempBuffer), header.nameLength);
550                         header.name = tempBuffer;
551                         if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength);  }
552                         delete[] tempBuffer;
553                         
554                         //extract info from name
555                         decodeName(header.timestamp, header.region, header.xy, header.name);
556                         
557                         /* Pad to 8 chars */
558                         unsigned long int spotInFile = in.tellg();
559                         unsigned long int spot = (spotInFile + 7)& ~7;
560                         in.seekg(spot);
561                         
562                 }else{
563                         m->mothurOut("Error reading sff header info."); m->mothurOutEndLine();
564                 }
565
566                 return 0;
567         }
568         catch(exception& e) {
569                 m->errorOut(e, "SffInfoCommand", "readHeader");
570                 exit(1);
571         }
572 }
573 //**********************************************************************************************************************
574 int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, int numBases){
575         try {
576         
577                 if (!in.eof()) {
578         
579                         //read flowgram
580                         read.flowgram.resize(numFlowReads);
581                         for (int i = 0; i < numFlowReads; i++) {  
582                                 char buffer [2];
583                                 in.read(buffer, 2);
584                                 read.flowgram[i] = be_int2(*(unsigned short *)(&buffer));
585                         }
586         
587                         //read flowIndex
588                         read.flowIndex.resize(numBases);
589                         for (int i = 0; i < numBases; i++) {  
590                                 char temp[1];
591                                 in.read(temp, 1);
592                                 read.flowIndex[i] = be_int1(*(unsigned char *)(&temp));
593                         }
594         
595                         //read bases
596                         char* tempBuffer = new char[numBases];
597                         in.read(&(*tempBuffer), numBases);
598                         read.bases = tempBuffer;
599                         if (read.bases.length() > numBases) { read.bases = read.bases.substr(0, numBases);  }
600                         delete[] tempBuffer;
601
602                         //read qual scores
603                         read.qualScores.resize(numBases);
604                         for (int i = 0; i < numBases; i++) {  
605                                 char temp[1];
606                                 in.read(temp, 1);
607                                 read.qualScores[i] = be_int1(*(unsigned char *)(&temp));
608                         }
609         
610                         /* Pad to 8 chars */
611                         unsigned long int spotInFile = in.tellg();
612                         unsigned long int spot = (spotInFile + 7)& ~7;
613                         in.seekg(spot);
614                         
615                 }else{
616                         m->mothurOut("Error reading."); m->mothurOutEndLine();
617                 }
618
619                 return 0;
620         }
621         catch(exception& e) {
622                 m->errorOut(e, "SffInfoCommand", "readSeqData");
623                 exit(1);
624         }
625 }
626 //**********************************************************************************************************************
627 int SffInfoCommand::decodeName(string& timestamp, string& region, string& xy, string name) {
628         try {
629                 
630                 if (name.length() >= 6) {
631                         string time = name.substr(0, 6);
632                         unsigned int timeNum = m->fromBase36(time);
633                         
634                         int q1 = timeNum / 60;
635                         int sec = timeNum - 60 * q1;
636                         int q2 = q1 / 60;
637                         int minute = q1 - 60 * q2;
638                         int q3 = q2 / 24;
639                         int hr = q2 - 24 * q3;
640                         int q4 = q3 / 32;
641                         int day = q3 - 32 * q4;
642                         int q5 = q4 / 13;
643                         int mon = q4 - 13 * q5;
644                         int year = 2000 + q5;
645                 
646                         timestamp = toString(year) + "_" + toString(mon) + "_" + toString(day) + "_" + toString(hr) + "_" + toString(minute) + "_" + toString(sec);
647                 }
648                 
649                 if (name.length() >= 9) {
650                         region = name.substr(7, 2);
651                 
652                         string xyNum = name.substr(9);
653                         unsigned int myXy = m->fromBase36(xyNum);
654                         int x = myXy >> 12;
655                         int y = myXy & 4095;
656                 
657                         xy = toString(x) + "_" + toString(y);
658                 }
659                 
660                 return 0;
661         }
662         catch(exception& e) {
663                 m->errorOut(e, "SffInfoCommand", "decodeName");
664                 exit(1);
665         }
666 }
667 //**********************************************************************************************************************
668 int SffInfoCommand::printCommonHeader(ofstream& out, CommonHeader& header) {
669         try {
670         
671                 out << "Common Header:\nMagic Number: " << header.magicNumber << endl;
672                 out << "Version: " << header.version << endl;
673                 out << "Index Offset: " << header.indexOffset << endl;
674                 out << "Index Length: " << header.indexLength << endl;
675                 out << "Number of Reads: " << header.numReads << endl;
676                 out << "Header Length: " << header.headerLength << endl;
677                 out << "Key Length: " << header.keyLength << endl;
678                 out << "Number of Flows: " << header.numFlowsPerRead << endl;
679                 out << "Format Code: " << header.flogramFormatCode << endl;
680                 out << "Flow Chars: " << header.flowChars << endl;
681                 out << "Key Sequence: " << header.keySequence << endl << endl;
682                         
683                 return 0;
684         }
685         catch(exception& e) {
686                 m->errorOut(e, "SffInfoCommand", "printCommonHeader");
687                 exit(1);
688         }
689 }
690 //**********************************************************************************************************************
691 int SffInfoCommand::printHeader(ofstream& out, Header& header) {
692         try {
693                 
694                 out << ">" << header.name << endl;
695                 out << "Run Prefix: " << header.timestamp << endl;
696                 out << "Region #:  " << header.region << endl;
697                 out << "XY Location: " << header.xy << endl << endl;
698                 
699                 out << "Run Name:  " << endl;
700                 out << "Analysis Name:  " << endl;
701                 out << "Full Path: " << endl << endl;
702                 
703                 out << "Read Header Len: " << header.headerLength << endl;
704                 out << "Name Length: " << header.nameLength << endl;
705                 out << "# of Bases: " << header.numBases << endl;
706                 out << "Clip Qual Left: " << header.clipQualLeft << endl;
707                 out << "Clip Qual Right: " << header.clipQualRight << endl;
708                 out << "Clip Adap Left: " << header.clipAdapterLeft << endl;
709                 out << "Clip Adap Right: " << header.clipAdapterRight << endl << endl;
710                 
711                 return 0;
712         }
713         catch(exception& e) {
714                 m->errorOut(e, "SffInfoCommand", "printHeader");
715                 exit(1);
716         }
717 }
718
719 //**********************************************************************************************************************
720 int SffInfoCommand::printSffTxtSeqData(ofstream& out, seqRead& read, Header& header) {
721         try {
722                 
723                 out << "Flowgram: ";
724                 for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t';  }
725                 
726                 out << endl <<  "Flow Indexes: ";
727                 int sum = 0;
728                 for (int i = 0; i < read.flowIndex.size(); i++) {  sum +=  read.flowIndex[i];  out << sum << '\t'; }
729                 
730                 //make the bases you want to clip lowercase and the bases you want to keep upper case
731                 if(header.clipQualRight == 0){  header.clipQualRight = read.bases.length();     }
732                 for (int i = 0; i < (header.clipQualLeft-1); i++) { read.bases[i] = tolower(read.bases[i]); }
733                 for (int i = (header.clipQualLeft-1); i < (header.clipQualRight-1); i++) {   read.bases[i] = toupper(read.bases[i]);  }
734                 for (int i = (header.clipQualRight-1); i < read.bases.length(); i++) {   read.bases[i] = tolower(read.bases[i]);  }
735                 
736                 out << endl <<  "Bases: " << read.bases << endl << "Quality Scores: ";
737                 for (int i = 0; i < read.qualScores.size(); i++) {   out << read.qualScores[i] << '\t';  }
738         
739                 
740                 out << endl << endl;
741                 
742                 return 0;
743         }
744         catch(exception& e) {
745                 m->errorOut(e, "SffInfoCommand", "printSffTxtSeqData");
746                 exit(1);
747         }
748 }
749 //**********************************************************************************************************************
750 int SffInfoCommand::printFastaSeqData(ofstream& out, seqRead& read, Header& header) {
751         try {
752                 
753                 string seq = read.bases;
754                 
755                 if (trim) {
756                         if(header.clipQualRight < header.clipQualLeft){
757                                 seq = "NNNN";
758                         }
759                         else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){
760                                 seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft));
761                         }
762                         else {
763                                 seq = seq.substr(header.clipQualLeft-1);
764                         }
765                 }else{
766                         //if you wanted the sfftxt then you already converted the bases to the right case
767                         if (!sfftxt) {
768                                 //make the bases you want to clip lowercase and the bases you want to keep upper case
769                                 if(header.clipQualRight == 0){  header.clipQualRight = seq.length();    }
770                                 for (int i = 0; i < (header.clipQualLeft-1); i++) { seq[i] = tolower(seq[i]);  }
771                                 for (int i = (header.clipQualLeft-1); i < (header.clipQualRight-1); i++)  {   seq[i] = toupper(seq[i]);  }
772                                 for (int i = (header.clipQualRight-1); i < seq.length(); i++) {   seq[i] = tolower(seq[i]);  }
773                         }
774                 }
775                 
776                 out << ">" << header.name  << " xy=" << header.xy << endl;
777                 out << seq << endl;
778                 
779                 return 0;
780         }
781         catch(exception& e) {
782                 m->errorOut(e, "SffInfoCommand", "printFastaSeqData");
783                 exit(1);
784         }
785 }
786
787 //**********************************************************************************************************************
788 int SffInfoCommand::printQualSeqData(ofstream& out, seqRead& read, Header& header) {
789         try {
790                 
791                 if (trim) {
792                         if(header.clipQualRight < header.clipQualLeft){
793                                 out << ">" << header.name << " xy=" << header.xy << endl;
794                                 out << "0\t0\t0\t0";
795                         }
796                         else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){
797                                 out << ">" << header.name << " xy=" << header.xy << " length=" << (header.clipQualRight-header.clipQualLeft) << endl;
798                                 for (int i = (header.clipQualLeft-1); i < (header.clipQualRight-1); i++) {   out << read.qualScores[i] << '\t'; }
799                         }
800                         else{
801                                 out << ">" << header.name << " xy=" << header.xy << " length=" << (header.clipQualRight-header.clipQualLeft) << endl;
802                                 for (int i = (header.clipQualLeft-1); i < read.qualScores.size(); i++) {   out << read.qualScores[i] << '\t';   }                       
803                         }
804                 }else{
805                         out << ">" << header.name << " xy=" << header.xy << " length=" << read.qualScores.size() << endl;
806                         for (int i = 0; i < read.qualScores.size(); i++) {   out << read.qualScores[i] << '\t';  }
807                 }
808                 
809                 out << endl;
810                 
811                 return 0;
812         }
813         catch(exception& e) {
814                 m->errorOut(e, "SffInfoCommand", "printQualSeqData");
815                 exit(1);
816         }
817 }
818
819 //**********************************************************************************************************************
820 int SffInfoCommand::printFlowSeqData(ofstream& out, seqRead& read, Header& header) {
821         try {
822                 if(header.clipQualRight > header.clipQualLeft){
823                         
824                         int rightIndex = 0;
825                         for (int i = 0; i < header.clipQualRight; i++) {  rightIndex +=  read.flowIndex[i];     }
826
827                         out << header.name << ' ' << rightIndex;
828                         for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << ' ' << (read.flowgram[i]/(float)100);  }
829                         out << endl;
830                 }
831                 
832                 
833                 return 0;
834         }
835         catch(exception& e) {
836                 m->errorOut(e, "SffInfoCommand", "printFlowSeqData");
837                 exit(1);
838         }
839 }
840 //**********************************************************************************************************************
841 int SffInfoCommand::readAccnosFile(string filename) {
842         try {
843                 //remove old names
844                 seqNames.clear();
845                 
846                 ifstream in;
847                 m->openInputFile(filename, in);
848                 string name;
849                 
850                 while(!in.eof()){
851                         in >> name; m->gobble(in);
852                                                 
853                         seqNames.insert(name);
854                         
855                         if (m->control_pressed) { seqNames.clear(); break; }
856                 }
857                 in.close();             
858                 
859                 return 0;
860         }
861         catch(exception& e) {
862                 m->errorOut(e, "SffInfoCommand", "readAccnosFile");
863                 exit(1);
864         }
865 }
866 //**********************************************************************************************************************
867 int SffInfoCommand::parseSffTxt() {
868         try {
869                 
870                 ifstream inSFF;
871                 m->openInputFile(sfftxtFilename, inSFF);
872                 
873                 if (outputDir == "") {  outputDir += m->hasPath(sfftxtFilename); }
874                 
875                 //output file names
876                 ofstream outFasta, outQual, outFlow;
877                 string outFastaFileName, outQualFileName;
878                 string outFlowFileName = outputDir + m->getRootName(m->getSimpleName(sfftxtFilename)) + "flow";
879                 if (trim) {
880                         outFastaFileName = outputDir + m->getRootName(m->getSimpleName(sfftxtFilename)) + "fasta";
881                         outQualFileName = outputDir + m->getRootName(m->getSimpleName(sfftxtFilename)) + "qual";
882                 }else{
883                         outFastaFileName = outputDir + m->getRootName(m->getSimpleName(sfftxtFilename)) + "raw.fasta";
884                         outQualFileName = outputDir + m->getRootName(m->getSimpleName(sfftxtFilename)) + "raw.qual";
885                 }
886                 
887                 if (fasta)      { m->openOutputFile(outFastaFileName, outFasta);        outputNames.push_back(outFastaFileName); outputTypes["fasta"].push_back(outFastaFileName); }
888                 if (qual)       { m->openOutputFile(outQualFileName, outQual);          outputNames.push_back(outQualFileName); outputTypes["qual"].push_back(outQualFileName);  }
889                 if (flow)       { m->openOutputFile(outFlowFileName, outFlow);          outputNames.push_back(outFlowFileName);  outFlow.setf(ios::fixed, ios::floatfield); outFlow.setf(ios::showpoint); outputTypes["flow"].push_back(outFlowFileName);  }
890                 
891                 //read common header
892                 string commonHeader = m->getline(inSFF);
893                 string magicNumber = m->getline(inSFF); 
894                 string version = m->getline(inSFF);
895                 string indexOffset = m->getline(inSFF);
896                 string indexLength = m->getline(inSFF);
897                 int numReads = parseHeaderLineToInt(inSFF);
898                 string headerLength = m->getline(inSFF);
899                 string keyLength = m->getline(inSFF);
900                 int numFlows = parseHeaderLineToInt(inSFF);
901                 string flowgramCode = m->getline(inSFF);
902                 string flowChars = m->getline(inSFF);
903                 string keySequence = m->getline(inSFF);
904                 m->gobble(inSFF);
905                 
906                 string seqName;
907                 
908                 if (flow)       {       outFlow << numFlows << endl;    }
909                 
910                 for(int i=0;i<numReads;i++){
911                         
912                         //sanity check
913                         if (inSFF.eof()) { m->mothurOut("[ERROR]: Expected " + toString(numReads) + " but reached end of file at " + toString(i+1) + "."); m->mothurOutEndLine(); break; }
914                         
915                         Header header;
916                         
917                         //parse read header
918                         inSFF >> seqName;
919                         seqName = seqName.substr(1);
920                         m->gobble(inSFF);
921                         header.name = seqName;
922                         
923                         string runPrefix = parseHeaderLineToString(inSFF);              header.timestamp = runPrefix;
924                         string regionNumber = parseHeaderLineToString(inSFF);   header.region = regionNumber;
925                         string xyLocation = parseHeaderLineToString(inSFF);             header.xy = xyLocation;
926                         m->gobble(inSFF);
927                                 
928                         string runName = parseHeaderLineToString(inSFF);
929                         string analysisName = parseHeaderLineToString(inSFF);
930                         string fullPath = parseHeaderLineToString(inSFF);
931                         m->gobble(inSFF);
932                         
933                         string readHeaderLen = parseHeaderLineToString(inSFF);  convert(readHeaderLen, header.headerLength);
934                         string nameLength = parseHeaderLineToString(inSFF);             convert(nameLength, header.nameLength);
935                         int numBases = parseHeaderLineToInt(inSFF);                             header.numBases = numBases;
936                         string clipQualLeft = parseHeaderLineToString(inSFF);   convert(clipQualLeft, header.clipQualLeft);
937                         int clipQualRight = parseHeaderLineToInt(inSFF);                header.clipQualRight = clipQualRight;
938                         string clipAdapLeft = parseHeaderLineToString(inSFF);   convert(clipAdapLeft, header.clipAdapterLeft);
939                         string clipAdapRight = parseHeaderLineToString(inSFF);  convert(clipAdapRight, header.clipAdapterRight);
940                         m->gobble(inSFF);
941                                 
942                         seqRead read;
943                         
944                         //parse read
945                         vector<unsigned short> flowVector = parseHeaderLineToFloatVector(inSFF, numFlows);      read.flowgram = flowVector;
946                         vector<unsigned int> flowIndices = parseHeaderLineToIntVector(inSFF, numBases); 
947                         
948                         //adjust for print
949                         vector<unsigned int> flowIndicesAdjusted; flowIndicesAdjusted.push_back(flowIndices[0]);
950                         for (int j = 1; j < flowIndices.size(); j++) {   flowIndicesAdjusted.push_back(flowIndices[j] - flowIndices[j-1]);   }
951                         read.flowIndex = flowIndicesAdjusted;
952                         
953                         string bases = parseHeaderLineToString(inSFF);                                                                          read.bases = bases;
954                         vector<unsigned int> qualityScores = parseHeaderLineToIntVector(inSFF, numBases);       read.qualScores = qualityScores;
955                         m->gobble(inSFF);
956                                         
957                         //if you have provided an accosfile and this seq is not in it, then dont print
958                         bool print = true;
959                         if (seqNames.size() != 0) {   if (seqNames.count(header.name) == 0) { print = false; }  }
960                         
961                         //print 
962                         if (print) {
963                                 if (fasta)      {       printFastaSeqData(outFasta, read, header);      }
964                                 if (qual)       {       printQualSeqData(outQual, read, header);        }
965                                 if (flow)       {       printFlowSeqData(outFlow, read, header);        }
966                         }
967                         
968                         //report progress
969                         if((i+1) % 10000 == 0){ m->mothurOut(toString(i+1)); m->mothurOutEndLine();             }
970                         
971                         if (m->control_pressed) {  break;  }
972                 }
973                 
974                 //report progress
975                 if (!m->control_pressed) {   if((numReads) % 10000 != 0){       m->mothurOut(toString(numReads)); m->mothurOutEndLine();                }  }
976                 
977                 inSFF.close();
978                 
979                 if (fasta)      {  outFasta.close();    }
980                 if (qual)       {  outQual.close();             }
981                 if (flow)       {  outFlow.close();             }
982                 
983                 return 0;
984         }
985         catch(exception& e) {
986                 m->errorOut(e, "SffInfoCommand", "parseSffTxt");
987                 exit(1);
988         }
989 }
990 //**********************************************************************************************************************
991
992 int SffInfoCommand::parseHeaderLineToInt(ifstream& file){
993         try {
994                 int number;
995                 
996                 while (!file.eof())     {
997                         
998                         char c = file.get(); 
999                         if (c == ':'){
1000                                 file >> number;
1001                                 break;
1002                         }
1003                         
1004                 }
1005                 m->gobble(file);
1006                 return number;
1007         }
1008         catch(exception& e) {
1009                 m->errorOut(e, "SffInfoCommand", "parseHeaderLineToInt");
1010                 exit(1);
1011         }
1012         
1013 }
1014
1015 //**********************************************************************************************************************
1016
1017 string SffInfoCommand::parseHeaderLineToString(ifstream& file){
1018         try {
1019                 string text;
1020                 
1021                 while (!file.eof())     {
1022                         char c = file.get(); 
1023                         
1024                         if (c == ':'){
1025                                 //m->gobble(file);
1026                                 //text = m->getline(file);      
1027                                 file >> text;
1028                                 break;
1029                         }
1030                 }
1031                 m->gobble(file);
1032                 
1033                 return text;
1034         }
1035         catch(exception& e) {
1036                 m->errorOut(e, "SffInfoCommand", "parseHeaderLineToString");
1037                 exit(1);
1038         }
1039 }
1040
1041 //**********************************************************************************************************************
1042
1043 vector<unsigned short> SffInfoCommand::parseHeaderLineToFloatVector(ifstream& file, int length){
1044         try {
1045                 vector<unsigned short> floatVector(length);
1046                 
1047                 while (!file.eof())     {
1048                         char c = file.get(); 
1049                         if (c == ':'){
1050                                 float temp;
1051                                 for(int i=0;i<length;i++){
1052                                         file >> temp;
1053                                         floatVector[i] = temp * 100;
1054                                 }
1055                                 break;
1056                         }
1057                 }
1058                 m->gobble(file);        
1059                 return floatVector;
1060         }
1061         catch(exception& e) {
1062                 m->errorOut(e, "SffInfoCommand", "parseHeaderLineToFloatVector");
1063                 exit(1);
1064         }
1065 }
1066
1067 //**********************************************************************************************************************
1068
1069 vector<unsigned int> SffInfoCommand::parseHeaderLineToIntVector(ifstream& file, int length){
1070         try {
1071                 vector<unsigned int> intVector(length);
1072                 
1073                 while (!file.eof())     {
1074                         char c = file.get(); 
1075                         if (c == ':'){
1076                                 for(int i=0;i<length;i++){
1077                                         file >> intVector[i];
1078                                 }
1079                                 break;
1080                         }
1081                 }
1082                 m->gobble(file);        
1083                 return intVector;
1084         }
1085         catch(exception& e) {
1086                 m->errorOut(e, "SffInfoCommand", "parseHeaderLineToIntVector");
1087                 exit(1);
1088         }
1089 }
1090
1091 //**********************************************************************************************************************
1092
1093
1094                                 
1095