5 * Created by Pat Schloss on 2/6/10.
6 * Copyright 2010 Patrick D. Schloss. All rights reserved.
10 #include "parsesffcommand.h"
11 #include "sequence.hpp"
13 //**********************************************************************************************************************
15 ParseSFFCommand::ParseSFFCommand(string option){
19 if(option == "help") {
24 //valid paramters for this command
25 string Array[] = {"sff", "oligos", "minlength", "outputdir", "inputdir"};
26 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28 OptionParser parser(option);
29 map<string,string> parameters = parser.getParameters();
31 ValidParameters validParameter;
32 map<string,string>::iterator it;
34 //check to make sure all parameters are valid for command
35 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //if the user changes the input directory command factory will send this info to us in the output parameter
40 string inputDir = validParameter.validFile(parameters, "inputdir", false);
41 if (inputDir == "not found"){ inputDir = ""; }
44 it = parameters.find("sff");
45 //user has given a template file
46 if(it != parameters.end()){
47 path = hasPath(it->second);
48 //if the user has not given a path then, add inputdir. else leave path alone.
49 if (path == "") { parameters["sff"] = inputDir + it->second; }
52 it = parameters.find("oligos");
53 //user has given an oligos file
54 if(it != parameters.end()){
55 path = hasPath(it->second);
56 //if the user has not given a path then, add inputdir. else leave path alone.
57 if (path == "") { parameters["oligos"] = inputDir + it->second; }
62 //check for required parameters
63 sffFile = validParameter.validFile(parameters, "sff", true);
64 if (sffFile == "not found"){
65 m->mothurOut("sff is a required parameter for the parse.sff command.");
66 m->mothurOutEndLine();
69 else if (sffFile == "not open") { abort = true; }
71 //if the user changes the output directory command factory will send this info to us in the output parameter
72 outputDir = validParameter.validFile(parameters, "outputdir", false);
73 if (outputDir == "not found"){
75 outputDir += hasPath(sffFile); //if user entered a file with a path then preserve it
78 //check for optional parameter and set defaults
79 // ...at some point should added some additional type checking...
80 oligoFile = validParameter.validFile(parameters, "oligos", true);
81 if (oligoFile == "not found") { oligoFile = ""; }
82 else if(oligoFile == "not open"){ abort = true; }
84 string temp = validParameter.validFile(parameters, "minlength", false);
85 if (temp == "not found") { temp = "0"; }
86 convert(temp, minLength);
90 m->errorOut(e, "ParseSFFCommand", "ParseSFFCommand");
95 //**********************************************************************************************************************
97 ParseSFFCommand::~ParseSFFCommand() { /* do nothing */ }
99 //**********************************************************************************************************************
101 int ParseSFFCommand::execute(){
103 if (abort == true) { return 0; }
106 openInputFile(sffFile, inSFF);
108 cout.setf(ios::fixed, ios::floatfield);
109 cout.setf(ios::showpoint);
110 cout << setprecision(2);
112 vector<ofstream*> flowFileNames;
114 getOligos(flowFileNames);
117 flowFileNames.push_back(new ofstream((outputDir + getRootName(getSimpleName(sffFile)) + "flow").c_str(), ios::ate));
118 outputNames.push_back((outputDir + getRootName(getSimpleName(sffFile)) + "flow"));
121 for(int i=0;i<flowFileNames.size();i++){
122 flowFileNames[i]->setf(ios::fixed, ios::floatfield);
123 flowFileNames[i]->setf(ios::showpoint);
124 *flowFileNames[i] << setprecision(2);
127 if (m->control_pressed) { for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close(); } return 0; }
129 // ofstream fastaFile;
130 // openOutputFile(getRootName(sffFile) + "fasta", fastaFile);
132 // ofstream qualFile;
133 // openOutputFile(getRootName(sffFile) + "qual", qualFile);
135 string commonHeader = getline(inSFF);
136 string magicNumber = getline(inSFF);
137 string version = getline(inSFF);
138 string indexOffset = getline(inSFF);
139 string indexLength = getline(inSFF);
140 int numReads = parseHeaderLineToInt(inSFF);
141 string headerLength = getline(inSFF);
142 string keyLength = getline(inSFF);
143 int numFlows = parseHeaderLineToInt(inSFF);
144 string flowgramCode = getline(inSFF);
145 string flowChars = getline(inSFF);
146 string keySequence = getline(inSFF);
152 for(int i=0;i<numReads;i++){
154 if (m->control_pressed) { for(int i=0;i<flowFileNames.size();i++){ flowFileNames[i]->close(); } return 0; }
157 seqName = seqName.substr(1);
160 string runPrefix = parseHeaderLineToString(inSFF);
161 string regionNumber = parseHeaderLineToString(inSFF);
162 string xyLocation = parseHeaderLineToString(inSFF);
165 string runName = parseHeaderLineToString(inSFF);
166 string analysisName = parseHeaderLineToString(inSFF);
167 string fullPath = parseHeaderLineToString(inSFF);
170 string readHeaderLen = parseHeaderLineToString(inSFF);
171 string nameLength = parseHeaderLineToString(inSFF);
172 int numBases = parseHeaderLineToInt(inSFF);
173 string clipQualLeft = parseHeaderLineToString(inSFF);
174 int clipQualRight = parseHeaderLineToInt(inSFF);
175 string clipAdapLeft = parseHeaderLineToString(inSFF);
176 string clipAdapRight = parseHeaderLineToString(inSFF);
179 vector<float> flowVector = parseHeaderLineToFloatVector(inSFF, numFlows);
180 vector<int> flowIndices = parseHeaderLineToIntVector(inSFF, numBases);
181 string bases = parseHeaderLineToString(inSFF);
182 string qualityScores = parseHeaderLineToString(inSFF);
187 int flowLength = flowIndices[clipQualRight-1];
189 screenFlow(flowVector, flowLength);
190 string sequence = flow2seq(flowVector, flowLength);
194 if(minLength != 0 || numFPrimers != 0 || numBarcodes != 0 || numRPrimers != 0){
195 good = screenSeq(sequence, group);
199 *flowFileNames[group] << seqName << ' ' << flowLength;
200 for(int i=0;i<numFlows;i++){
201 *flowFileNames[group] << ' ' << flowVector[i];
203 *flowFileNames[group] << endl;
206 // string fastaHeader = '>' + seqName + "\tregion=" + regionNumber + " xy=" + xyLocation;
207 // fastaFile << fastaHeader << endl;
208 // fastaFile << stripSeqQual(bases, clipQualLeft, clipQualRight) << endl;
210 // qualFile << fastaHeader << endl;
211 // qualFile << stripQualQual(qualityScores, clipQualLeft, clipQualRight) << endl;
214 for(int i=0;i<flowFileNames.size();i++){
215 flowFileNames[i]->close();
218 m->mothurOutEndLine();
219 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
220 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
221 m->mothurOutEndLine();
223 // fastaFile.close();
228 catch(exception& e) {
229 m->errorOut(e, "ParseSFFCommand", "execute");
234 //**********************************************************************************************************************
236 void ParseSFFCommand::help(){
238 m->mothurOut("The parse.sff command...");
239 m->mothurOutEndLine();
241 catch(exception& e) {
242 m->errorOut(e, "ParseSFFCommand", "help");
247 //**********************************************************************************************************************
249 void ParseSFFCommand::getOligos(vector<ofstream*>& outSFFFlowVec){
253 openInputFile(oligoFile, inOligos);
255 string type, oligo, group;
259 while(!inOligos.eof()){
262 if(type[0] == '#'){ getline(inOligos); } // get rest of line if there's any crap there
266 for(int i=0;i<oligo.length();i++){
267 oligo[i] = toupper(oligo[i]);
268 if(oligo[i] == 'U') { oligo[i] = 'T'; }
270 if(type == "forward"){
271 forPrimer.push_back(oligo);
273 else if(type == "reverse"){
274 Sequence oligoRC("reverse", oligo);
275 oligoRC.reverseComplement();
276 revPrimer.push_back(oligoRC.getUnaligned());
278 else if(type == "barcode"){
280 barcodes[oligo]=index++;
281 groupVector.push_back(group);
283 outSFFFlowVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(sffFile)) + group + ".flow").c_str(), ios::ate));
284 outputNames.push_back((outputDir + getRootName(getSimpleName(sffFile)) + group + "flow"));
292 numFPrimers = forPrimer.size();
293 numRPrimers = revPrimer.size();
294 numBarcodes = barcodes.size();
296 catch(exception& e) {
297 m->errorOut(e, "ParseSFFCommand", "getOligos");
303 //**********************************************************************************************************************
305 int ParseSFFCommand::parseHeaderLineToInt(ifstream& file){
309 while (!file.eof()) {
322 //**********************************************************************************************************************
324 string ParseSFFCommand::parseHeaderLineToString(ifstream& file){
328 while (!file.eof()) {
333 text = getline(file);
342 //**********************************************************************************************************************
344 vector<float> ParseSFFCommand::parseHeaderLineToFloatVector(ifstream& file, int length){
346 vector<float> floatVector(length);
348 while (!file.eof()) {
351 for(int i=0;i<length;i++){
352 file >> floatVector[i];
361 //**********************************************************************************************************************
363 vector<int> ParseSFFCommand::parseHeaderLineToIntVector(ifstream& file, int length){
365 vector<int> intVector(length);
367 while (!file.eof()) {
370 for(int i=0;i<length;i++){
371 file >> intVector[i];
380 //**********************************************************************************************************************
383 void ParseSFFCommand::screenFlow(vector<float> flowgram, int& length){
388 while(newLength * 4 < length){
392 for(int i=0;i<4;i++){
393 float flow = flowgram[i + 4 * newLength];
397 if(flow <= 0.69){ // not sure why, but if i make it <0.70 it doesn't work...
402 if(noise > 0 || signal == 0){
407 length = newLength * 4;
410 catch(exception& e) {
411 m->errorOut(e, "ParseSFFCommand", "screenFlow");
416 //**********************************************************************************************************************
418 string ParseSFFCommand::flow2seq(vector<float> flowgram, int length){
420 string flow = "TACG";
421 string sequence = "";
422 for(int i=8;i<length;i++){
423 int signal = int(flowgram[i] + 0.5);
424 char base = flow[ i % 4 ];
425 for(int j=0;j<signal;j++){
432 //**********************************************************************************************************************
434 bool ParseSFFCommand::screenSeq(string& sequence, int& group){
439 if(sequence.length() < minLength){ length = 0; }
442 int barcodeLength = 0;
444 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
445 if(compareDNASeq(it->first, sequence.substr(0,(it->first).length()))){
447 barcodeLength = (it->first).size();
457 for(int i=0;i<numFPrimers;i++){
458 if(compareDNASeq(forPrimer[i], sequence.substr(barcodeLength,forPrimer[i].length()))){
468 for(int i=0;i<numRPrimers;i++){
469 if(compareDNASeq(revPrimer[i], sequence.substr(sequence.length()-revPrimer[i].length(),revPrimer[i].length()))){
478 return fPrimer * rPrimer * length * barcode;
482 //**********************************************************************************************************************
484 bool ParseSFFCommand::compareDNASeq(string oligo, string seq){
487 int length = oligo.length();
489 for(int i=0;i<length;i++){
491 if(oligo[i] != seq[i]){
492 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
493 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
494 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
495 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
496 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
497 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
498 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
499 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
500 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
501 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
502 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
503 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
505 if(success == 0) { break; }
514 catch(exception& e) {
515 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
520 //**********************************************************************************************************************
522 //string ParseSFFCommand::stripSeqQual(string qScores, int start, int end){
525 // return qScores.substr(start-1, end-start+1);
529 //**********************************************************************************************************************
531 //string ParseSFFCommand::stripQualQual(string qScores, int start, int end){
535 // int startCount = 0;
536 // int startIndex = 0;
538 // while(startCount < start && startIndex < qScores.length()){
539 // if(isspace(qScores[startIndex])){
545 // int endCount = startCount;
546 // int endIndex = startIndex;
548 // while(endCount < end && endIndex < qScores.length()){
549 // if(isspace(qScores[endIndex])){
555 // return qScores.substr(startIndex, endIndex-startIndex-1);//, endCount-startCount);
559 //**********************************************************************************************************************