]> git.donarmstrong.com Git - mothur.git/blobdiff - makecontigscommand.cpp
added get.dists and remove.dists commands. fixed bug in trim.seqs windows paralelliza...
[mothur.git] / makecontigscommand.cpp
index 15ffb07294eed6d1e8b5109744e883d01fa4abaa..74133bbefc4abfb2f453893aa8ff2c39db4f0320 100644 (file)
@@ -33,6 +33,7 @@ vector<string> MakeContigsCommand::setParameters(){
                CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend);
         CommandParameter pthreshold("threshold", "Number", "", "40", "", "", "","",false,false); parameters.push_back(pthreshold);
                CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+        CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat);
                CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
                CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
                
@@ -51,13 +52,14 @@ string MakeContigsCommand::getHelpString(){
                string helpString = "";
                helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta.  It will also provide new quality files if the fastq or file parameter is used.\n";
         helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n";
-               helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n";
+               helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, format, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n";
                helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n";
         helpString += "The file parameter is 2 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column.  Mothur will process each pair and create a combined fasta and qual file with all the sequences.\n";
         helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process.  If you provide one, you must provide the other.\n";
         helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process.  If you provide one, you must provide the other.\n";
         helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters.  If you provide one, you must provide the other.\n";
-               helpString += "The align parameter allows you to specify the alignment method to use.  Your options are: gotoh and needleman. The default is needleman.\n";
+               helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa or illumina, default=sanger.\n";
+        helpString += "The align parameter allows you to specify the alignment method to use.  Your options are: gotoh and needleman. The default is needleman.\n";
         helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
                helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
                helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
@@ -317,6 +319,19 @@ MakeContigsCommand::MakeContigsCommand(string option)  {
                        
                        align = validParameter.validFile(parameters, "align", false);           if (align == "not found"){      align = "needleman";    }
                        if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; }
+            
+            format = validParameter.validFile(parameters, "format", false);            if (format == "not found"){     format = "sanger";      }
+            
+            if ((format != "sanger") && (format != "illumina") && (format != "solexa"))  { 
+                               m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine();
+                               abort=true;
+                       }
+            
+            //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference.
+            for (int i = -64; i < 65; i++) { 
+                char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499));
+                convertTable.push_back(temp);
+            }
         }
                
        }
@@ -628,8 +643,10 @@ int MakeContigsCommand::createProcesses(vector< vector<string> > files, string o
                                                                tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
                                                                m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
                                 
-                                tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
-                                m->openOutputFile(tempPrimerQualFileNames[i][j], temp);                temp.close();
+                                if (files[processors-1][1] != "") {
+                                    tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
+                                    m->openOutputFile(tempPrimerQualFileNames[i][j], temp);            temp.close();
+                                }
                                                        }
                                                }
                                        }
@@ -750,9 +767,10 @@ int MakeContigsCommand::createProcesses(vector< vector<string> > files, string o
                             tempFASTAFileNames[i][j] += extension;
                             m->openOutputFile(tempFASTAFileNames[i][j], temp);                 temp.close();
                             
-                            
-                            tempPrimerQualFileNames[i][j] += extension;
-                            m->openOutputFile(tempPrimerQualFileNames[i][j], temp);            temp.close();
+                            if (files[processors-1][1] != "") {
+                                tempPrimerQualFileNames[i][j] += extension;
+                                m->openOutputFile(tempPrimerQualFileNames[i][j], temp);                temp.close();
+                            }
                         }
                     }
                 }
@@ -778,9 +796,10 @@ int MakeContigsCommand::createProcesses(vector< vector<string> > files, string o
                         tempFASTAFileNames[i][j] += extension;
                         m->openOutputFile(tempFASTAFileNames[i][j], temp);                     temp.close();
                         
-                        
-                        tempPrimerQualFileNames[i][j] += extension;
-                        m->openOutputFile(tempPrimerQualFileNames[i][j], temp);                temp.close();
+                        if (files[processors-1][1] != "") {
+                            tempPrimerQualFileNames[i][j] += extension;
+                            m->openOutputFile(tempPrimerQualFileNames[i][j], temp);            temp.close();
+                        }
                     }
                 }
             }
@@ -796,6 +815,7 @@ int MakeContigsCommand::createProcesses(vector< vector<string> > files, string o
                }
         
         //do my part
+        processIDS.push_back(processors-1);
                num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputQual+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputScrapQual+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames);       
         
                //Wait until all threads have terminated.
@@ -804,6 +824,9 @@ int MakeContigsCommand::createProcesses(vector< vector<string> > files, string o
                //Close all thread handles and free memory allocations.
                for(int i=0; i < pDataArray.size(); i++){
                        num += pDataArray[i]->count;
+            if (!pDataArray[i]->done) {
+                m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of sequences assigned to it, quitting. \n"); m->control_pressed = true; 
+            }
             for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
                 map<string, int>::iterator it2 = groupCounts.find(it->first);
                 if (it2 == groupCounts.end()) {        groupCounts[it->first] = it->second; }
@@ -1032,8 +1055,12 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
                 }
                 
             }
-
+            
             if(trashCode.length() == 0){
+                bool ignore = false;
+                
+                if (m->debug) { m->mothurOut(fSeq.getName()); }
+                
                 if (createGroup) {
                     if(barcodes.size() != 0){
                         string thisGroup = barcodeNameVector[barcodeIndex];
@@ -1049,16 +1076,20 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
                         
                         if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
                         
-                        groupMap[fSeq.getName()] = thisGroup; 
+                        int pos = thisGroup.find("ignore");
+                        if (pos == string::npos) {
+                            groupMap[fSeq.getName()] = thisGroup; 
                         
-                        map<string, int>::iterator it = groupCounts.find(thisGroup);
-                        if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
-                        else { groupCounts[it->first] ++; }
+                            map<string, int>::iterator it = groupCounts.find(thisGroup);
+                            if (it == groupCounts.end()) {     groupCounts[thisGroup] = 1; }
+                            else { groupCounts[it->first] ++; }
+                        }else { ignore = true; }
                         
                     }
                 }
+                if (m->debug) { m->mothurOut("\n"); }
                 
-                if(allFiles){
+                if(allFiles && !ignore){
                     ofstream output;
                     m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
                     output << ">" << fSeq.getName() << endl << contig << endl;
@@ -1495,15 +1526,8 @@ fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){
         if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } }
         if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; }
         
-        vector<int> qualScores;
-               int controlChar = int('!');
-               for (int i = 0; i < quality.length(); i++) { 
-                       int temp = int(quality[i]);
-                       temp -= controlChar;
-                       
-                       qualScores.push_back(temp);
-               }
-    
+        vector<int> qualScores = convertQual(quality);
+        
         read.name = name;
         read.sequence = sequence;
         read.scores = qualScores;
@@ -1657,7 +1681,7 @@ bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, vect
                while(!in.eof()){
             
                        in >> type; 
-            
+            cout << type << endl;
                        if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }      
             
                        if(type[0] == '#'){
@@ -1740,6 +1764,7 @@ bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, vect
                         
                     barcodes[indexBarcode]=newPair; indexBarcode++;
                                        barcodeNameVector.push_back(group);
+                    cout << group << endl;
                                }else if(type == "LINKER"){
                                        linker.push_back(foligo);
                     m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n");
@@ -1781,46 +1806,49 @@ bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, vect
                                        
                                        string primerName = primerNameVector[itPrimer->first];
                                        string barcodeName = barcodeNameVector[itBar->first];
-                                       
-                                       string comboGroupName = "";
-                                       string fastaFileName = "";
-                                       string qualFileName = "";
-                                       string nameFileName = "";
-                    string countFileName = "";
-                                       
-                                       if(primerName == ""){
-                                               comboGroupName = barcodeNameVector[itBar->first];
-                                       }
-                                       else{
-                                               if(barcodeName == ""){
-                                                       comboGroupName = primerNameVector[itPrimer->first];
-                                               }
-                                               else{
-                                                       comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first];
-                                               }
-                                       }
-                                       
-                                       
-                                       ofstream temp;
-                                       fastaFileName = rootname + comboGroupName + ".fasta";
-                                       if (uniqueNames.count(fastaFileName) == 0) {
-                                               outputNames.push_back(fastaFileName);
-                                               outputTypes["fasta"].push_back(fastaFileName);
-                                               uniqueNames.insert(fastaFileName);
-                                       }
-                                       
-                                       fastaFileNames[itBar->first][itPrimer->first] = fastaFileName;
-                                       m->openOutputFile(fastaFileName, temp);         temp.close();
-                                       
-                    if ((fqualfile != "") || (ffastqfile != "") || (file != "")) {
-                        qualFileName = rootname + ".qual";
-                        if (uniqueNames.count(qualFileName) == 0) {
-                            outputNames.push_back(qualFileName);
-                            outputTypes["qfile"].push_back(qualFileName);
+                    
+                    if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing 
+                                       else {
+                        string comboGroupName = "";
+                        string fastaFileName = "";
+                        string qualFileName = "";
+                        string nameFileName = "";
+                        string countFileName = "";
+                        
+                        if(primerName == ""){
+                            comboGroupName = barcodeNameVector[itBar->first];
+                        }
+                        else{
+                            if(barcodeName == ""){
+                                comboGroupName = primerNameVector[itPrimer->first];
+                            }
+                            else{
+                                comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first];
+                            }
+                        }
+                        
+                        
+                        ofstream temp;
+                        fastaFileName = rootname + comboGroupName + ".fasta";
+                        if (uniqueNames.count(fastaFileName) == 0) {
+                            outputNames.push_back(fastaFileName);
+                            outputTypes["fasta"].push_back(fastaFileName);
+                            uniqueNames.insert(fastaFileName);
+                        }
+                        
+                        fastaFileNames[itBar->first][itPrimer->first] = fastaFileName;
+                        m->openOutputFile(fastaFileName, temp);                temp.close();
+                        
+                        if ((fqualfile != "") || (ffastqfile != "") || (file != "")) {
+                            qualFileName = rootname + ".qual";
+                            if (uniqueNames.count(qualFileName) == 0) {
+                                outputNames.push_back(qualFileName);
+                                outputTypes["qfile"].push_back(qualFileName);
+                            }
+                            
+                            qualFileNames[itBar->first][itPrimer->first] = qualFileName;
+                            m->openOutputFile(qualFileName, temp);             temp.close();
                         }
-                                               
-                        qualFileNames[itBar->first][itPrimer->first] = qualFileName;
-                        m->openOutputFile(qualFileName, temp);         temp.close();
                     }
                                }
                        }
@@ -1895,6 +1923,34 @@ string MakeContigsCommand::reverseOligo(string oligo){
        }
 }
 //**********************************************************************************************************************
+vector<int> MakeContigsCommand::convertQual(string qual) {
+       try {
+               vector<int> qualScores;
+               
+               for (int i = 0; i < qual.length(); i++) { 
+            
+            int temp = 0;
+            temp = int(qual[i]);
+            if (format == "illumina") {
+                temp -= 64; //char '@'
+            }else if (format == "solexa") {
+                temp = int(convertTable[temp]); //convert to sanger
+                temp -= int('!'); //char '!'
+            }else {
+                temp -= int('!'); //char '!'
+            }
+                       qualScores.push_back(temp);
+               }
+               
+               return qualScores;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeContigsCommand", "convertQual");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************