]> git.donarmstrong.com Git - mothur.git/blob - makecontigscommand.h
added dups parameter to chimera.uchime. working on make.contigs command.
[mothur.git] / makecontigscommand.h
1 #ifndef Mothur_makecontigscommand_h
2 #define Mothur_makecontigscommand_h
3
4 //
5 //  makecontigscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 5/15/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12 #include "command.hpp"
13 #include "sequence.hpp"
14 #include "qualityscores.h"
15 #include "alignment.hpp"
16 #include "gotohoverlap.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "blastalign.hpp"
19 #include "noalign.hpp"
20 #include "trimoligos.h"
21
22 struct fastqRead {
23         vector<int> scores;
24         string name;
25         string sequence;
26         
27         fastqRead() { name = ""; sequence = ""; scores.clear(); };
28         fastqRead(string n, string s, vector<int> sc) : name(n), sequence(s), scores(sc) {};
29         ~fastqRead() {};
30 };
31
32 /**************************************************************************************************/
33
34 class MakeContigsCommand : public Command {
35 public:
36     MakeContigsCommand(string);
37     MakeContigsCommand();
38     ~MakeContigsCommand(){}
39     
40     vector<string> setParameters();
41     string getCommandName()                     { return "make.contigs";                        }
42     string getCommandCategory()         { return "Sequence Processing";         } 
43     //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
44     string getOutputFileNameTag(string, string);
45         string getHelpString(); 
46     string getCitation() { return "http://www.mothur.org/wiki/Make.contigs"; }
47     string getDescription()             { return "description"; }
48     
49     int execute(); 
50     void help() { m->mothurOut(getHelpString()); }      
51     
52 private:
53     bool abort, allFiles;
54     string outputDir, ffastqfile, rfastqfile, align, oligosfile;
55         float match, misMatch, gapOpen, gapExtend;
56         int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
57     vector<string> outputNames;
58     
59     map<int, oligosPair> barcodes;
60         map<int, oligosPair> primers;
61     vector<string>  linker;
62     vector<string>  spacer;
63         vector<string> primerNameVector;        
64         vector<string> barcodeNameVector;       
65     
66         map<string, int> groupCounts;  
67     //map<string, int> combos;
68         //map<string, int> groupToIndex;
69     //vector<string> groupVector;
70     
71     fastqRead readFastq(ifstream&);
72     vector< vector<string> > readFastqFiles(int&);
73     bool checkReads(fastqRead&, fastqRead&);
74     int createProcesses(vector< vector<string> >, string, string, string);
75     int driver(vector<string>, string, string, string);
76     bool getOligos(vector<vector<string> >&, vector<vector<string> >&);
77     string reverseOligo(string);
78 };
79
80 /**************************************************************************************************/
81
82 /**************************************************************************************************/
83 //custom data structure for threads to use.
84 // This is passed by void pointer so it can be any data type
85 // that can be passed using a single void pointer (LPVOID).
86 struct contigsData {
87         string outputFasta; 
88         string outputQual; 
89         string outputMisMatches;
90         string align;
91     vector<string> files;
92         MothurOut* m;
93         float match, misMatch, gapOpen, gapExtend;
94         int count, threshold, threadID;
95         
96         contigsData(){}
97         contigsData(vector<string> f, string of, string oq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int tid) {
98         files = f;
99                 outputFasta = of;
100         outputQual = oq;
101         outputMisMatches = om;
102         m = mout;
103                 match = ma; 
104                 misMatch = misMa;
105                 gapOpen = gapO; 
106                 gapExtend = gapE; 
107         threshold = thr;
108                 align = al;
109                 count = 0;
110                 threadID = tid;
111         }
112 };
113
114 /**************************************************************************************************/
115 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
116 #else
117 static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ 
118         contigsData* pDataArray;
119         pDataArray = (contigsData*)lpParam;
120         
121         try {
122         int longestBase = 1000;
123         Alignment* alignment;
124         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase);                 }
125                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);                            }
126         
127         int num = 0;
128         string thisffastafile = pDataArray->files[0];
129         string thisfqualfile = pDataArray->files[1];
130         string thisrfastafile = pDataArray->files[2];
131         string thisrqualfile = pDataArray->files[3];
132         
133         if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
134         
135         ifstream inFFasta, inRFasta, inFQual, inRQual;
136         pDataArray->m->openInputFile(thisffastafile, inFFasta);
137         pDataArray->m->openInputFile(thisfqualfile, inFQual);
138         pDataArray->m->openInputFile(thisrfastafile, inRFasta);
139         pDataArray->m->openInputFile(thisrqualfile, inRQual);
140         
141         ofstream outFasta, outQual, outMisMatch;
142         pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
143         pDataArray->m->openOutputFile(pDataArray->outputQual, outQual);
144         pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
145         outMisMatch << "Name\tLength\tMisMatches\n";
146         
147         while ((!inFQual.eof()) && (!inFFasta.eof()) && (!inRFasta.eof()) && (!inRQual.eof())) {
148             
149             if (pDataArray->m->control_pressed) { break; }
150             
151             //read seqs and quality info
152             Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
153             Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
154             QualityScores fQual(inFQual); pDataArray->m->gobble(inFQual);
155             QualityScores rQual(inRQual); pDataArray->m->gobble(inRQual);
156             
157             //flip the reverse reads
158             rSeq.reverseComplement();
159             rQual.flipQScores();
160             
161             //pairwise align
162             alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
163             map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
164             map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
165             fSeq.setAligned(alignment->getSeqAAln());
166             rSeq.setAligned(alignment->getSeqBAln());
167             int length = fSeq.getAligned().length();
168             
169             //traverse alignments merging into one contiguous seq
170             string contig = "";
171             vector<int> contigScores; 
172             int numMismatches = 0;
173             string seq1 = fSeq.getAligned();
174             string seq2 = rSeq.getAligned();
175             
176             vector<int> scores1 = fQual.getQualityScores();
177             vector<int> scores2 = rQual.getQualityScores();
178             
179             for (int i = 0; i < length; i++) {
180                 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
181                     contig += seq1[i];
182                     contigScores.push_back(scores1[ABaseMap[i]]);
183                     if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; }
184                 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
185                     if (scores2[BBaseMap[i]] >= pDataArray->threshold) {
186                         contig += seq2[i];
187                         contigScores.push_back(scores2[BBaseMap[i]]);
188                     }
189                 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
190                     if (scores1[ABaseMap[i]] >= pDataArray->threshold) { 
191                         contig += seq1[i];
192                         contigScores.push_back(scores1[ABaseMap[i]]);
193                     }
194                 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
195                     char c = seq1[i];
196                     contigScores.push_back(scores1[ABaseMap[i]]);
197                     if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; c = seq2[i]; }
198                     contig += c;
199                     numMismatches++;
200                 }else { //should never get here
201                     pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
202                 }
203             }
204             
205             //output
206             outFasta << ">" << fSeq.getName() << endl << contig << endl;
207             outQual << ">" << fSeq.getName() << endl;
208             for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
209             outQual << endl;
210             outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
211             
212             num++;
213             
214                         //report progress
215                         if((num) % 1000 == 0){  pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine();             }
216                 }
217         
218                 //report progress
219                 if((num) % 1000 != 0){  pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine();             }
220         
221         inFFasta.close();
222         inFQual.close();
223         inRFasta.close();
224         inRQual.close();
225         outFasta.close();
226         outQual.close();
227         outMisMatch.close();
228         delete alignment;
229         
230         if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta);  pDataArray->m->mothurRemove(pDataArray->outputMisMatches);}
231         
232         return 0;
233                 
234         }
235         catch(exception& e) {
236                 pDataArray->m->errorOut(e, "AlignCommand", "MyContigsThreadFunction");
237                 exit(1);
238         }
239
240 #endif
241
242
243 #endif