]> git.donarmstrong.com Git - mothur.git/blob - makecontigscommand.h
Merge remote-tracking branch 'mothur/master'
[mothur.git] / makecontigscommand.h
1 #ifndef Mothur_makecontigscommand_h
2 #define Mothur_makecontigscommand_h
3
4 //
5 //  makecontigscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 5/15/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12 #include "command.hpp"
13 #include "sequence.hpp"
14 #include "qualityscores.h"
15 #include "alignment.hpp"
16 #include "gotohoverlap.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "blastalign.hpp"
19 #include "noalign.hpp"
20 #include "trimoligos.h"
21
22 struct fastqRead {
23         vector<int> scores;
24         string name;
25         string sequence;
26         
27         fastqRead() { name = ""; sequence = ""; scores.clear(); };
28         fastqRead(string n, string s, vector<int> sc) : name(n), sequence(s), scores(sc) {};
29         ~fastqRead() {};
30 };
31
32 struct pairFastqRead {
33         fastqRead forward;
34     fastqRead reverse;
35         
36         pairFastqRead() {};
37         pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
38         ~pairFastqRead() {};
39 };
40 /**************************************************************************************************/
41
42 class MakeContigsCommand : public Command {
43 public:
44     MakeContigsCommand(string);
45     MakeContigsCommand();
46     ~MakeContigsCommand(){}
47     
48     vector<string> setParameters();
49     string getCommandName()                     { return "make.contigs";                        }
50     string getCommandCategory()         { return "Sequence Processing";         } 
51     //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
52     
53         string getHelpString(); 
54     string getOutputPattern(string);    
55     string getCitation() { return "http://www.mothur.org/wiki/Make.contigs"; }
56     string getDescription()             { return "description"; }
57     
58     int execute(); 
59     void help() { m->mothurOut(getHelpString()); }      
60     
61 private:
62     bool abort, allFiles, createGroup, trimOverlap;
63     string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file, format;
64         float match, misMatch, gapOpen, gapExtend;
65         int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq;
66     vector<string> outputNames;
67     
68     map<int, oligosPair> barcodes;
69         map<int, oligosPair> primers;
70     vector<string>  linker;
71     vector<string>  spacer;
72         vector<string> primerNameVector;        
73         vector<string> barcodeNameVector;
74         vector<char> convertTable;
75     
76         map<string, int> groupCounts; 
77     map<string, string> groupMap;
78     
79     vector<int> convertQual(string);
80     fastqRead readFastq(ifstream&, bool&);
81     vector< vector< vector<string> > > preProcessData(unsigned long int&);
82     vector< vector<string> > readFileNames(string);
83     vector< vector<string> > readFastqFiles(unsigned long int&, string, string);
84     vector< vector<string> > readFastaFiles(unsigned long int&, string, string);
85     //bool checkReads(fastqRead&, fastqRead&, string, string);
86     int createProcesses(vector< vector<string> >, string, string, string, vector<vector<string> >);
87     int driver(vector<string>, string, string, string, vector<vector<string> >, int);
88     bool getOligos(vector<vector<string> >&, string);
89     string reverseOligo(string);
90     vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
91 };
92
93 /**************************************************************************************************/
94
95 /**************************************************************************************************/
96 //custom data structure for threads to use.
97 // This is passed by void pointer so it can be any data type
98 // that can be passed using a single void pointer (LPVOID).
99 struct contigsData {
100         string outputFasta; 
101     string outputScrapFasta; 
102         string outputMisMatches;
103         string align;
104     vector<string> files;
105     vector<vector<string> > fastaFileNames;
106         MothurOut* m;
107         float match, misMatch, gapOpen, gapExtend;
108         int count, insert, threadID, pdiffs, bdiffs, tdiffs, deltaq;
109     bool allFiles, createGroup, done, trimOverlap;
110     map<string, int> groupCounts; 
111     map<string, string> groupMap;
112     vector<string> primerNameVector;    
113         vector<string> barcodeNameVector;
114     map<int, oligosPair> barcodes;
115         map<int, oligosPair> primers;
116         
117         contigsData(){}
118         contigsData(vector<string> f, string of, string osf, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int delt, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool all, bool to, int tid) {
119         files = f;
120                 outputFasta = of;
121         outputMisMatches = om;
122         m = mout;
123                 match = ma; 
124                 misMatch = misMa;
125                 gapOpen = gapO; 
126                 gapExtend = gapE; 
127         insert = thr;
128                 align = al;
129                 count = 0;
130         outputScrapFasta = osf;
131         fastaFileNames = ffn;
132         barcodes = br;
133         primers = pr;
134         barcodeNameVector = bnv;
135         primerNameVector = pnv;
136         pdiffs = pdf;
137         bdiffs = bdf;
138         tdiffs = tdf;
139         allFiles = all;
140         trimOverlap = to;
141         createGroup = cg;
142                 threadID = tid;
143         deltaq = delt;
144         done=false;
145         }
146 };
147
148 /**************************************************************************************************/
149 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
150 #else
151 static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ 
152         contigsData* pDataArray;
153         pDataArray = (contigsData*)lpParam;
154         
155         try {
156         int longestBase = 1000;
157         Alignment* alignment;
158         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase);                 }
159                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);                            }
160         
161         pDataArray->count = 0;
162         string thisffastafile = pDataArray->files[0];
163         string thisfqualfile = pDataArray->files[1];
164         string thisrfastafile = pDataArray->files[2];
165         string thisrqualfile = pDataArray->files[3];
166         
167         if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
168         
169                 if(pDataArray->allFiles){
170                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
171                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
172                                         if (pDataArray->fastaFileNames[i][j] != "") {
173                                                 ofstream temp;
174                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
175                                         }
176                                 }
177                         }
178                 }
179         
180         ifstream inFFasta, inRFasta, inFQual, inRQual;
181         ofstream outFasta, outMisMatch, outScrapFasta;
182         pDataArray->m->openInputFile(thisffastafile, inFFasta);
183         pDataArray->m->openInputFile(thisrfastafile, inRFasta);
184         if (thisfqualfile != "") {
185             pDataArray->m->openInputFile(thisfqualfile, inFQual);
186             pDataArray->m->openInputFile(thisrqualfile, inRQual);
187         }
188         pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
189         pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
190         pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
191         
192         if (pDataArray->threadID == 0) {  outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n";  }
193         
194         TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes);
195         
196         while ((!inFFasta.eof()) && (!inRFasta.eof())) {
197             
198             if (pDataArray->m->control_pressed) { break; }
199             
200             int success = 1;
201             string trashCode = "";
202             int currentSeqsDiffs = 0;
203             
204             //read seqs and quality info
205             Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
206             Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
207             QualityScores* fQual = NULL; QualityScores* rQual = NULL;
208             if (thisfqualfile != "") {
209                 fQual = new QualityScores(inFQual); pDataArray->m->gobble(inFQual);
210                 rQual = new QualityScores(inRQual); pDataArray->m->gobble(inRQual);
211             }
212             
213             int barcodeIndex = 0;
214             int primerIndex = 0;
215             
216             if(pDataArray->barcodes.size() != 0){
217                 if (thisfqualfile != "") {
218                     success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
219                 }else {
220                     success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
221                 }
222                 if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
223                 else{ currentSeqsDiffs += success;  }
224             }
225             
226             if(pDataArray->primers.size() != 0){
227                 if (thisfqualfile != "") {
228                     success = trimOligos.stripForward(fSeq, rSeq, *fQual, *rQual, primerIndex);
229                 }else {
230                     success = trimOligos.stripForward(fSeq, rSeq, primerIndex);
231                 }
232                 if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
233                 else{ currentSeqsDiffs += success;  }
234             }
235             
236             if (currentSeqsDiffs > pDataArray->tdiffs)  {       trashCode += 't';   }
237             
238             //flip the reverse reads
239             rSeq.reverseComplement();
240             if (thisfqualfile != "") { rQual->flipQScores(); }
241            
242             //pairwise align
243             alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
244             map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
245             map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
246             fSeq.setAligned(alignment->getSeqAAln());
247             rSeq.setAligned(alignment->getSeqBAln());
248             int length = fSeq.getAligned().length();
249             
250             //traverse alignments merging into one contiguous seq
251             string contig = "";
252             int numMismatches = 0;
253             string seq1 = fSeq.getAligned();
254             string seq2 = rSeq.getAligned();
255             vector<int> scores1, scores2;
256             if (thisfqualfile != "") {
257                 scores1 = fQual->getQualityScores();
258                 scores2 = rQual->getQualityScores();
259                 delete fQual; delete rQual;
260             }
261             
262             int overlapStart = fSeq.getStartPos();
263             int seq2Start = rSeq.getStartPos();
264             //bigger of the 2 starting positions is the location of the overlapping start
265             if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
266                 overlapStart = seq2Start; 
267                 for (int i = 0; i < overlapStart; i++) { contig += seq1[i];  }
268             }else { //seq1 starts later so take from 0 to overlapStart from seq2
269                 for (int i = 0; i < overlapStart; i++) {  contig += seq2[i]; }
270             }
271             
272             int seq1End = fSeq.getEndPos();
273             int seq2End = rSeq.getEndPos();
274             int overlapEnd = seq1End;
275             if (seq2End < overlapEnd) { overlapEnd = seq2End; }  //smallest end position is where overlapping ends
276             
277             int oStart = contig.length();
278             for (int i = overlapStart; i < overlapEnd; i++) {
279                 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
280                     contig += seq1[i];
281                 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base
282                     if (thisfqualfile != "") {
283                         if (scores2[BBaseMap[i]] < pDataArray->insert) { } //
284                         else { contig += seq2[i];  }
285                     }else { contig += seq2[i]; } //with no quality info, then we keep it?
286                 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base
287                     if (thisfqualfile != "") {
288                         if (scores1[ABaseMap[i]] < pDataArray->insert) { } //
289                         else { contig += seq1[i];  }
290                     }else { contig += seq1[i]; } //with no quality info, then we keep it?
291                 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
292                     if (thisfqualfile != "") {
293                         if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= pDataArray->deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score
294                             char c = seq1[i];
295                             if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; }
296                             contig += c;
297                         }else { //if no, base becomes n
298                             contig += 'N';
299                         }
300                         numMismatches++;
301                     }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch
302                 }else { //should never get here
303                     pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
304                 }
305             }
306             int oend = contig.length();
307             
308             if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
309                 for (int i = overlapEnd; i < length; i++) { contig += seq2[i];  }
310             }else { //seq2 ends before seq1 so take from overlap to length from seq1
311                 for (int i = overlapEnd; i < length; i++) {  contig += seq1[i]; }
312             }
313
314             if (pDataArray->trimOverlap) { contig = contig.substr(overlapStart-1, oend-oStart); if (contig.length() == 0) { trashCode += "l"; } }
315             
316             if(trashCode.length() == 0){
317                 bool ignore = false;
318                 if (pDataArray->createGroup) {
319                     if(pDataArray->barcodes.size() != 0){
320                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
321                         if (pDataArray->primers.size() != 0) { 
322                             if (pDataArray->primerNameVector[primerIndex] != "") { 
323                                 if(thisGroup != "") {
324                                     thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
325                                 }else {
326                                     thisGroup = pDataArray->primerNameVector[primerIndex]; 
327                                 }
328                             } 
329                         }
330                         
331                         if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
332                         
333                         int pos = thisGroup.find("ignore");
334                         if (pos == string::npos) {
335                             pDataArray->groupMap[fSeq.getName()] = thisGroup; 
336                         
337                             map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
338                             if (it == pDataArray->groupCounts.end()) {  pDataArray->groupCounts[thisGroup] = 1; }
339                             else { pDataArray->groupCounts[it->first] ++; }
340                         }else { ignore = true; }
341                     }
342                 }
343                 
344                 if(pDataArray->allFiles && !ignore){
345                     ofstream output;
346                     pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
347                     output << ">" << fSeq.getName() << endl << contig << endl;
348                     output.close();
349                 }
350                 
351                 //output
352                 outFasta << ">" << fSeq.getName() << endl << contig << endl;
353                 int numNs = 0;
354                 for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; }  }
355                 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl;
356             }else {
357                 //output
358                 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
359             }
360             pDataArray->count++;
361             
362                         //report progress
363                         if((pDataArray->count) % 1000 == 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
364                 }
365         
366                 //report progress
367                 if((pDataArray->count) % 1000 != 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
368         
369         inFFasta.close();
370         inRFasta.close();
371         outFasta.close();
372         outMisMatch.close();
373         outScrapFasta.close();
374         if (thisfqualfile != "") {
375             inFQual.close();
376             inRQual.close();
377         }
378         delete alignment;
379         
380         pDataArray->done = true;
381         if (pDataArray->m->control_pressed) {  pDataArray->m->mothurRemove(pDataArray->outputFasta);  pDataArray->m->mothurRemove(pDataArray->outputMisMatches);  pDataArray->m->mothurRemove(pDataArray->outputScrapFasta); }
382         
383         return 0;
384                 
385         }
386         catch(exception& e) {
387                 pDataArray->m->errorOut(e, "AlignCommand", "MyContigsThreadFunction");
388                 exit(1);
389         }
390
391 #endif
392
393
394 #endif