]> git.donarmstrong.com Git - mothur.git/blob - makecontigscommand.h
Merge remote-tracking branch 'mothur/master'
[mothur.git] / makecontigscommand.h
1 #ifndef Mothur_makecontigscommand_h
2 #define Mothur_makecontigscommand_h
3
4 //
5 //  makecontigscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 5/15/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12 #include "command.hpp"
13 #include "sequence.hpp"
14 #include "qualityscores.h"
15 #include "alignment.hpp"
16 #include "gotohoverlap.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "blastalign.hpp"
19 #include "noalign.hpp"
20 #include "trimoligos.h"
21
22 struct fastqRead {
23         vector<int> scores;
24         string name;
25         string sequence;
26         
27         fastqRead() { name = ""; sequence = ""; scores.clear(); };
28         fastqRead(string n, string s, vector<int> sc) : name(n), sequence(s), scores(sc) {};
29         ~fastqRead() {};
30 };
31
32 struct pairFastqRead {
33         fastqRead forward;
34     fastqRead reverse;
35         
36         pairFastqRead() {};
37         pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
38         ~pairFastqRead() {};
39 };
40 /**************************************************************************************************/
41
42 class MakeContigsCommand : public Command {
43 public:
44     MakeContigsCommand(string);
45     MakeContigsCommand();
46     ~MakeContigsCommand(){}
47     
48     vector<string> setParameters();
49     string getCommandName()                     { return "make.contigs";                        }
50     string getCommandCategory()         { return "Sequence Processing";         } 
51     //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
52     
53         string getHelpString(); 
54     string getOutputPattern(string);    
55     string getCitation() { return "http://www.mothur.org/wiki/Make.contigs"; }
56     string getDescription()             { return "description"; }
57     
58     int execute(); 
59     void help() { m->mothurOut(getHelpString()); }      
60     
61 private:
62     bool abort, allFiles, createGroup;
63     string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file;
64         float match, misMatch, gapOpen, gapExtend;
65         int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
66     vector<string> outputNames;
67     
68     map<int, oligosPair> barcodes;
69         map<int, oligosPair> primers;
70     vector<string>  linker;
71     vector<string>  spacer;
72         vector<string> primerNameVector;        
73         vector<string> barcodeNameVector;       
74     
75         map<string, int> groupCounts; 
76     map<string, string> groupMap;
77     
78     fastqRead readFastq(ifstream&, bool&);
79     vector< vector< vector<string> > > preProcessData(unsigned long int&);
80     vector< vector<string> > readFileNames(string);
81     vector< vector<string> > readFastqFiles(unsigned long int&, string, string);
82     vector< vector<string> > readFastaFiles(unsigned long int&, string, string);
83     bool checkReads(fastqRead&, fastqRead&, string, string);
84     int createProcesses(vector< vector<string> >, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
85     int driver(vector<string>, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
86     bool getOligos(vector<vector<string> >&, vector< vector<string> >&, string);
87     string reverseOligo(string);
88     vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
89 };
90
91 /**************************************************************************************************/
92
93 /**************************************************************************************************/
94 //custom data structure for threads to use.
95 // This is passed by void pointer so it can be any data type
96 // that can be passed using a single void pointer (LPVOID).
97 struct contigsData {
98         string outputFasta; 
99         string outputQual; 
100     string outputScrapFasta; 
101         string outputScrapQual;
102         string outputMisMatches;
103         string align;
104     vector<string> files;
105     vector<vector<string> > fastaFileNames;
106     vector<vector<string> > qualFileNames;
107         MothurOut* m;
108         float match, misMatch, gapOpen, gapExtend;
109         int count, threshold, threadID, pdiffs, bdiffs, tdiffs;
110     bool allFiles, createGroup;
111     map<string, int> groupCounts; 
112     map<string, string> groupMap;
113     vector<string> primerNameVector;    
114         vector<string> barcodeNameVector;
115     map<int, oligosPair> barcodes;
116         map<int, oligosPair> primers;
117         
118         contigsData(){}
119         contigsData(vector<string> f, string of, string oq, string osf, string osq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) {
120         files = f;
121                 outputFasta = of;
122         outputQual = oq;
123         outputMisMatches = om;
124         m = mout;
125                 match = ma; 
126                 misMatch = misMa;
127                 gapOpen = gapO; 
128                 gapExtend = gapE; 
129         threshold = thr;
130                 align = al;
131                 count = 0;
132         outputScrapFasta = osf;
133         outputScrapQual = osq;
134         fastaFileNames = ffn;
135         qualFileNames = qfn;
136         barcodes = br;
137         primers = pr;
138         barcodeNameVector = bnv;
139         primerNameVector = pnv;
140         pdiffs = pdf;
141         bdiffs = bdf;
142         tdiffs = tdf;
143         allFiles = all;
144         createGroup = cg;
145                 threadID = tid;
146         }
147 };
148
149 /**************************************************************************************************/
150 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
151 #else
152 static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ 
153         contigsData* pDataArray;
154         pDataArray = (contigsData*)lpParam;
155         
156         try {
157         int longestBase = 1000;
158         Alignment* alignment;
159         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase);                 }
160                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);                            }
161         
162         int num = 0;
163         string thisffastafile = pDataArray->files[0];
164         string thisfqualfile = pDataArray->files[1];
165         string thisrfastafile = pDataArray->files[2];
166         string thisrqualfile = pDataArray->files[3];
167         
168         if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
169         
170                 if(pDataArray->allFiles){
171                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
172                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
173                                         if (pDataArray->fastaFileNames[i][j] != "") {
174                                                 ofstream temp;
175                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
176                         if (thisfqualfile != "") { pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp);                        temp.close(); }
177                                         }
178                                 }
179                         }
180                 }
181         
182         ifstream inFFasta, inRFasta, inFQual, inRQual;
183         ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual;
184         pDataArray->m->openInputFile(thisffastafile, inFFasta);
185         pDataArray->m->openInputFile(thisrfastafile, inRFasta);
186         if (thisfqualfile != "") {
187             pDataArray->m->openInputFile(thisfqualfile, inFQual);
188             pDataArray->m->openInputFile(thisrqualfile, inRQual);
189             pDataArray->m->openOutputFile(pDataArray->outputQual, outQual);
190             pDataArray->m->openOutputFile(pDataArray->outputScrapQual, outScrapQual);
191         }
192         pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
193         pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
194         pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
195         
196         outMisMatch << "Name\tLength\tMisMatches\n";
197         
198         TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes);
199         
200         while ((!inFFasta.eof()) && (!inRFasta.eof())) {
201             
202             if (pDataArray->m->control_pressed) { break; }
203             
204             int success = 1;
205             string trashCode = "";
206             int currentSeqsDiffs = 0;
207             
208             //read seqs and quality info
209             Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
210             Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
211             QualityScores* fQual = NULL; QualityScores* rQual = NULL;
212             if (thisfqualfile != "") {
213                 fQual = new QualityScores(inFQual); pDataArray->m->gobble(inFQual);
214                 rQual = new QualityScores(inRQual); pDataArray->m->gobble(inRQual);
215             }
216             
217             int barcodeIndex = 0;
218             int primerIndex = 0;
219             
220             if(pDataArray->barcodes.size() != 0){
221                 if (thisfqualfile != "") {
222                     success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
223                 }else {
224                     success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
225                 }
226                 if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
227                 else{ currentSeqsDiffs += success;  }
228             }
229             
230             if(pDataArray->primers.size() != 0){
231                 if (thisfqualfile != "") {
232                     success = trimOligos.stripForward(fSeq, rSeq, *fQual, *rQual, primerIndex);
233                 }else {
234                     success = trimOligos.stripForward(fSeq, rSeq, primerIndex);
235                 }
236                 if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
237                 else{ currentSeqsDiffs += success;  }
238             }
239             
240             if (currentSeqsDiffs > pDataArray->tdiffs)  {       trashCode += 't';   }
241             
242             //flip the reverse reads
243             rSeq.reverseComplement();
244             if (thisfqualfile != "") { rQual->flipQScores(); }
245            
246             //pairwise align
247             alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
248             map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
249             map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
250             fSeq.setAligned(alignment->getSeqAAln());
251             rSeq.setAligned(alignment->getSeqBAln());
252             int length = fSeq.getAligned().length();
253             
254             //traverse alignments merging into one contiguous seq
255             string contig = "";
256             vector<int> contigScores; 
257             int numMismatches = 0;
258             string seq1 = fSeq.getAligned();
259             string seq2 = rSeq.getAligned();
260             vector<int> scores1, scores2;
261             if (thisfqualfile != "") {
262                 scores1 = fQual->getQualityScores();
263                 scores2 = rQual->getQualityScores();
264                 delete fQual; delete rQual;
265             }
266             
267             int overlapStart = fSeq.getStartPos();
268             int seq2Start = rSeq.getStartPos();
269             //bigger of the 2 starting positions is the location of the overlapping start
270             if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
271                 overlapStart = seq2Start; 
272                 for (int i = 0; i < overlapStart; i++) {
273                     contig += seq1[i];
274                     if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); }
275                 }
276             }else { //seq1 starts later so take from 0 to overlapStart from seq2
277                 for (int i = 0; i < overlapStart; i++) {
278                     contig += seq2[i];
279                     if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); }
280                 }
281             }
282             
283             int seq1End = fSeq.getEndPos();
284             int seq2End = rSeq.getEndPos();
285             int overlapEnd = seq1End;
286             if (seq2End < overlapEnd) { overlapEnd = seq2End; }  //smallest end position is where overlapping ends
287             
288             for (int i = overlapStart; i < overlapEnd; i++) {
289                 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
290                     contig += seq1[i];
291                     if (thisfqualfile != "") { 
292                         contigScores.push_back(scores1[ABaseMap[i]]);
293                         if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
294                     }
295                 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
296                     if (thisfqualfile != "") {
297                         if (scores2[BBaseMap[i]] < pDataArray->threshold) { } //
298                         else {
299                             contig += seq2[i];
300                             contigScores.push_back(scores2[BBaseMap[i]]);
301                         }
302                     }else { contig += seq2[i]; }
303                 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
304                     if (thisfqualfile != "") {
305                         if (scores1[ABaseMap[i]] < pDataArray->threshold) { } //
306                         else {
307                             contig += seq1[i];
308                             contigScores.push_back(scores1[ABaseMap[i]]);
309                         }
310                     }else { contig += seq1[i]; }
311                 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
312                     if (thisfqualfile != "") {
313                         char c = seq1[i];
314                         contigScores.push_back(scores1[ABaseMap[i]]);
315                         if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
316                         contig += c;
317                         numMismatches++;
318                     }else { numMismatches++; }
319                 }else { //should never get here
320                     pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
321                 }
322             }
323             
324             if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
325                 for (int i = overlapEnd; i < length; i++) {
326                     contig += seq2[i];
327                     if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); }
328                 }
329             }else { //seq2 ends before seq1 so take from overlap to length from seq1
330                 for (int i = overlapEnd; i < length; i++) {
331                     contig += seq1[i];
332                     if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); }
333                 }
334                 
335             }
336
337             if(trashCode.length() == 0){
338                 bool ignore = false;
339                 if (pDataArray->createGroup) {
340                     if(pDataArray->barcodes.size() != 0){
341                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
342                         if (pDataArray->primers.size() != 0) { 
343                             if (pDataArray->primerNameVector[primerIndex] != "") { 
344                                 if(thisGroup != "") {
345                                     thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
346                                 }else {
347                                     thisGroup = pDataArray->primerNameVector[primerIndex]; 
348                                 }
349                             } 
350                         }
351                         
352                         if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
353                         
354                         int pos = thisGroup.find("ignore");
355                         if (pos == string::npos) {
356                             pDataArray->groupMap[fSeq.getName()] = thisGroup; 
357                         
358                             map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
359                             if (it == pDataArray->groupCounts.end()) {  pDataArray->groupCounts[thisGroup] = 1; }
360                             else { pDataArray->groupCounts[it->first] ++; }
361                         }else { ignore = true; }
362                     }
363                 }
364                 
365                 if(pDataArray->allFiles && !ignore){
366                     ofstream output;
367                     pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
368                     output << ">" << fSeq.getName() << endl << contig << endl;
369                     output.close();
370                     
371                     if (thisfqualfile != "") {
372                         pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
373                         output << ">" << fSeq.getName() << endl;
374                         for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; }
375                         output << endl;
376                         output.close(); 
377                     }
378                 }
379                 
380                 //output
381                 outFasta << ">" << fSeq.getName() << endl << contig << endl;
382                 if (thisfqualfile != "") {
383                     outQual << ">" << fSeq.getName() << endl;
384                     for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
385                     outQual << endl;
386                 }
387                 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
388             }else {
389                 //output
390                 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
391                 if (thisfqualfile != "") {
392                     outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl;
393                     for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; }
394                     outScrapQual << endl;
395                 }
396             }
397             num++;
398             
399                         //report progress
400                         if((num) % 1000 == 0){  pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine();             }
401                 }
402         
403                 //report progress
404                 if((num) % 1000 != 0){  pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine();             }
405         
406         inFFasta.close();
407         inRFasta.close();
408         outFasta.close();
409         outMisMatch.close();
410         outScrapFasta.close();
411         if (thisfqualfile != "") {
412             inFQual.close();
413             inRQual.close();
414             outQual.close();
415             outScrapQual.close();
416         }
417         delete alignment;
418         
419         if (pDataArray->m->control_pressed) {  pDataArray->m->mothurRemove(pDataArray->outputFasta);  pDataArray->m->mothurRemove(pDataArray->outputMisMatches);  pDataArray->m->mothurRemove(pDataArray->outputScrapFasta);  if (thisfqualfile != "") { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputScrapQual); } }
420         
421         return 0;
422                 
423         }
424         catch(exception& e) {
425                 pDataArray->m->errorOut(e, "AlignCommand", "MyContigsThreadFunction");
426                 exit(1);
427         }
428
429 #endif
430
431
432 #endif