]> git.donarmstrong.com Git - mothur.git/blob - makecontigscommand.h
Merge remote-tracking branch 'origin/master'
[mothur.git] / makecontigscommand.h
1 #ifndef Mothur_makecontigscommand_h
2 #define Mothur_makecontigscommand_h
3
4 //
5 //  makecontigscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 5/15/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12 #include "command.hpp"
13 #include "sequence.hpp"
14 #include "qualityscores.h"
15 #include "alignment.hpp"
16 #include "gotohoverlap.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "blastalign.hpp"
19 #include "noalign.hpp"
20 #include "trimoligos.h"
21
22 struct fastqRead {
23         vector<int> scores;
24         string name;
25         string sequence;
26         
27         fastqRead() { name = ""; sequence = ""; scores.clear(); };
28         fastqRead(string n, string s, vector<int> sc) : name(n), sequence(s), scores(sc) {};
29         ~fastqRead() {};
30 };
31
32 struct pairFastqRead {
33         fastqRead forward;
34     fastqRead reverse;
35         
36         pairFastqRead() {};
37         pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
38         ~pairFastqRead() {};
39 };
40 /**************************************************************************************************/
41
42 class MakeContigsCommand : public Command {
43 public:
44     MakeContigsCommand(string);
45     MakeContigsCommand();
46     ~MakeContigsCommand(){}
47     
48     vector<string> setParameters();
49     string getCommandName()                     { return "make.contigs";                        }
50     string getCommandCategory()         { return "Sequence Processing";         } 
51     //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
52     string getOutputFileNameTag(string, string);
53         string getHelpString(); 
54     string getCitation() { return "http://www.mothur.org/wiki/Make.contigs"; }
55     string getDescription()             { return "description"; }
56     
57     int execute(); 
58     void help() { m->mothurOut(getHelpString()); }      
59     
60 private:
61     bool abort, allFiles, createGroup;
62     string outputDir, ffastqfile, rfastqfile, align, oligosfile;
63         float match, misMatch, gapOpen, gapExtend;
64         int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
65     vector<string> outputNames;
66     
67     map<int, oligosPair> barcodes;
68         map<int, oligosPair> primers;
69     vector<string>  linker;
70     vector<string>  spacer;
71         vector<string> primerNameVector;        
72         vector<string> barcodeNameVector;       
73     
74         map<string, int> groupCounts; 
75     map<string, string> groupMap;
76     //map<string, int> combos;
77         //map<string, int> groupToIndex;
78     //vector<string> groupVector;
79     
80     fastqRead readFastq(ifstream&, bool&);
81     vector< vector<string> > readFastqFiles(int&);
82     bool checkReads(fastqRead&, fastqRead&);
83     int createProcesses(vector< vector<string> >, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
84     int driver(vector<string>, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
85     bool getOligos(vector<vector<string> >&, vector<vector<string> >&);
86     string reverseOligo(string);
87     vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
88 };
89
90 /**************************************************************************************************/
91
92 /**************************************************************************************************/
93 //custom data structure for threads to use.
94 // This is passed by void pointer so it can be any data type
95 // that can be passed using a single void pointer (LPVOID).
96 struct contigsData {
97         string outputFasta; 
98         string outputQual; 
99     string outputScrapFasta; 
100         string outputScrapQual;
101         string outputMisMatches;
102         string align;
103     vector<string> files;
104     vector<vector<string> > fastaFileNames;
105     vector<vector<string> > qualFileNames;
106         MothurOut* m;
107         float match, misMatch, gapOpen, gapExtend;
108         int count, threshold, threadID, pdiffs, bdiffs, tdiffs;
109     bool allFiles, createGroup;
110     map<string, int> groupCounts; 
111     map<string, string> groupMap;
112     vector<string> primerNameVector;    
113         vector<string> barcodeNameVector;
114     map<int, oligosPair> barcodes;
115         map<int, oligosPair> primers;
116         
117         contigsData(){}
118         contigsData(vector<string> f, string of, string oq, string osf, string osq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) {
119         files = f;
120                 outputFasta = of;
121         outputQual = oq;
122         outputMisMatches = om;
123         m = mout;
124                 match = ma; 
125                 misMatch = misMa;
126                 gapOpen = gapO; 
127                 gapExtend = gapE; 
128         threshold = thr;
129                 align = al;
130                 count = 0;
131         outputScrapFasta = osf;
132         outputScrapQual = osq;
133         fastaFileNames = ffn;
134         qualFileNames = qfn;
135         barcodes = br;
136         primers = pr;
137         barcodeNameVector = bnv;
138         primerNameVector = pnv;
139         pdiffs = pdf;
140         bdiffs = bdf;
141         tdiffs = tdf;
142         allFiles = all;
143         createGroup = cg;
144                 threadID = tid;
145         }
146 };
147
148 /**************************************************************************************************/
149 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
150 #else
151 static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ 
152         contigsData* pDataArray;
153         pDataArray = (contigsData*)lpParam;
154         
155         try {
156         int longestBase = 1000;
157         Alignment* alignment;
158         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase);                 }
159                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);                            }
160         
161         int num = 0;
162         string thisffastafile = pDataArray->files[0];
163         string thisfqualfile = pDataArray->files[1];
164         string thisrfastafile = pDataArray->files[2];
165         string thisrqualfile = pDataArray->files[3];
166         
167         if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
168         
169                 if(pDataArray->allFiles){
170                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
171                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
172                                         if (pDataArray->fastaFileNames[i][j] != "") {
173                                                 ofstream temp;
174                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
175                         pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp);                   temp.close();
176                                         }
177                                 }
178                         }
179                 }
180         
181         ifstream inFFasta, inRFasta, inFQual, inRQual;
182         pDataArray->m->openInputFile(thisffastafile, inFFasta);
183         pDataArray->m->openInputFile(thisfqualfile, inFQual);
184         pDataArray->m->openInputFile(thisrfastafile, inRFasta);
185         pDataArray->m->openInputFile(thisrqualfile, inRQual);
186         
187         ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual;
188         pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
189         pDataArray->m->openOutputFile(pDataArray->outputQual, outQual);
190         pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
191         pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
192         pDataArray->m->openOutputFile(pDataArray->outputScrapQual, outScrapQual);
193         outMisMatch << "Name\tLength\tMisMatches\n";
194         
195         TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes);
196         
197         while ((!inFQual.eof()) && (!inFFasta.eof()) && (!inRFasta.eof()) && (!inRQual.eof())) {
198             
199             if (pDataArray->m->control_pressed) { break; }
200             
201             int success = 1;
202             string trashCode = "";
203             int currentSeqsDiffs = 0;
204             
205             //read seqs and quality info
206             Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
207             Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
208             QualityScores fQual(inFQual); pDataArray->m->gobble(inFQual);
209             QualityScores rQual(inRQual); pDataArray->m->gobble(inRQual);
210             
211             int barcodeIndex = 0;
212             int primerIndex = 0;
213             
214             if(pDataArray->barcodes.size() != 0){
215                 success = trimOligos.stripBarcode(fSeq, rSeq, fQual, rQual, barcodeIndex);
216                 if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
217                 else{ currentSeqsDiffs += success;  }
218             }
219             
220             if(pDataArray->primers.size() != 0){
221                 success = trimOligos.stripForward(fSeq, rSeq, fQual, rQual, primerIndex);
222                 if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
223                 else{ currentSeqsDiffs += success;  }
224             }
225             
226             if (currentSeqsDiffs > pDataArray->tdiffs)  {       trashCode += 't';   }
227             
228             //flip the reverse reads
229             rSeq.reverseComplement();
230             rQual.flipQScores();
231            
232             //pairwise align
233             alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
234             map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
235             map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
236             fSeq.setAligned(alignment->getSeqAAln());
237             rSeq.setAligned(alignment->getSeqBAln());
238             int length = fSeq.getAligned().length();
239             
240             //traverse alignments merging into one contiguous seq
241             string contig = "";
242             vector<int> contigScores; 
243             int numMismatches = 0;
244             string seq1 = fSeq.getAligned();
245             string seq2 = rSeq.getAligned();
246             vector<int> scores1 = fQual.getQualityScores();
247             vector<int> scores2 = rQual.getQualityScores();
248             
249             int overlapStart = fSeq.getStartPos();
250             int seq2Start = rSeq.getStartPos();
251             //bigger of the 2 starting positions is the location of the overlapping start
252             if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
253                 overlapStart = seq2Start; 
254                 for (int i = 0; i < overlapStart; i++) {
255                     contig += seq1[i];
256                     contigScores.push_back(scores1[ABaseMap[i]]);
257                 }
258             }else { //seq1 starts later so take from 0 to overlapStart from seq2
259                 for (int i = 0; i < overlapStart; i++) {
260                     contig += seq2[i];
261                     contigScores.push_back(scores2[BBaseMap[i]]);
262                 }
263             }
264             
265             int seq1End = fSeq.getEndPos();
266             int seq2End = rSeq.getEndPos();
267             int overlapEnd = seq1End;
268             if (seq2End < overlapEnd) { overlapEnd = seq2End; }  //smallest end position is where overlapping ends
269             
270             for (int i = overlapStart; i < overlapEnd; i++) {
271                 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
272                     contig += seq1[i];
273                     contigScores.push_back(scores1[ABaseMap[i]]);
274                     if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
275                 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
276                     if (scores2[BBaseMap[i]] < pDataArray->threshold) { } //
277                     else {
278                         contig += seq2[i];
279                         contigScores.push_back(scores2[BBaseMap[i]]);
280                     }
281                 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
282                     if (scores1[ABaseMap[i]] < pDataArray->threshold) { } //
283                     else {
284                         contig += seq1[i];
285                         contigScores.push_back(scores1[ABaseMap[i]]);
286                     }
287                 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
288                     char c = seq1[i];
289                     contigScores.push_back(scores1[ABaseMap[i]]);
290                     if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
291                     contig += c;
292                     numMismatches++;
293                 }else { //should never get here
294                     pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
295                 }
296             }
297             
298             if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
299                 for (int i = overlapEnd; i < length; i++) {
300                     contig += seq2[i];
301                     contigScores.push_back(scores2[BBaseMap[i]]);
302                 }
303             }else { //seq2 ends before seq1 so take from overlap to length from seq1
304                 for (int i = overlapEnd; i < length; i++) {
305                     contig += seq1[i];
306                     contigScores.push_back(scores1[ABaseMap[i]]);
307                 }
308                 
309             }
310
311             if(trashCode.length() == 0){
312                 if (pDataArray->createGroup) {
313                     if(pDataArray->barcodes.size() != 0){
314                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
315                         if (pDataArray->primers.size() != 0) { 
316                             if (pDataArray->primerNameVector[primerIndex] != "") { 
317                                 if(thisGroup != "") {
318                                     thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
319                                 }else {
320                                     thisGroup = pDataArray->primerNameVector[primerIndex]; 
321                                 }
322                             } 
323                         }
324                         
325                         if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
326                         
327                         pDataArray->groupMap[fSeq.getName()] = thisGroup; 
328                         
329                         map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
330                         if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[thisGroup] = 1; }
331                         else { pDataArray->groupCounts[it->first] ++; }
332                         
333                     }
334                 }
335                 
336                 if(pDataArray->allFiles){
337                     ofstream output;
338                     pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
339                     output << ">" << fSeq.getName() << endl << contig << endl;
340                     output.close();
341                     
342                     pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
343                     output << ">" << fSeq.getName() << endl;
344                     for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; }
345                     output << endl;
346                     output.close();                                                     
347                 }
348                 
349                 //output
350                 outFasta << ">" << fSeq.getName() << endl << contig << endl;
351                 outQual << ">" << fSeq.getName() << endl;
352                 for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
353                 outQual << endl;
354                 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
355             }else {
356                 //output
357                 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
358                 outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl;
359                 for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; }
360                 outScrapQual << endl;
361             }
362             num++;
363             
364                         //report progress
365                         if((num) % 1000 == 0){  pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine();             }
366                 }
367         
368                 //report progress
369                 if((num) % 1000 != 0){  pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine();             }
370         
371         inFFasta.close();
372         inFQual.close();
373         inRFasta.close();
374         inRQual.close();
375         outFasta.close();
376         outQual.close();
377         outMisMatch.close();
378         outScrapFasta.close();
379         outScrapQual.close();
380         delete alignment;
381         
382         if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta);  pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapQual); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta);}
383         
384         return 0;
385                 
386         }
387         catch(exception& e) {
388                 pDataArray->m->errorOut(e, "AlignCommand", "MyContigsThreadFunction");
389                 exit(1);
390         }
391
392 #endif
393
394
395 #endif