]> git.donarmstrong.com Git - mothur.git/blob - makecontigscommand.h
added oligos class. added check orient parameter to trim.flows, sffinfo, fastq.info...
[mothur.git] / makecontigscommand.h
1 #ifndef Mothur_makecontigscommand_h
2 #define Mothur_makecontigscommand_h
3
4 //
5 //  makecontigscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 5/15/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12 #include "command.hpp"
13 #include "sequence.hpp"
14 #include "qualityscores.h"
15 #include "alignment.hpp"
16 #include "gotohoverlap.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "blastalign.hpp"
19 #include "noalign.hpp"
20 #include "trimoligos.h"
21 #include "oligos.h"
22
23 struct fastqRead {
24         vector<int> scores;
25         string name;
26         string sequence;
27         
28         fastqRead() { name = ""; sequence = ""; scores.clear(); };
29         fastqRead(string n, string s, vector<int> sc) : name(n), sequence(s), scores(sc) {};
30         ~fastqRead() {};
31 };
32
33 struct pairFastqRead {
34         fastqRead forward;
35     fastqRead reverse;
36     fastqRead findex;
37     fastqRead rindex;
38         
39         pairFastqRead() {};
40         pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
41     pairFastqRead(fastqRead f, fastqRead r, fastqRead fi, fastqRead ri) : forward(f), reverse(r), findex(fi), rindex(ri) {};
42         ~pairFastqRead() {};
43 };
44 /**************************************************************************************************/
45
46 class MakeContigsCommand : public Command {
47 public:
48     MakeContigsCommand(string);
49     MakeContigsCommand();
50     ~MakeContigsCommand(){}
51     
52     vector<string> setParameters();
53     string getCommandName()                     { return "make.contigs";                        }
54     string getCommandCategory()         { return "Sequence Processing";         } 
55     //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
56     
57         string getHelpString(); 
58     string getOutputPattern(string);    
59     string getCitation() { return "http://www.mothur.org/wiki/Make.contigs"; }
60     string getDescription()             { return "description"; }
61     
62     int execute(); 
63     void help() { m->mothurOut(getHelpString()); }      
64     
65 private:
66     bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount, noneOk, reorient;
67     string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, findexfile, rindexfile, file, format, inputDir;
68         float match, misMatch, gapOpen, gapExtend;
69         int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq, numBarcodes, numFPrimers, numLinkers, numSpacers, numRPrimers;
70     vector<string> outputNames;
71     Oligos* oligos;
72         vector<char> convertTable;
73     
74         map<string, int> groupCounts; 
75     map<string, string> groupMap;
76     map<int, string> file2Group;
77     
78     vector<int> convertQual(string);
79     fastqRead readFastq(ifstream&, bool&);
80     vector< vector< vector<string> > > preProcessData(unsigned long int&);
81     vector< vector<string> > readFileNames(string);
82     vector< vector<string> > readFastqFiles(unsigned long int&, string, string, string, string);
83     vector< vector<string> > readFastaFiles(unsigned long int&, string, string);
84     //bool checkReads(fastqRead&, fastqRead&, string, string);
85     int createProcesses(vector< vector<string> >, string, string, string, vector<vector<string> >, int);
86     int driver(vector<string>, string, string, string, vector<vector<string> >, int, string);
87     bool getOligos(vector<vector<string> >&, string, map<string, string>&);
88     vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool);
89     vector<pairFastqRead> mergeReads(vector<pairFastqRead> frReads, vector<pairFastqRead> friReads, map<string, pairFastqRead>& pairUniques);
90 };
91
92 /**************************************************************************************************/
93
94 /**************************************************************************************************/
95 //custom data structure for threads to use.
96 // This is passed by void pointer so it can be any data type
97 // that can be passed using a single void pointer (LPVOID).
98 struct contigsData {
99         string outputFasta; 
100     string outputScrapFasta; 
101         string outputMisMatches;
102         string align, group, oligosfile;
103     vector<string> files;
104     vector<vector<string> > fastaFileNames;
105         MothurOut* m;
106         float match, misMatch, gapOpen, gapExtend;
107         int count, insert, threadID, pdiffs, bdiffs, tdiffs, deltaq;
108     bool allFiles, createOligosGroup, createFileGroup, done, trimOverlap, reorient;
109     map<string, int> groupCounts; 
110     map<string, string> groupMap;
111     
112         
113         contigsData(){}
114         contigsData(string g, vector<string> f, string of, string osf, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int delt, vector<vector<string> > ffn, string olig, bool ro, int pdf, int bdf, int tdf, bool cg, bool cfg, bool all, bool to, int tid) {
115         files = f;
116                 outputFasta = of;
117         outputMisMatches = om;
118         m = mout;
119                 match = ma; 
120                 misMatch = misMa;
121                 gapOpen = gapO; 
122                 gapExtend = gapE; 
123         insert = thr;
124                 align = al;
125         group = g;
126                 count = 0;
127         outputScrapFasta = osf;
128         fastaFileNames = ffn;
129         oligosfile = olig;
130         pdiffs = pdf;
131         bdiffs = bdf;
132         tdiffs = tdf;
133         allFiles = all;
134         trimOverlap = to;
135         createOligosGroup = cg;
136         createFileGroup = cfg;
137                 threadID = tid;
138         deltaq = delt;
139         reorient = ro;
140         done=false;
141         }
142 };
143
144 /**************************************************************************************************/
145 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
146 #else
147 static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ 
148         contigsData* pDataArray;
149         pDataArray = (contigsData*)lpParam;
150         
151         try {
152         int longestBase = 1000;
153         Alignment* alignment;
154         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase);                 }
155                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);                            }
156         
157         pDataArray->count = 0;
158         string thisffastafile = pDataArray->files[0];
159         string thisfqualfile = pDataArray->files[1];
160         string thisrfastafile = pDataArray->files[2];
161         string thisrqualfile = pDataArray->files[3];
162         string thisfindexfile = pDataArray->files[4];
163         string thisrindexfile = pDataArray->files[5];
164         
165         if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
166         
167                 if(pDataArray->allFiles){
168                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
169                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
170                                         if (pDataArray->fastaFileNames[i][j] != "") {
171                                                 ofstream temp;
172                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
173                                         }
174                                 }
175                         }
176                 }
177         
178         ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
179         ofstream outFasta, outMisMatch, outScrapFasta;
180         pDataArray->m->openInputFile(thisffastafile, inFFasta);
181         pDataArray->m->openInputFile(thisrfastafile, inRFasta);
182         if (thisfqualfile != "") {
183             pDataArray->m->openInputFile(thisfqualfile, inFQual);
184             pDataArray->m->openInputFile(thisrqualfile, inRQual);
185         }
186         
187         if (thisfindexfile != "") { pDataArray->m->openInputFile(thisfindexfile, inFIndex);  }
188         if (thisrindexfile != "") { pDataArray->m->openInputFile(thisrindexfile, inRIndex);  }
189         
190         pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
191         pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
192         pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
193         
194         outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n";  
195         
196         Oligos oligos;
197         if (pDataArray->oligosfile != "") { oligos.read(pDataArray->oligosfile);  }
198         int numFPrimers = oligos.getPairedPrimers().size();
199         int numBarcodes = oligos.getPairedBarcodes().size();
200
201         
202         TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, oligos.getPairedPrimers(), oligos.getPairedBarcodes());
203         TrimOligos* rtrimOligos = NULL;
204         if (pDataArray->reorient) {
205             rtrimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, oligos.getReorientedPairedPrimers(), oligos.getReorientedPairedBarcodes()); numBarcodes = oligos.getReorientedPairedBarcodes().size();
206         }
207         
208         while ((!inFFasta.eof()) && (!inRFasta.eof())) {
209             
210             if (pDataArray->m->control_pressed) { break; }
211             
212             int success = 1;
213             string trashCode = "";
214             int currentSeqsDiffs = 0;
215             
216             //read seqs and quality info
217             Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
218             Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
219             QualityScores* fQual = NULL; QualityScores* rQual = NULL;
220             if (thisfqualfile != "") {
221                 fQual = new QualityScores(inFQual); pDataArray->m->gobble(inFQual);
222                 rQual = new QualityScores(inRQual); pDataArray->m->gobble(inRQual);
223             }
224             
225             Sequence findexBarcode("findex", "NONE");  Sequence rindexBarcode("rindex", "NONE");
226             if (thisfindexfile != "") {
227                 Sequence temp(inFIndex); pDataArray->m->gobble(inFIndex);
228                 findexBarcode.setAligned(temp.getAligned());
229             }
230             
231             if (thisrindexfile != "") {
232                 Sequence temp(inRIndex); pDataArray->m->gobble(inRIndex);
233                 rindexBarcode.setAligned(temp.getAligned());
234             }
235
236             int barcodeIndex = 0;
237             int primerIndex = 0;
238             Sequence savedFSeq(fSeq.getName(), fSeq.getAligned());  Sequence savedRSeq(rSeq.getName(), rSeq.getAligned());
239             Sequence savedFindex(findexBarcode.getName(), findexBarcode.getAligned()); Sequence savedRIndex(rindexBarcode.getName(), rindexBarcode.getAligned());
240             QualityScores* savedFQual = NULL; QualityScores* savedRQual = NULL;
241             if (thisfqualfile != "") {
242                 savedFQual = new QualityScores(fQual->getName(), fQual->getQualityScores());
243                 savedRQual = new QualityScores(rQual->getName(), rQual->getQualityScores());
244             }
245             
246             if(numBarcodes != 0){
247                 if (thisfqualfile != "") {
248                     if ((thisfindexfile != "") || (thisrindexfile != "")) {
249                         success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
250                     }else {
251                         success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
252                     }
253                 }else {
254                     success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
255                 }
256                 if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
257                 else{ currentSeqsDiffs += success;  }
258             }
259             
260             if(numFPrimers != 0){
261                 if (thisfqualfile != "") {
262                     success = trimOligos.stripForward(fSeq, rSeq, *fQual, *rQual, primerIndex);
263                 }else {
264                     success = trimOligos.stripForward(fSeq, rSeq, primerIndex);
265                 }
266                 if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
267                 else{ currentSeqsDiffs += success;  }
268             }
269             
270             if (currentSeqsDiffs > pDataArray->tdiffs)  {       trashCode += 't';   }
271             
272             if (pDataArray->reorient && (trashCode != "")) { //if you failed and want to check the reverse
273                 int thisSuccess = 0;
274                 string thisTrashCode = "";
275                 int thisCurrentSeqsDiffs = 0;
276                 
277                 int thisBarcodeIndex = 0;
278                 int thisPrimerIndex = 0;
279                 
280                 if(numBarcodes != 0){
281                     if (thisfqualfile != "") {
282                         if ((thisfindexfile != "") || (thisrindexfile != "")) {
283                             thisSuccess = rtrimOligos->stripBarcode(savedFindex, savedRIndex, *savedFQual, *savedRQual, thisBarcodeIndex);
284                         }else {
285                             thisSuccess = rtrimOligos->stripBarcode(savedFSeq, savedRSeq, *savedFQual, *savedRQual, thisBarcodeIndex);
286                         }
287                     }else {
288                         thisSuccess = rtrimOligos->stripBarcode(savedFSeq, savedRSeq, thisBarcodeIndex);
289                     }
290                     if(thisSuccess > pDataArray->bdiffs)                {       thisTrashCode += 'b';   }
291                     else{ thisCurrentSeqsDiffs += thisSuccess;  }
292                 }
293                 
294                 if(numFPrimers != 0){
295                     if (thisfqualfile != "") {
296                         thisSuccess = rtrimOligos->stripForward(savedFSeq, savedRSeq, *savedFQual, *savedRQual, thisPrimerIndex);
297                     }else {
298                         thisSuccess = rtrimOligos->stripForward(savedFSeq, savedRSeq, thisPrimerIndex);
299                     }
300                     if(thisSuccess > pDataArray->pdiffs)                {       thisTrashCode += 'f';   }
301                     else{ thisCurrentSeqsDiffs += thisSuccess;  }
302                 }
303                 
304                 if (thisCurrentSeqsDiffs > pDataArray->tdiffs)  {       thisTrashCode += 't';   }
305                 
306                 if (thisTrashCode == "") {
307                     trashCode = thisTrashCode;
308                     success = thisSuccess;
309                     currentSeqsDiffs = thisCurrentSeqsDiffs;
310                     barcodeIndex = thisBarcodeIndex;
311                     primerIndex = thisPrimerIndex;
312                     savedFSeq.reverseComplement();
313                     savedRSeq.reverseComplement();
314                     fSeq.setAligned(savedFSeq.getAligned());
315                     rSeq.setAligned(savedRSeq.getAligned());
316                     if(thisfqualfile != ""){
317                         savedFQual->flipQScores(); savedRQual->flipQScores();
318                         fQual->setScores(savedFQual->getScores()); rQual->setScores(savedRQual->getScores());
319                     }
320                 }else { trashCode += "(" + thisTrashCode + ")";  }
321             }
322             
323             //flip the reverse reads
324             rSeq.reverseComplement();
325             if (thisfqualfile != "") { rQual->flipQScores(); }
326            
327             //pairwise align
328             alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
329             map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
330             map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
331             fSeq.setAligned(alignment->getSeqAAln());
332             rSeq.setAligned(alignment->getSeqBAln());
333             int length = fSeq.getAligned().length();
334             
335             //traverse alignments merging into one contiguous seq
336             string contig = "";
337             int numMismatches = 0;
338             string seq1 = fSeq.getAligned();
339             string seq2 = rSeq.getAligned();
340             vector<int> scores1, scores2;
341             if (thisfqualfile != "") {
342                 scores1 = fQual->getQualityScores();
343                 scores2 = rQual->getQualityScores();
344                 delete fQual; delete rQual; delete savedFQual; delete savedRQual;
345             }
346             
347             int overlapStart = fSeq.getStartPos();
348             int seq2Start = rSeq.getStartPos();
349             //bigger of the 2 starting positions is the location of the overlapping start
350             if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
351                 overlapStart = seq2Start; 
352                 for (int i = 0; i < overlapStart; i++) { contig += seq1[i];  }
353             }else { //seq1 starts later so take from 0 to overlapStart from seq2
354                 for (int i = 0; i < overlapStart; i++) {  contig += seq2[i]; }
355             }
356             
357             int seq1End = fSeq.getEndPos();
358             int seq2End = rSeq.getEndPos();
359             int overlapEnd = seq1End;
360             if (seq2End < overlapEnd) { overlapEnd = seq2End; }  //smallest end position is where overlapping ends
361             
362             int oStart = contig.length();
363             for (int i = overlapStart; i < overlapEnd; i++) {
364                 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
365                     contig += seq1[i];
366                 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base
367                     if (thisfqualfile != "") {
368                         if (scores2[BBaseMap[i]] < pDataArray->insert) { } //
369                         else { contig += seq2[i];  }
370                     }else { contig += seq2[i]; } //with no quality info, then we keep it?
371                 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base
372                     if (thisfqualfile != "") {
373                         if (scores1[ABaseMap[i]] < pDataArray->insert) { } //
374                         else { contig += seq1[i];  }
375                     }else { contig += seq1[i]; } //with no quality info, then we keep it?
376                 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
377                     if (thisfqualfile != "") {
378                         if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= pDataArray->deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score
379                             char c = seq1[i];
380                             if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; }
381                             contig += c;
382                         }else { //if no, base becomes n
383                             contig += 'N';
384                         }
385                         numMismatches++;
386                     }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch
387                 }else { //should never get here
388                     pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
389                 }
390             }
391             int oend = contig.length();
392             
393             if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
394                 for (int i = overlapEnd; i < length; i++) { contig += seq2[i];  }
395             }else { //seq2 ends before seq1 so take from overlap to length from seq1
396                 for (int i = overlapEnd; i < length; i++) {  contig += seq1[i]; }
397             }
398
399             if (pDataArray->trimOverlap) { contig = contig.substr(overlapStart-1, oend-oStart); if (contig.length() == 0) { trashCode += "l"; } }
400             
401             if(trashCode.length() == 0){
402                 bool ignore = false;
403                 if (pDataArray->createOligosGroup) {
404                     string thisGroup = oligos.getGroupName(barcodeIndex, primerIndex);
405                     if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
406                     
407                     int pos = thisGroup.find("ignore");
408                     if (pos == string::npos) {
409                         pDataArray->groupMap[fSeq.getName()] = thisGroup;
410                         
411                         map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
412                         if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[thisGroup] = 1; }
413                         else { pDataArray->groupCounts[it->first] ++; }
414                     }else { ignore = true; }
415                 }else if (pDataArray->createFileGroup) {
416                     int pos = pDataArray->group.find("ignore");
417                     if (pos == string::npos) {
418                         pDataArray->groupMap[fSeq.getName()] = pDataArray->group;
419                         
420                         map<string, int>::iterator it = pDataArray->groupCounts.find(pDataArray->group);
421                         if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[pDataArray->group] = 1; }
422                         else { pDataArray->groupCounts[it->first]++; }
423                     }else { ignore = true; }
424                 }
425
426                 
427                 if(pDataArray->allFiles && !ignore){
428                     ofstream output;
429                     pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
430                     output << ">" << fSeq.getName() << endl << contig << endl;
431                     output.close();
432                 }
433                 
434                 //output
435                 outFasta << ">" << fSeq.getName() << endl << contig << endl;
436                 int numNs = 0;
437                 for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; }  }
438                 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl;
439             }else {
440                 //output
441                 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
442             }
443             pDataArray->count++;
444             
445                         //report progress
446                         if((pDataArray->count) % 1000 == 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
447                 }
448         
449                 //report progress
450                 if((pDataArray->count) % 1000 != 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
451         
452         inFFasta.close();
453         inRFasta.close();
454         outFasta.close();
455         outMisMatch.close();
456         outScrapFasta.close();
457         if (thisfqualfile != "") {
458             inFQual.close();
459             inRQual.close();
460         }
461         delete alignment;
462         if (pDataArray->reorient) { delete rtrimOligos; }
463         
464         pDataArray->done = true;
465         if (pDataArray->m->control_pressed) {  pDataArray->m->mothurRemove(pDataArray->outputFasta);  pDataArray->m->mothurRemove(pDataArray->outputMisMatches);  pDataArray->m->mothurRemove(pDataArray->outputScrapFasta); }
466         
467         return 0;
468                 
469         }
470         catch(exception& e) {
471                 pDataArray->m->errorOut(e, "AlignCommand", "MyContigsThreadFunction");
472                 exit(1);
473         }
474
475 #endif
476
477
478 #endif