]> git.donarmstrong.com Git - mothur.git/blob - makecontigscommand.h
working on pam
[mothur.git] / makecontigscommand.h
1 #ifndef Mothur_makecontigscommand_h
2 #define Mothur_makecontigscommand_h
3
4 //
5 //  makecontigscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 5/15/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12 #include "command.hpp"
13 #include "sequence.hpp"
14 #include "qualityscores.h"
15 #include "alignment.hpp"
16 #include "gotohoverlap.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "blastalign.hpp"
19 #include "noalign.hpp"
20 #include "trimoligos.h"
21
22 struct fastqRead {
23         vector<int> scores;
24         string name;
25         string sequence;
26         
27         fastqRead() { name = ""; sequence = ""; scores.clear(); };
28         fastqRead(string n, string s, vector<int> sc) : name(n), sequence(s), scores(sc) {};
29         ~fastqRead() {};
30 };
31
32 struct pairFastqRead {
33         fastqRead forward;
34     fastqRead reverse;
35     fastqRead findex;
36     fastqRead rindex;
37         
38         pairFastqRead() {};
39         pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
40     pairFastqRead(fastqRead f, fastqRead r, fastqRead fi, fastqRead ri) : forward(f), reverse(r), findex(fi), rindex(ri) {};
41         ~pairFastqRead() {};
42 };
43 /**************************************************************************************************/
44
45 class MakeContigsCommand : public Command {
46 public:
47     MakeContigsCommand(string);
48     MakeContigsCommand();
49     ~MakeContigsCommand(){}
50     
51     vector<string> setParameters();
52     string getCommandName()                     { return "make.contigs";                        }
53     string getCommandCategory()         { return "Sequence Processing";         } 
54     //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
55     
56         string getHelpString(); 
57     string getOutputPattern(string);    
58     string getCitation() { return "http://www.mothur.org/wiki/Make.contigs"; }
59     string getDescription()             { return "description"; }
60     
61     int execute(); 
62     void help() { m->mothurOut(getHelpString()); }      
63     
64 private:
65     bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount, noneOk;
66     string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, findexfile, rindexfile, file, format;
67         float match, misMatch, gapOpen, gapExtend;
68         int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq;
69     vector<string> outputNames;
70     
71     map<int, oligosPair> barcodes;
72         map<int, oligosPair> primers;
73     vector<string>  linker;
74     vector<string>  spacer;
75         vector<string> primerNameVector;        
76         vector<string> barcodeNameVector;
77         vector<char> convertTable;
78     
79         map<string, int> groupCounts; 
80     map<string, string> groupMap;
81     map<int, string> file2Group;
82     
83     vector<int> convertQual(string);
84     fastqRead readFastq(ifstream&, bool&);
85     vector< vector< vector<string> > > preProcessData(unsigned long int&);
86     vector< vector<string> > readFileNames(string);
87     vector< vector<string> > readFastqFiles(unsigned long int&, string, string, string, string);
88     vector< vector<string> > readFastaFiles(unsigned long int&, string, string);
89     //bool checkReads(fastqRead&, fastqRead&, string, string);
90     int createProcesses(vector< vector<string> >, string, string, string, vector<vector<string> >, int);
91     int driver(vector<string>, string, string, string, vector<vector<string> >, int, string);
92     bool getOligos(vector<vector<string> >&, string);
93     string reverseOligo(string);
94     vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool);
95     vector<pairFastqRead> mergeReads(vector<pairFastqRead> frReads, vector<pairFastqRead> friReads, map<string, pairFastqRead>& pairUniques);
96 };
97
98 /**************************************************************************************************/
99
100 /**************************************************************************************************/
101 //custom data structure for threads to use.
102 // This is passed by void pointer so it can be any data type
103 // that can be passed using a single void pointer (LPVOID).
104 struct contigsData {
105         string outputFasta; 
106     string outputScrapFasta; 
107         string outputMisMatches;
108         string align, group;
109     vector<string> files;
110     vector<vector<string> > fastaFileNames;
111         MothurOut* m;
112         float match, misMatch, gapOpen, gapExtend;
113         int count, insert, threadID, pdiffs, bdiffs, tdiffs, deltaq;
114     bool allFiles, createOligosGroup, createFileGroup, done, trimOverlap;
115     map<string, int> groupCounts; 
116     map<string, string> groupMap;
117     vector<string> primerNameVector;    
118         vector<string> barcodeNameVector;
119     map<int, oligosPair> barcodes;
120         map<int, oligosPair> primers;
121         
122         contigsData(){}
123         contigsData(string g, vector<string> f, string of, string osf, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int delt, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool cfg, bool all, bool to, int tid) {
124         files = f;
125                 outputFasta = of;
126         outputMisMatches = om;
127         m = mout;
128                 match = ma; 
129                 misMatch = misMa;
130                 gapOpen = gapO; 
131                 gapExtend = gapE; 
132         insert = thr;
133                 align = al;
134         group = g;
135                 count = 0;
136         outputScrapFasta = osf;
137         fastaFileNames = ffn;
138         barcodes = br;
139         primers = pr;
140         barcodeNameVector = bnv;
141         primerNameVector = pnv;
142         pdiffs = pdf;
143         bdiffs = bdf;
144         tdiffs = tdf;
145         allFiles = all;
146         trimOverlap = to;
147         createOligosGroup = cg;
148         createFileGroup = cfg;
149                 threadID = tid;
150         deltaq = delt;
151         done=false;
152         }
153 };
154
155 /**************************************************************************************************/
156 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
157 #else
158 static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ 
159         contigsData* pDataArray;
160         pDataArray = (contigsData*)lpParam;
161         
162         try {
163         int longestBase = 1000;
164         Alignment* alignment;
165         if(pDataArray->align == "gotoh")                        {       alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase);                 }
166                 else if(pDataArray->align == "needleman")       {       alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase);                            }
167         
168         pDataArray->count = 0;
169         string thisffastafile = pDataArray->files[0];
170         string thisfqualfile = pDataArray->files[1];
171         string thisrfastafile = pDataArray->files[2];
172         string thisrqualfile = pDataArray->files[3];
173         string thisfindexfile = pDataArray->files[4];
174         string thisrindexfile = pDataArray->files[5];
175         
176         if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
177         
178                 if(pDataArray->allFiles){
179                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
180                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
181                                         if (pDataArray->fastaFileNames[i][j] != "") {
182                                                 ofstream temp;
183                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
184                                         }
185                                 }
186                         }
187                 }
188         
189         ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
190         ofstream outFasta, outMisMatch, outScrapFasta;
191         pDataArray->m->openInputFile(thisffastafile, inFFasta);
192         pDataArray->m->openInputFile(thisrfastafile, inRFasta);
193         if (thisfqualfile != "") {
194             pDataArray->m->openInputFile(thisfqualfile, inFQual);
195             pDataArray->m->openInputFile(thisrqualfile, inRQual);
196         }
197         
198         if (thisfindexfile != "") { pDataArray->m->openInputFile(thisfindexfile, inFIndex);  }
199         if (thisrindexfile != "") { pDataArray->m->openInputFile(thisrindexfile, inRIndex);  }
200         
201         pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
202         pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
203         pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
204         
205         outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n";  
206         
207         TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes);
208         
209         while ((!inFFasta.eof()) && (!inRFasta.eof())) {
210             
211             if (pDataArray->m->control_pressed) { break; }
212             
213             int success = 1;
214             string trashCode = "";
215             int currentSeqsDiffs = 0;
216             
217             //read seqs and quality info
218             Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
219             Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
220             QualityScores* fQual = NULL; QualityScores* rQual = NULL;
221             if (thisfqualfile != "") {
222                 fQual = new QualityScores(inFQual); pDataArray->m->gobble(inFQual);
223                 rQual = new QualityScores(inRQual); pDataArray->m->gobble(inRQual);
224             }
225             
226             Sequence findexBarcode("findex", "NONE");  Sequence rindexBarcode("rindex", "NONE");
227             if (thisfindexfile != "") {
228                 Sequence temp(inFIndex); pDataArray->m->gobble(inFIndex);
229                 findexBarcode.setAligned(temp.getAligned());
230             }
231             
232             if (thisrindexfile != "") {
233                 Sequence temp(inRIndex); pDataArray->m->gobble(inRIndex);
234                 rindexBarcode.setAligned(temp.getAligned());
235             }
236
237             int barcodeIndex = 0;
238             int primerIndex = 0;
239             
240             if(pDataArray->barcodes.size() != 0){
241                 if (thisfqualfile != "") {
242                     if ((thisfindexfile != "") || (thisrindexfile != "")) {
243                         success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
244                     }else {
245                         success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
246                     }
247                 }else {
248                     success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
249                 }
250                 if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
251                 else{ currentSeqsDiffs += success;  }
252             }
253             
254             if(pDataArray->primers.size() != 0){
255                 if (thisfqualfile != "") {
256                     success = trimOligos.stripForward(fSeq, rSeq, *fQual, *rQual, primerIndex);
257                 }else {
258                     success = trimOligos.stripForward(fSeq, rSeq, primerIndex);
259                 }
260                 if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
261                 else{ currentSeqsDiffs += success;  }
262             }
263             
264             if (currentSeqsDiffs > pDataArray->tdiffs)  {       trashCode += 't';   }
265             
266             //flip the reverse reads
267             rSeq.reverseComplement();
268             if (thisfqualfile != "") { rQual->flipQScores(); }
269            
270             //pairwise align
271             alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
272             map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
273             map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
274             fSeq.setAligned(alignment->getSeqAAln());
275             rSeq.setAligned(alignment->getSeqBAln());
276             int length = fSeq.getAligned().length();
277             
278             //traverse alignments merging into one contiguous seq
279             string contig = "";
280             int numMismatches = 0;
281             string seq1 = fSeq.getAligned();
282             string seq2 = rSeq.getAligned();
283             vector<int> scores1, scores2;
284             if (thisfqualfile != "") {
285                 scores1 = fQual->getQualityScores();
286                 scores2 = rQual->getQualityScores();
287                 delete fQual; delete rQual;
288             }
289             
290             int overlapStart = fSeq.getStartPos();
291             int seq2Start = rSeq.getStartPos();
292             //bigger of the 2 starting positions is the location of the overlapping start
293             if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
294                 overlapStart = seq2Start; 
295                 for (int i = 0; i < overlapStart; i++) { contig += seq1[i];  }
296             }else { //seq1 starts later so take from 0 to overlapStart from seq2
297                 for (int i = 0; i < overlapStart; i++) {  contig += seq2[i]; }
298             }
299             
300             int seq1End = fSeq.getEndPos();
301             int seq2End = rSeq.getEndPos();
302             int overlapEnd = seq1End;
303             if (seq2End < overlapEnd) { overlapEnd = seq2End; }  //smallest end position is where overlapping ends
304             
305             int oStart = contig.length();
306             for (int i = overlapStart; i < overlapEnd; i++) {
307                 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
308                     contig += seq1[i];
309                 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base
310                     if (thisfqualfile != "") {
311                         if (scores2[BBaseMap[i]] < pDataArray->insert) { } //
312                         else { contig += seq2[i];  }
313                     }else { contig += seq2[i]; } //with no quality info, then we keep it?
314                 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base
315                     if (thisfqualfile != "") {
316                         if (scores1[ABaseMap[i]] < pDataArray->insert) { } //
317                         else { contig += seq1[i];  }
318                     }else { contig += seq1[i]; } //with no quality info, then we keep it?
319                 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
320                     if (thisfqualfile != "") {
321                         if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= pDataArray->deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score
322                             char c = seq1[i];
323                             if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; }
324                             contig += c;
325                         }else { //if no, base becomes n
326                             contig += 'N';
327                         }
328                         numMismatches++;
329                     }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch
330                 }else { //should never get here
331                     pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
332                 }
333             }
334             int oend = contig.length();
335             
336             if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
337                 for (int i = overlapEnd; i < length; i++) { contig += seq2[i];  }
338             }else { //seq2 ends before seq1 so take from overlap to length from seq1
339                 for (int i = overlapEnd; i < length; i++) {  contig += seq1[i]; }
340             }
341
342             if (pDataArray->trimOverlap) { contig = contig.substr(overlapStart-1, oend-oStart); if (contig.length() == 0) { trashCode += "l"; } }
343             
344             if(trashCode.length() == 0){
345                 bool ignore = false;
346                 if (pDataArray->createOligosGroup) {
347                     if(pDataArray->barcodes.size() != 0){
348                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
349                         if (pDataArray->primers.size() != 0) { 
350                             if (pDataArray->primerNameVector[primerIndex] != "") { 
351                                 if(thisGroup != "") {
352                                     thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
353                                 }else {
354                                     thisGroup = pDataArray->primerNameVector[primerIndex]; 
355                                 }
356                             } 
357                         }
358                         
359                         if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
360                         
361                         int pos = thisGroup.find("ignore");
362                         if (pos == string::npos) {
363                             pDataArray->groupMap[fSeq.getName()] = thisGroup; 
364                         
365                             map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
366                             if (it == pDataArray->groupCounts.end()) {  pDataArray->groupCounts[thisGroup] = 1; }
367                             else { pDataArray->groupCounts[it->first] ++; }
368                         }else { ignore = true; }
369                     }
370                 }else if (pDataArray->createFileGroup) {
371                     int pos = pDataArray->group.find("ignore");
372                     if (pos == string::npos) {
373                         pDataArray->groupMap[fSeq.getName()] = pDataArray->group;
374                         
375                         map<string, int>::iterator it = pDataArray->groupCounts.find(pDataArray->group);
376                         if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[pDataArray->group] = 1; }
377                         else { pDataArray->groupCounts[it->first]++; }
378                     }else { ignore = true; }
379                 }
380
381                 
382                 if(pDataArray->allFiles && !ignore){
383                     ofstream output;
384                     pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
385                     output << ">" << fSeq.getName() << endl << contig << endl;
386                     output.close();
387                 }
388                 
389                 //output
390                 outFasta << ">" << fSeq.getName() << endl << contig << endl;
391                 int numNs = 0;
392                 for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; }  }
393                 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl;
394             }else {
395                 //output
396                 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
397             }
398             pDataArray->count++;
399             
400                         //report progress
401                         if((pDataArray->count) % 1000 == 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
402                 }
403         
404                 //report progress
405                 if((pDataArray->count) % 1000 != 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
406         
407         inFFasta.close();
408         inRFasta.close();
409         outFasta.close();
410         outMisMatch.close();
411         outScrapFasta.close();
412         if (thisfqualfile != "") {
413             inFQual.close();
414             inRQual.close();
415         }
416         delete alignment;
417         
418         pDataArray->done = true;
419         if (pDataArray->m->control_pressed) {  pDataArray->m->mothurRemove(pDataArray->outputFasta);  pDataArray->m->mothurRemove(pDataArray->outputMisMatches);  pDataArray->m->mothurRemove(pDataArray->outputScrapFasta); }
420         
421         return 0;
422                 
423         }
424         catch(exception& e) {
425                 pDataArray->m->errorOut(e, "AlignCommand", "MyContigsThreadFunction");
426                 exit(1);
427         }
428
429 #endif
430
431
432 #endif