1 #ifndef Mothur_makecontigscommand_h
2 #define Mothur_makecontigscommand_h
5 // makecontigscommand.h
8 // Created by Sarah Westcott on 5/15/12.
9 // Copyright (c) 2012 Schloss Lab. All rights reserved.
12 #include "command.hpp"
13 #include "sequence.hpp"
14 #include "qualityscores.h"
15 #include "alignment.hpp"
16 #include "gotohoverlap.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "blastalign.hpp"
19 #include "noalign.hpp"
20 #include "trimoligos.h"
28 fastqRead() { name = ""; sequence = ""; scores.clear(); };
29 fastqRead(string n, string s, vector<int> sc) : name(n), sequence(s), scores(sc) {};
33 struct pairFastqRead {
40 pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
41 pairFastqRead(fastqRead f, fastqRead r, fastqRead fi, fastqRead ri) : forward(f), reverse(r), findex(fi), rindex(ri) {};
44 /**************************************************************************************************/
46 class MakeContigsCommand : public Command {
48 MakeContigsCommand(string);
50 ~MakeContigsCommand(){}
52 vector<string> setParameters();
53 string getCommandName() { return "make.contigs"; }
54 string getCommandCategory() { return "Sequence Processing"; }
55 //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
57 string getHelpString();
58 string getOutputPattern(string);
59 string getCitation() { return "http://www.mothur.org/wiki/Make.contigs"; }
60 string getDescription() { return "description"; }
63 void help() { m->mothurOut(getHelpString()); }
66 bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount, noneOk, reorient;
67 string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, findexfile, rindexfile, file, format, inputDir;
68 float match, misMatch, gapOpen, gapExtend;
69 int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq, numBarcodes, numFPrimers, numLinkers, numSpacers, numRPrimers;
70 vector<string> outputNames;
72 vector<char> convertTable;
74 map<string, int> groupCounts;
75 map<string, string> groupMap;
76 map<int, string> file2Group;
78 vector<int> convertQual(string);
79 fastqRead readFastq(ifstream&, bool&);
80 vector< vector< vector<string> > > preProcessData(unsigned long int&);
81 vector< vector<string> > readFileNames(string);
82 vector< vector<string> > readFastqFiles(unsigned long int&, string, string, string, string);
83 vector< vector<string> > readFastaFiles(unsigned long int&, string, string);
84 //bool checkReads(fastqRead&, fastqRead&, string, string);
85 int createProcesses(vector< vector<string> >, string, string, string, vector<vector<string> >, int);
86 int driver(vector<string>, string, string, string, vector<vector<string> >, int, string);
87 bool getOligos(vector<vector<string> >&, string, map<string, string>&);
88 vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool);
89 vector<pairFastqRead> mergeReads(vector<pairFastqRead> frReads, vector<pairFastqRead> friReads, map<string, pairFastqRead>& pairUniques);
92 /**************************************************************************************************/
94 /**************************************************************************************************/
95 //custom data structure for threads to use.
96 // This is passed by void pointer so it can be any data type
97 // that can be passed using a single void pointer (LPVOID).
100 string outputScrapFasta;
101 string outputMisMatches;
102 string align, group, oligosfile;
103 vector<string> files;
104 vector<vector<string> > fastaFileNames;
106 float match, misMatch, gapOpen, gapExtend;
107 int count, insert, threadID, pdiffs, bdiffs, tdiffs, deltaq;
108 bool allFiles, createOligosGroup, createFileGroup, done, trimOverlap, reorient;
109 map<string, int> groupCounts;
110 map<string, string> groupMap;
114 contigsData(string g, vector<string> f, string of, string osf, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int delt, vector<vector<string> > ffn, string olig, bool ro, int pdf, int bdf, int tdf, bool cg, bool cfg, bool all, bool to, int tid) {
117 outputMisMatches = om;
127 outputScrapFasta = osf;
128 fastaFileNames = ffn;
135 createOligosGroup = cg;
136 createFileGroup = cfg;
144 /**************************************************************************************************/
145 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
147 static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
148 contigsData* pDataArray;
149 pDataArray = (contigsData*)lpParam;
152 int longestBase = 1000;
153 Alignment* alignment;
154 if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase); }
155 else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase); }
157 pDataArray->count = 0;
158 string thisffastafile = pDataArray->files[0];
159 string thisfqualfile = pDataArray->files[1];
160 string thisrfastafile = pDataArray->files[2];
161 string thisrqualfile = pDataArray->files[3];
162 string thisfindexfile = pDataArray->files[4];
163 string thisrindexfile = pDataArray->files[5];
165 if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
167 if(pDataArray->allFiles){
168 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
169 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
170 if (pDataArray->fastaFileNames[i][j] != "") {
172 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
178 ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
179 ofstream outFasta, outMisMatch, outScrapFasta;
180 pDataArray->m->openInputFile(thisffastafile, inFFasta);
181 pDataArray->m->openInputFile(thisrfastafile, inRFasta);
182 if (thisfqualfile != "") {
183 pDataArray->m->openInputFile(thisfqualfile, inFQual);
184 pDataArray->m->openInputFile(thisrqualfile, inRQual);
187 if (thisfindexfile != "") { pDataArray->m->openInputFile(thisfindexfile, inFIndex); }
188 if (thisrindexfile != "") { pDataArray->m->openInputFile(thisrindexfile, inRIndex); }
190 pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
191 pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
192 pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
194 outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n";
197 if (pDataArray->oligosfile != "") { oligos.read(pDataArray->oligosfile); }
198 int numFPrimers = oligos.getPairedPrimers().size();
199 int numBarcodes = oligos.getPairedBarcodes().size();
202 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, oligos.getPairedPrimers(), oligos.getPairedBarcodes());
203 TrimOligos* rtrimOligos = NULL;
204 if (pDataArray->reorient) {
205 rtrimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, oligos.getReorientedPairedPrimers(), oligos.getReorientedPairedBarcodes()); numBarcodes = oligos.getReorientedPairedBarcodes().size();
208 while ((!inFFasta.eof()) && (!inRFasta.eof())) {
210 if (pDataArray->m->control_pressed) { break; }
213 string trashCode = "";
214 int currentSeqsDiffs = 0;
216 //read seqs and quality info
217 Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
218 Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
219 QualityScores* fQual = NULL; QualityScores* rQual = NULL;
220 if (thisfqualfile != "") {
221 fQual = new QualityScores(inFQual); pDataArray->m->gobble(inFQual);
222 rQual = new QualityScores(inRQual); pDataArray->m->gobble(inRQual);
225 Sequence findexBarcode("findex", "NONE"); Sequence rindexBarcode("rindex", "NONE");
226 if (thisfindexfile != "") {
227 Sequence temp(inFIndex); pDataArray->m->gobble(inFIndex);
228 findexBarcode.setAligned(temp.getAligned());
231 if (thisrindexfile != "") {
232 Sequence temp(inRIndex); pDataArray->m->gobble(inRIndex);
233 rindexBarcode.setAligned(temp.getAligned());
236 int barcodeIndex = 0;
238 Sequence savedFSeq(fSeq.getName(), fSeq.getAligned()); Sequence savedRSeq(rSeq.getName(), rSeq.getAligned());
239 Sequence savedFindex(findexBarcode.getName(), findexBarcode.getAligned()); Sequence savedRIndex(rindexBarcode.getName(), rindexBarcode.getAligned());
240 QualityScores* savedFQual = NULL; QualityScores* savedRQual = NULL;
241 if (thisfqualfile != "") {
242 savedFQual = new QualityScores(fQual->getName(), fQual->getQualityScores());
243 savedRQual = new QualityScores(rQual->getName(), rQual->getQualityScores());
246 if(numBarcodes != 0){
247 if (thisfqualfile != "") {
248 if ((thisfindexfile != "") || (thisrindexfile != "")) {
249 success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
251 success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
254 success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
256 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
257 else{ currentSeqsDiffs += success; }
260 if(numFPrimers != 0){
261 if (thisfqualfile != "") {
262 success = trimOligos.stripForward(fSeq, rSeq, *fQual, *rQual, primerIndex);
264 success = trimOligos.stripForward(fSeq, rSeq, primerIndex);
266 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
267 else{ currentSeqsDiffs += success; }
270 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
272 if (pDataArray->reorient && (trashCode != "")) { //if you failed and want to check the reverse
274 string thisTrashCode = "";
275 int thisCurrentSeqsDiffs = 0;
277 int thisBarcodeIndex = 0;
278 int thisPrimerIndex = 0;
280 if(numBarcodes != 0){
281 if (thisfqualfile != "") {
282 if ((thisfindexfile != "") || (thisrindexfile != "")) {
283 thisSuccess = rtrimOligos->stripBarcode(savedFindex, savedRIndex, *savedFQual, *savedRQual, thisBarcodeIndex);
285 thisSuccess = rtrimOligos->stripBarcode(savedFSeq, savedRSeq, *savedFQual, *savedRQual, thisBarcodeIndex);
288 thisSuccess = rtrimOligos->stripBarcode(savedFSeq, savedRSeq, thisBarcodeIndex);
290 if(thisSuccess > pDataArray->bdiffs) { thisTrashCode += 'b'; }
291 else{ thisCurrentSeqsDiffs += thisSuccess; }
294 if(numFPrimers != 0){
295 if (thisfqualfile != "") {
296 thisSuccess = rtrimOligos->stripForward(savedFSeq, savedRSeq, *savedFQual, *savedRQual, thisPrimerIndex);
298 thisSuccess = rtrimOligos->stripForward(savedFSeq, savedRSeq, thisPrimerIndex);
300 if(thisSuccess > pDataArray->pdiffs) { thisTrashCode += 'f'; }
301 else{ thisCurrentSeqsDiffs += thisSuccess; }
304 if (thisCurrentSeqsDiffs > pDataArray->tdiffs) { thisTrashCode += 't'; }
306 if (thisTrashCode == "") {
307 trashCode = thisTrashCode;
308 success = thisSuccess;
309 currentSeqsDiffs = thisCurrentSeqsDiffs;
310 barcodeIndex = thisBarcodeIndex;
311 primerIndex = thisPrimerIndex;
312 savedFSeq.reverseComplement();
313 savedRSeq.reverseComplement();
314 fSeq.setAligned(savedFSeq.getAligned());
315 rSeq.setAligned(savedRSeq.getAligned());
316 if(thisfqualfile != ""){
317 savedFQual->flipQScores(); savedRQual->flipQScores();
318 fQual->setScores(savedFQual->getScores()); rQual->setScores(savedRQual->getScores());
320 }else { trashCode += "(" + thisTrashCode + ")"; }
323 //flip the reverse reads
324 rSeq.reverseComplement();
325 if (thisfqualfile != "") { rQual->flipQScores(); }
328 alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
329 map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
330 map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
331 fSeq.setAligned(alignment->getSeqAAln());
332 rSeq.setAligned(alignment->getSeqBAln());
333 int length = fSeq.getAligned().length();
335 //traverse alignments merging into one contiguous seq
337 int numMismatches = 0;
338 string seq1 = fSeq.getAligned();
339 string seq2 = rSeq.getAligned();
340 vector<int> scores1, scores2;
341 if (thisfqualfile != "") {
342 scores1 = fQual->getQualityScores();
343 scores2 = rQual->getQualityScores();
344 delete fQual; delete rQual; delete savedFQual; delete savedRQual;
347 int overlapStart = fSeq.getStartPos();
348 int seq2Start = rSeq.getStartPos();
349 //bigger of the 2 starting positions is the location of the overlapping start
350 if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
351 overlapStart = seq2Start;
352 for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; }
353 }else { //seq1 starts later so take from 0 to overlapStart from seq2
354 for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; }
357 int seq1End = fSeq.getEndPos();
358 int seq2End = rSeq.getEndPos();
359 int overlapEnd = seq1End;
360 if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends
362 int oStart = contig.length();
363 for (int i = overlapStart; i < overlapEnd; i++) {
364 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
366 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base
367 if (thisfqualfile != "") {
368 if (scores2[BBaseMap[i]] < pDataArray->insert) { } //
369 else { contig += seq2[i]; }
370 }else { contig += seq2[i]; } //with no quality info, then we keep it?
371 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base
372 if (thisfqualfile != "") {
373 if (scores1[ABaseMap[i]] < pDataArray->insert) { } //
374 else { contig += seq1[i]; }
375 }else { contig += seq1[i]; } //with no quality info, then we keep it?
376 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
377 if (thisfqualfile != "") {
378 if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= pDataArray->deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score
380 if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; }
382 }else { //if no, base becomes n
386 }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch
387 }else { //should never get here
388 pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
391 int oend = contig.length();
393 if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
394 for (int i = overlapEnd; i < length; i++) { contig += seq2[i]; }
395 }else { //seq2 ends before seq1 so take from overlap to length from seq1
396 for (int i = overlapEnd; i < length; i++) { contig += seq1[i]; }
399 if (pDataArray->trimOverlap) { contig = contig.substr(overlapStart-1, oend-oStart); if (contig.length() == 0) { trashCode += "l"; } }
401 if(trashCode.length() == 0){
403 if (pDataArray->createOligosGroup) {
404 string thisGroup = oligos.getGroupName(barcodeIndex, primerIndex);
405 if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
407 int pos = thisGroup.find("ignore");
408 if (pos == string::npos) {
409 pDataArray->groupMap[fSeq.getName()] = thisGroup;
411 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
412 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; }
413 else { pDataArray->groupCounts[it->first] ++; }
414 }else { ignore = true; }
415 }else if (pDataArray->createFileGroup) {
416 int pos = pDataArray->group.find("ignore");
417 if (pos == string::npos) {
418 pDataArray->groupMap[fSeq.getName()] = pDataArray->group;
420 map<string, int>::iterator it = pDataArray->groupCounts.find(pDataArray->group);
421 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[pDataArray->group] = 1; }
422 else { pDataArray->groupCounts[it->first]++; }
423 }else { ignore = true; }
427 if(pDataArray->allFiles && !ignore){
429 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
430 output << ">" << fSeq.getName() << endl << contig << endl;
435 outFasta << ">" << fSeq.getName() << endl << contig << endl;
437 for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; } }
438 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl;
441 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
446 if((pDataArray->count) % 1000 == 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
450 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
456 outScrapFasta.close();
457 if (thisfqualfile != "") {
462 if (pDataArray->reorient) { delete rtrimOligos; }
464 pDataArray->done = true;
465 if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta); }
470 catch(exception& e) {
471 pDataArray->m->errorOut(e, "AlignCommand", "MyContigsThreadFunction");