1 #ifndef Mothur_makecontigscommand_h
2 #define Mothur_makecontigscommand_h
5 // makecontigscommand.h
8 // Created by Sarah Westcott on 5/15/12.
9 // Copyright (c) 2012 Schloss Lab. All rights reserved.
12 #include "command.hpp"
13 #include "sequence.hpp"
14 #include "qualityscores.h"
15 #include "alignment.hpp"
16 #include "gotohoverlap.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "blastalign.hpp"
19 #include "noalign.hpp"
20 #include "trimoligos.h"
27 fastqRead() { name = ""; sequence = ""; scores.clear(); };
28 fastqRead(string n, string s, vector<int> sc) : name(n), sequence(s), scores(sc) {};
32 struct pairFastqRead {
37 pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
40 /**************************************************************************************************/
42 class MakeContigsCommand : public Command {
44 MakeContigsCommand(string);
46 ~MakeContigsCommand(){}
48 vector<string> setParameters();
49 string getCommandName() { return "make.contigs"; }
50 string getCommandCategory() { return "Sequence Processing"; }
51 //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
52 string getOutputFileNameTag(string, string);
53 string getHelpString();
54 string getCitation() { return "http://www.mothur.org/wiki/Make.contigs"; }
55 string getDescription() { return "description"; }
58 void help() { m->mothurOut(getHelpString()); }
61 bool abort, allFiles, createGroup;
62 string outputDir, ffastqfile, rfastqfile, align, oligosfile;
63 float match, misMatch, gapOpen, gapExtend;
64 int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
65 vector<string> outputNames;
67 map<int, oligosPair> barcodes;
68 map<int, oligosPair> primers;
69 vector<string> linker;
70 vector<string> spacer;
71 vector<string> primerNameVector;
72 vector<string> barcodeNameVector;
74 map<string, int> groupCounts;
75 map<string, string> groupMap;
76 //map<string, int> combos;
77 //map<string, int> groupToIndex;
78 //vector<string> groupVector;
80 fastqRead readFastq(ifstream&, bool&);
81 vector< vector<string> > readFastqFiles(int&);
82 bool checkReads(fastqRead&, fastqRead&);
83 int createProcesses(vector< vector<string> >, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
84 int driver(vector<string>, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
85 bool getOligos(vector<vector<string> >&, vector<vector<string> >&);
86 string reverseOligo(string);
87 vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
90 /**************************************************************************************************/
92 /**************************************************************************************************/
93 //custom data structure for threads to use.
94 // This is passed by void pointer so it can be any data type
95 // that can be passed using a single void pointer (LPVOID).
99 string outputScrapFasta;
100 string outputScrapQual;
101 string outputMisMatches;
103 vector<string> files;
104 vector<vector<string> > fastaFileNames;
105 vector<vector<string> > qualFileNames;
107 float match, misMatch, gapOpen, gapExtend;
108 int count, threshold, threadID, pdiffs, bdiffs, tdiffs;
109 bool allFiles, createGroup;
110 map<string, int> groupCounts;
111 map<string, string> groupMap;
112 vector<string> primerNameVector;
113 vector<string> barcodeNameVector;
114 map<int, oligosPair> barcodes;
115 map<int, oligosPair> primers;
118 contigsData(vector<string> f, string of, string oq, string osf, string osq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) {
122 outputMisMatches = om;
131 outputScrapFasta = osf;
132 outputScrapQual = osq;
133 fastaFileNames = ffn;
137 barcodeNameVector = bnv;
138 primerNameVector = pnv;
148 /**************************************************************************************************/
149 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
151 static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
152 contigsData* pDataArray;
153 pDataArray = (contigsData*)lpParam;
156 int longestBase = 1000;
157 Alignment* alignment;
158 if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase); }
159 else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase); }
162 string thisffastafile = pDataArray->files[0];
163 string thisfqualfile = pDataArray->files[1];
164 string thisrfastafile = pDataArray->files[2];
165 string thisrqualfile = pDataArray->files[3];
167 if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
169 if(pDataArray->allFiles){
170 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
171 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
172 if (pDataArray->fastaFileNames[i][j] != "") {
174 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
175 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
181 ifstream inFFasta, inRFasta, inFQual, inRQual;
182 pDataArray->m->openInputFile(thisffastafile, inFFasta);
183 pDataArray->m->openInputFile(thisfqualfile, inFQual);
184 pDataArray->m->openInputFile(thisrfastafile, inRFasta);
185 pDataArray->m->openInputFile(thisrqualfile, inRQual);
187 ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual;
188 pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
189 pDataArray->m->openOutputFile(pDataArray->outputQual, outQual);
190 pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
191 pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
192 pDataArray->m->openOutputFile(pDataArray->outputScrapQual, outScrapQual);
193 outMisMatch << "Name\tLength\tMisMatches\n";
195 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes);
197 while ((!inFQual.eof()) && (!inFFasta.eof()) && (!inRFasta.eof()) && (!inRQual.eof())) {
199 if (pDataArray->m->control_pressed) { break; }
202 string trashCode = "";
203 int currentSeqsDiffs = 0;
205 //read seqs and quality info
206 Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
207 Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
208 QualityScores fQual(inFQual); pDataArray->m->gobble(inFQual);
209 QualityScores rQual(inRQual); pDataArray->m->gobble(inRQual);
211 int barcodeIndex = 0;
214 if(pDataArray->barcodes.size() != 0){
215 success = trimOligos.stripBarcode(fSeq, rSeq, fQual, rQual, barcodeIndex);
216 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
217 else{ currentSeqsDiffs += success; }
220 if(pDataArray->primers.size() != 0){
221 success = trimOligos.stripForward(fSeq, rSeq, fQual, rQual, primerIndex);
222 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
223 else{ currentSeqsDiffs += success; }
226 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
228 //flip the reverse reads
229 rSeq.reverseComplement();
233 alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
234 map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
235 map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
236 fSeq.setAligned(alignment->getSeqAAln());
237 rSeq.setAligned(alignment->getSeqBAln());
238 int length = fSeq.getAligned().length();
240 //traverse alignments merging into one contiguous seq
242 vector<int> contigScores;
243 int numMismatches = 0;
244 string seq1 = fSeq.getAligned();
245 string seq2 = rSeq.getAligned();
246 vector<int> scores1 = fQual.getQualityScores();
247 vector<int> scores2 = rQual.getQualityScores();
249 int overlapStart = fSeq.getStartPos();
250 int seq2Start = rSeq.getStartPos();
251 //bigger of the 2 starting positions is the location of the overlapping start
252 if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
253 overlapStart = seq2Start;
254 for (int i = 0; i < overlapStart; i++) {
256 contigScores.push_back(scores1[ABaseMap[i]]);
258 }else { //seq1 starts later so take from 0 to overlapStart from seq2
259 for (int i = 0; i < overlapStart; i++) {
261 contigScores.push_back(scores2[BBaseMap[i]]);
265 int seq1End = fSeq.getEndPos();
266 int seq2End = rSeq.getEndPos();
267 int overlapEnd = seq1End;
268 if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends
270 for (int i = overlapStart; i < overlapEnd; i++) {
271 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
273 contigScores.push_back(scores1[ABaseMap[i]]);
274 if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
275 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
276 if (scores2[BBaseMap[i]] < pDataArray->threshold) { } //
279 contigScores.push_back(scores2[BBaseMap[i]]);
281 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
282 if (scores1[ABaseMap[i]] < pDataArray->threshold) { } //
285 contigScores.push_back(scores1[ABaseMap[i]]);
287 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
289 contigScores.push_back(scores1[ABaseMap[i]]);
290 if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
293 }else { //should never get here
294 pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
298 if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
299 for (int i = overlapEnd; i < length; i++) {
301 contigScores.push_back(scores2[BBaseMap[i]]);
303 }else { //seq2 ends before seq1 so take from overlap to length from seq1
304 for (int i = overlapEnd; i < length; i++) {
306 contigScores.push_back(scores1[ABaseMap[i]]);
311 if(trashCode.length() == 0){
312 if (pDataArray->createGroup) {
313 if(pDataArray->barcodes.size() != 0){
314 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
315 if (pDataArray->primers.size() != 0) {
316 if (pDataArray->primerNameVector[primerIndex] != "") {
317 if(thisGroup != "") {
318 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
320 thisGroup = pDataArray->primerNameVector[primerIndex];
325 if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
327 pDataArray->groupMap[fSeq.getName()] = thisGroup;
329 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
330 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; }
331 else { pDataArray->groupCounts[it->first] ++; }
336 if(pDataArray->allFiles){
338 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
339 output << ">" << fSeq.getName() << endl << contig << endl;
342 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
343 output << ">" << fSeq.getName() << endl;
344 for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; }
350 outFasta << ">" << fSeq.getName() << endl << contig << endl;
351 outQual << ">" << fSeq.getName() << endl;
352 for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
354 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
357 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
358 outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl;
359 for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; }
360 outScrapQual << endl;
365 if((num) % 1000 == 0){ pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine(); }
369 if((num) % 1000 != 0){ pDataArray->m->mothurOut(toString(num)); pDataArray->m->mothurOutEndLine(); }
378 outScrapFasta.close();
379 outScrapQual.close();
382 if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapQual); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta);}
387 catch(exception& e) {
388 pDataArray->m->errorOut(e, "AlignCommand", "MyContigsThreadFunction");