]> git.donarmstrong.com Git - mothur.git/blob - chimeraslayer.cpp
bd1908dd035f6a123ee1f749dfb47579ef5b831d
[mothur.git] / chimeraslayer.cpp
1 /*
2  *  chimeraslayer.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/25/09.
6  *  Copyright 2009 Pschloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeraslayer.h"
11
12 //***************************************************************************************************************
13 ChimeraSlayer::ChimeraSlayer(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
14 //***************************************************************************************************************
15
16 ChimeraSlayer::~ChimeraSlayer() {
17         try {
18                 for (int i = 0; i < querySeqs.size(); i++)                      {  delete querySeqs[i];                 }
19                 for (int i = 0; i < templateSeqs.size(); i++)           {  delete templateSeqs[i];              }
20         }
21         catch(exception& e) {
22                 errorOut(e, "ChimeraSlayer", "~ChimeraSlayer");
23                 exit(1);
24         }
25 }       
26 //***************************************************************************************************************
27 void ChimeraSlayer::print(ostream& out) {
28         try {
29                 mothurOutEndLine();
30                 
31                 for (int i = 0; i < querySeqs.size(); i++) {
32                 
33                         if (chimeraFlags[i] == "yes") { 
34                                 mothurOut(querySeqs[i]->getName() + "\tyes"); mothurOutEndLine();
35                         
36                         }else{
37                                 out << querySeqs[i]->getName() << "\tno" << endl;
38                                 mothurOut("no");
39                         }
40                 }
41 /*              
42         
43         my $div_ratio_QLA_QRB = $data_struct->{div_ratio_QLA_QRB};
44         my $div_ratio_QRA_QLB = $data_struct->{div_ratio_QLB_QRA};
45         
46         my $per_id_QLA = $data_struct->{per_id_QLA};
47         my $per_id_QRB = $data_struct->{per_id_QRB};
48         my $per_id_AB = $data_struct->{per_id_AB};
49         my $per_id_QA = $data_struct->{per_id_QA};
50         my $per_id_QB = $data_struct->{per_id_QB}; 
51         my $per_id_LAB = $data_struct->{per_id_LAB};
52         my $per_id_RAB = $data_struct->{per_id_RAB};
53         my $per_id_QRA = $data_struct->{per_id_QRA};
54         my $per_id_QLB = $data_struct->{per_id_QLB};
55         my $per_id_QLB_QRA = $data_struct->{per_id_QLB_QRA};
56         my $per_id_QLA_QRB = $data_struct->{per_id_QLA_QRB};
57         
58         my $win_left_end5 = $data_struct->{win_left_end5};
59         my $win_left_end3 = $data_struct->{win_left_end3};
60         my $win_right_end5 = $data_struct->{win_right_end5};
61         my $win_right_end3 = $data_struct->{win_right_end3};
62         my $Q = $data_struct->{query_alignment};
63         my $A = $data_struct->{parent_A_alignment};
64         my $B = $data_struct->{parent_B_alignment}; 
65         my $BS_A = $data_struct->{BS_A};
66         my $BS_B = $data_struct->{BS_B};
67         
68         my @Q_chars = @{$Q->{align}};
69         my @A_chars = @{$A->{align}};
70         my @B_chars = @{$B->{align}};
71         
72         my $query_acc = $Q->{acc};
73         my $A_acc = $A->{acc};
74         my $B_acc = $B->{acc};
75         
76         my $break_left = $Q->{seqPos}->[$win_left_end3];
77         my $break_right = $Q->{seqPos}->[$win_right_end5];
78         
79         
80         cout << "//\n## CHIMERA\t" << querySeqs[i]->getName() << "\t" << $break_left-$break_right" << endl  
81                 << "\tDIV_QLARB: ". sprintf("%.3f", $div_ratio_QLA_QRB)
82                 << "\tBS_QLARB: " . sprintf("%.2f", $BS_A)
83                 << "\tDIV_QRALB: " . sprintf("%.3f", $div_ratio_QRA_QLB)
84                 << "\tBS_QRALB: " . sprintf("%.2f", $BS_B)
85                 << "\t$A_acc\t$B_acc" 
86                 << "\tbreakpoint: $break_left-$break_right\n\n";
87         
88         ## draw illustration:
89
90         print "            Per_id parents: " . sprintf("%.2f", $per_id_AB) . "\n\n";
91         print "           Per_id(Q,A): " . sprintf("%.2f", $per_id_QA) . "\n";
92         print "--------------------------------------------------- A: $A_acc\n"
93                 . " " . sprintf("%.2f", $per_id_QLA) . "                                " . sprintf("%.2f", $per_id_QRA) . "\n"
94                 . "~~~~~~~~~~~~~~~~~~~~~~~~\\ /~~~~~~~~~~~~~~~~~~~~~~~~ Q: $query_acc\n"
95                 . "DivR: " . sprintf("%.3f", $div_ratio_QLA_QRB) . " BS: " . sprintf("%.2f", $BS_A) . "     |\n"
96                 . "Per_id(QLA,QRB): " . sprintf("%.2f", $per_id_QLA_QRB) . "   |\n"
97                 . "                         |\n"
98                 . "   (L-AB: " . sprintf("%.2f", $per_id_LAB) . ")         |      (R-AB: " . sprintf("%.2f", $per_id_RAB) . ")\n"
99                 . "   WinL:$win_left_end5-$win_left_end3            |      WinR:$win_right_end5-$win_right_end3\n"
100                 . "                         |\n"
101                 . "Per_id(QLB,QRA): " . sprintf("%.2f", $per_id_QLB_QRA) . "   |\n"
102                 . "DivR: " . sprintf("%.3f", $div_ratio_QRA_QLB) . " BS: " . sprintf("%.2f", $BS_B) . "    |\n"
103                 . "~~~~~~~~~~~~~~~~~~~~~~~~/ \\~~~~~~~~~~~~~~~~~~~~~~~~~ Q: $query_acc\n"
104                 . " " . sprintf("%.2f", $per_id_QLB) . "                                " . sprintf("%.2f", $per_id_QRB) . "\n"
105                 . "---------------------------------------------------- B: $B_acc\n";
106         print "            Per_id(Q,B): ". sprintf("%.2f", $per_id_QB) . "\n\n";
107         
108         my $deltaL = $per_id_QLA - $per_id_QLB;
109         my $deltaR = $per_id_QRA - $per_id_QRB;
110
111         print "DeltaL: " . sprintf("%.2f", $deltaL) . "                   DeltaR: " . sprintf("%.2f", $deltaR) . "\n\n";
112         
113         unless ($printAlignmentsFlag) { return; }
114         
115         
116         ## build the left windows:
117         my @Q_left_win = @Q_chars[$win_left_end5..$win_left_end3];
118         my @A_left_win = @A_chars[$win_left_end5..$win_left_end3];
119         my @B_left_win = @B_chars[$win_left_end5..$win_left_end3];
120         
121         &print_alignment($A_acc, \@A_left_win, 
122                                          $query_acc, \@Q_left_win, 
123                                          $B_acc, \@B_left_win);
124         
125         print "\t\t** Breakpoint **\n\n";
126         
127         my @Q_right_win = @Q_chars[$win_right_end5..$win_right_end3];
128         my @A_right_win = @A_chars[$win_right_end5..$win_right_end3];
129         my @B_right_win = @B_chars[$win_right_end5..$win_right_end3];
130         
131         &print_alignment($A_acc, \@A_right_win, 
132                                          $query_acc, \@Q_right_win, 
133                                          $B_acc, \@B_right_win);
134         
135         return;
136 }
137
138
139 ####
140                 
141         */      
142                                 
143         }
144         catch(exception& e) {
145                 errorOut(e, "ChimeraSlayer", "print");
146                 exit(1);
147         }
148 }
149
150 //***************************************************************************************************************
151 void ChimeraSlayer::getChimeras() {
152         try {
153                 
154                 //read in query sequences and subject sequences
155                 mothurOut("Reading sequences and template file... "); cout.flush();
156                 querySeqs = readSeqs(fastafile);
157                 templateSeqs = readSeqs(templateFile);
158                 mothurOut("Done."); mothurOutEndLine();
159                 
160                 int numSeqs = querySeqs.size();
161                 
162                 chimeraResults.resize(numSeqs);
163                 chimeraFlags.resize(numSeqs, "no");
164                 
165                 //break up file if needed
166                 int linesPerProcess = numSeqs / processors ;
167                 
168                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
169                         //find breakup of sequences for all times we will Parallelize
170                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
171                         else {
172                                 //fill line pairs
173                                 for (int i = 0; i < (processors-1); i++) {                      
174                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
175                                 }
176                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
177                                 int i = processors - 1;
178                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
179                         }
180                 #else
181                         lines.push_back(new linePair(0, numSeqs));
182                 #endif
183                 
184                 if (seqMask != "") {    decalc = new DeCalculator();    } //to use below
185                 
186                 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
187                 maligner = new Maligner(templateSeqs, numWanted, match, misMatch, 1.01, minSim);
188                 slayer = new Slayer(window, increment, minSim, divR);
189                 
190                 for (int i = 0; i < querySeqs.size(); i++) {
191                 
192                         string chimeraFlag = maligner->getResults(querySeqs[i]);
193                         float percentIdentical = maligner->getPercentID();
194                         vector<results> Results = maligner->getOutput();
195                         
196                         //cout << querySeqs[i]->getName() << '\t' << chimeraFlag << '\t' << percentIdentical << endl;
197                         
198                         for (int j = 0; j < Results.size(); j++) {
199                                 //cout << "regionStart = " << Results[j].regionStart << "\tRegionEnd = " << Results[j].regionEnd << "\tName = " << Results[j].parent << "\tPerQP = " << Results[j].queryToParent << "\tLocalPerQP = " << Results[j].queryToParentLocal << "\tdivR = " << Results[j].divR << endl;
200                         }
201                         
202                         if (chimeraFlag == "yes") {
203                         
204                                 //get sequence that were given from maligner results
205                                 vector<SeqDist> seqs;
206                                 for (int j = 0; j < Results.size(); j++) {
207                                         Sequence* seq = getSequence(Results[j].parent); //makes copy so you can filter and mask and not effect template
208                                         
209                                         //seq = NULL if error occurred in getSequence
210                                         if (seq == NULL) {  break;      }
211                                         else {  
212                                                 SeqDist member;
213                                                 member.seq = seq;
214                                                 member.dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
215                                                 seqs.push_back(member); 
216                                         }
217                                 }
218                         
219                                 //limit number of parents to explore - default 5
220                                 if (Results.size() > parents) {
221                                         //sort by distance
222                                         sort(seqs.begin(), seqs.end(), compareSeqDist);
223                                         //prioritize larger more similiar sequence fragments
224                                         reverse(seqs.begin(), seqs.end());
225                                         
226                                         for (int k = seqs.size()-1; k > (parents-1); k--)  {  
227                                                 delete seqs[k].seq;
228                                                 seqs.pop_back();        
229                                         }
230                                 }
231                 
232                                 //put seqs into vector to send to slayer
233                                 vector<Sequence*> seqsForSlayer;
234                                 for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
235                         
236                                 //mask then send to slayer...
237                                 if (seqMask != "") {
238                                         decalc->setMask(seqMask);
239
240                                         //mask querys
241                                         decalc->runMask(querySeqs[i]);
242                                         
243                                         //mask parents
244                                         for (int k = 0; k < seqsForSlayer.size(); k++) {
245                                                 decalc->runMask(seqsForSlayer[k]);
246                                         }
247                                         
248                                 }
249                                 
250                                 //send to slayer
251                                 chimeraFlags[i] = slayer->getResults(querySeqs[i], seqsForSlayer);
252                                 chimeraResults[i] = slayer->getOutput();
253                         
254                                 //free memory
255                                 for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
256                         }
257                         
258                 }       
259                 //free memory
260                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];        }
261                 
262                 if (seqMask != "") {
263                         delete decalc; 
264                 }
265
266                         
267         }
268         catch(exception& e) {
269                 errorOut(e, "ChimeraSlayer", "getChimeras");
270                 exit(1);
271         }
272 }
273 //***************************************************************************************************************
274 Sequence* ChimeraSlayer::getSequence(string name) {
275         try{
276                 Sequence* temp;
277                 
278                 //look through templateSeqs til you find it
279                 int spot = -1;
280                 for (int i = 0; i < templateSeqs.size(); i++) {
281                         if (name == templateSeqs[i]->getName()) {  
282                                 spot = i;
283                                 break;
284                         }
285                 }
286                 
287                 if(spot == -1) { mothurOut("Error: Could not find sequence in chimeraSlayer."); mothurOutEndLine(); return NULL; }
288                 
289                 temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
290                 
291                 return temp;
292         }
293         catch(exception& e) {
294                 errorOut(e, "ChimeraSlayer", "getSequence");
295                 exit(1);
296         }
297 }
298 //***************************************************************************************************************
299
300
301