]> git.donarmstrong.com Git - mothur.git/blob - searchchime.cpp
a1c981a5dc32f7b267c8c0a11e727870a426e4fe
[mothur.git] / searchchime.cpp
1 //uchime by Robert C. Edgar http://drive5.com/uchime This code is donated to the public domain.\r
2 \r
3 #include "myutils.h"\r
4 #include "ultra.h"\r
5 #include "chime.h"\r
6 #include "uc.h"\r
7 #include "dp.h"\r
8 #include <set>\r
9 #include <algorithm>\r
10 \r
11 #define TRACE   0\r
12 \r
13 extern FILE *g_fUChime;\r
14 \r
15 void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
16   vector<unsigned> &Parents);\r
17 \r
18 void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
19   const string &PathQA, const string &PathQB, ChimeHit2 &Hit);\r
20 \r
21 double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);\r
22 \r
23 static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,\r
24   vector<unsigned> &IdVec, unsigned d)\r
25         {\r
26         IdVec.clear();\r
27         const unsigned ColCount = SIZE(Path);\r
28 \r
29         const byte *Q = QSD.Seq;\r
30         const byte *P = PSD.Seq;\r
31 \r
32         const unsigned QL = QSD.L;\r
33         const unsigned PL = PSD.L;\r
34 \r
35         if (QL <= d)\r
36                 {\r
37                 IdVec.resize(QSD.L, 0);\r
38                 return;\r
39                 }\r
40 \r
41         unsigned QPos = 0;\r
42         unsigned PPos = 0;\r
43 \r
44         vector<bool> SameVec;\r
45         SameVec.reserve(QL);\r
46         for (unsigned Col = 0; Col < ColCount; ++Col)\r
47                 {\r
48                 char c = Path[Col];\r
49 \r
50                 bool Same = false;\r
51                 if (c == 'M')\r
52                         {\r
53                         byte q = Q[QPos];\r
54                         byte p = P[PPos];\r
55                         Same = (toupper(q) == toupper(p));\r
56                         }\r
57 \r
58                 if (c == 'M' || c == 'D')\r
59                         {\r
60                         ++QPos;\r
61                         SameVec.push_back(Same);\r
62                         }\r
63 \r
64                 if (c == 'M' || c == 'I')\r
65                         ++PPos;\r
66                 }\r
67 \r
68         asserta(SIZE(SameVec) == QL);\r
69 \r
70         unsigned n = 0;\r
71         for (unsigned QPos = 0; QPos < d; ++QPos)\r
72                 {\r
73                 if (SameVec[QPos])\r
74                         ++n;\r
75                 IdVec.push_back(n);\r
76                 }\r
77 \r
78         for (unsigned QPos = d; QPos < QL; ++QPos)\r
79                 {\r
80                 if (SameVec[QPos])\r
81                         ++n;\r
82                 IdVec.push_back(n);\r
83                 if (SameVec[QPos-d])\r
84                         --n;\r
85                 }\r
86         asserta(SIZE(IdVec) == QL);\r
87 \r
88 #if     TRACE\r
89         {\r
90         Log("\n");\r
91         Log("GetSmoothedIdVec\n");\r
92         unsigned QPos = 0;\r
93         unsigned PPos = 0;\r
94         Log("Q P  Same       Id\n");\r
95         Log("- -  ----  -------\n");\r
96         for (unsigned Col = 0; Col < ColCount; ++Col)\r
97                 {\r
98                 char c = Path[Col];\r
99 \r
100                 bool Same = false;\r
101                 if (c == 'M')\r
102                         {\r
103                         byte q = Q[QPos];\r
104                         byte p = P[PPos];\r
105                         Same = (toupper(q) == toupper(p));\r
106                         Log("%c %c  %4c  %7d\n", q, p, tof(Same), IdVec[QPos]);\r
107                         }\r
108 \r
109                 if (c == 'M' || c == 'D')\r
110                         ++QPos;\r
111                 if (c == 'M' || c == 'I')\r
112                         ++PPos;\r
113                 }\r
114         }\r
115 #endif\r
116         }\r
117 \r
118 bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
119   const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
120   float MinFractId, ChimeHit2 &Hit)\r
121         {\r
122         Hit.Clear();\r
123         Hit.QLabel = QSD.Label;\r
124 \r
125         if (opt_verbose)\r
126                 {\r
127                 Log("\n");\r
128                 Log("SearchChime()\n");\r
129                 Log("Query>%s\n", QSD.Label);\r
130                 }\r
131 \r
132         vector<unsigned> Parents;\r
133         GetCandidateParents(U, QSD, QAb, Parents);\r
134 \r
135         unsigned ParentCount = SIZE(Parents);\r
136         if (ParentCount <= 1)\r
137                 {\r
138                 if (opt_verbose)\r
139                         Log("%u candidate parents, done.\n", ParentCount);\r
140                 return false;\r
141                 }\r
142 \r
143         if (opt_fastalign)\r
144                 HF.SetA(QSD);\r
145         HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);\r
146 \r
147         unsigned ChunkLength;\r
148         vector<unsigned> ChunkLos;\r
149         GetChunkInfo(QSD.L, ChunkLength, ChunkLos);\r
150         const unsigned ChunkCount = SIZE(ChunkLos);\r
151 \r
152         vector<unsigned> ChunkIndexToBestId(ChunkCount, 0);\r
153         vector<unsigned> ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX);\r
154 \r
155         vector<SeqData> PSDs;\r
156         vector<string> Paths;\r
157         double TopPctId = 0.0;\r
158         unsigned TopParentIndex = UINT_MAX;\r
159         unsigned QL = QSD.L;\r
160         vector<unsigned> MaxIdVec(QL, 0);\r
161         for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
162                 {\r
163                 unsigned ParentSeqIndex = Parents[ParentIndex];\r
164 \r
165                 SeqData PSD;\r
166                 //PSD.Label = U.GetSeedLabel(ParentSeqIndex);\r
167                 //PSD.Seq = U.GetSeedSeq(ParentSeqIndex);\r
168                 //PSD.L = U.GetSeedLength(ParentSeqIndex);\r
169                 //PSD.Index = ParentSeqIndex;\r
170                 U.GetSeqData(ParentSeqIndex, PSD);\r
171                 PSDs.push_back(PSD);\r
172 \r
173                 if (opt_fastalign)\r
174                         HF.SetB(PSD);\r
175 \r
176                 PathData PD;\r
177 \r
178                 float HSPId;\r
179                 bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);\r
180                 if (!Found)\r
181                         {\r
182                         Paths.push_back("");                            \r
183                         continue;\r
184                         }\r
185 \r
186                 double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);\r
187                 if (opt_selfid && PctId == 100.0)\r
188                         {\r
189                         Paths.push_back("");                            \r
190                         continue;\r
191                         }\r
192 \r
193                 if (PctId > TopPctId)\r
194                         {\r
195                         TopParentIndex = ParentIndex;\r
196                         TopPctId = PctId;\r
197                         if (TopPctId >= 100.0 - opt_mindiv)\r
198                                 {\r
199                                 if (opt_verbose)\r
200                                         {\r
201                                         Log("  %.1f%%  >%s\n", TopPctId, PSD.Label);\r
202                                         Log("  Top hit exceeds ctl threshold, done.\n");\r
203                                         return false;\r
204                                         }\r
205                                 }\r
206                         }\r
207 \r
208                 string Path = PD.Start;\r
209                 Paths.push_back(Path);\r
210 \r
211                 vector<unsigned> IdVec;\r
212                 GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
213 \r
214                 for (unsigned QPos = 0; QPos < QL; ++QPos)\r
215                         if (IdVec[QPos] > MaxIdVec[QPos])\r
216                                 MaxIdVec[QPos] = IdVec[QPos];\r
217                 }\r
218 \r
219         vector<unsigned> BestParents;\r
220         for (unsigned k = 0; k < opt_maxp; ++k)\r
221                 {\r
222                 unsigned BestParent = UINT_MAX;\r
223                 unsigned BestCov = 0;\r
224                 for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
225                         {\r
226                         const SeqData &PSD = PSDs[ParentIndex];\r
227                         const string &Path = Paths[ParentIndex];\r
228                         if (Path == "")\r
229                                 continue;\r
230 \r
231                         vector<unsigned> IdVec;\r
232                         GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
233 \r
234                         unsigned Cov = 0;\r
235                         for (unsigned QPos = 0; QPos < QL; ++QPos)\r
236                                 if (IdVec[QPos] == MaxIdVec[QPos])\r
237                                         ++Cov;\r
238 \r
239                         if (Cov > BestCov)\r
240                                 {\r
241                                 BestParent = ParentIndex;\r
242                                 BestCov = Cov;\r
243                                 }\r
244                         }\r
245 \r
246                 if (BestParent == UINT_MAX)\r
247                         break;\r
248 \r
249                 BestParents.push_back(BestParent);\r
250                 vector<unsigned> IdVec;\r
251 \r
252                 const SeqData &PSD = PSDs[BestParent];\r
253                 const string &Path = Paths[BestParent];\r
254                 GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
255                 for (unsigned QPos = 0; QPos < QL; ++QPos)\r
256                         if (IdVec[QPos] == MaxIdVec[QPos])\r
257                                 MaxIdVec[QPos] = UINT_MAX;\r
258                 }\r
259 \r
260         unsigned BestParentCount = SIZE(BestParents);\r
261 \r
262         if (opt_verbose)\r
263                 {\r
264                 Log("%u/%u best parents\n", BestParentCount, ParentCount);\r
265                 for (unsigned k = 0; k < BestParentCount; ++k)\r
266                         {\r
267                         unsigned i = BestParents[k];\r
268                         Log(" %s\n", PSDs[i].Label);\r
269                         }\r
270                 }\r
271 \r
272         bool Found = false;\r
273         for (unsigned k1 = 0; k1 < BestParentCount; ++k1)\r
274                 {\r
275                 unsigned i1 = BestParents[k1];\r
276                 asserta(i1 < ParentCount);\r
277 \r
278                 const SeqData &PSD1 = PSDs[i1];\r
279                 const string &Path1 = Paths[i1];\r
280 \r
281                 for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)\r
282                         {\r
283                         unsigned i2 = BestParents[k2];\r
284                         asserta(i2 < ParentCount);\r
285                         asserta(i2 != i1);\r
286 \r
287                         const SeqData &PSD2 = PSDs[i2];\r
288                         const string &Path2 = Paths[i2];\r
289 \r
290                         ChimeHit2 Hit2;\r
291                         AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);\r
292                         Hit2.PctIdQT = TopPctId;\r
293 \r
294                         if (Hit2.Accept())\r
295                                 Found = true;\r
296 \r
297                         if (Hit2.Score > Hit.Score)\r
298                                 Hit = Hit2;\r
299 \r
300                         if (opt_verbose)\r
301                                 Hit2.LogMe();\r
302                         }\r
303                 }\r
304 \r
305         return Found;\r
306         }\r