]> git.donarmstrong.com Git - mothur.git/blob - uchime_src/searchchime.cpp
changes while testing
[mothur.git] / uchime_src / searchchime.cpp
1 #include "myutils.h"\r
2 #include "ultra.h"\r
3 #include "chime.h"\r
4 #include "uc.h"\r
5 #include "dp.h"\r
6 #include <set>\r
7 #include <algorithm>\r
8 \r
9 #define TRACE   0\r
10 \r
11 extern FILE *g_fUChime;\r
12 \r
13 void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,\r
14   vector<unsigned> &Parents);\r
15 \r
16 void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,\r
17   const string &PathQA, const string &PathQB, ChimeHit2 &Hit);\r
18 \r
19 double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);\r
20 \r
21 static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,\r
22   vector<unsigned> &IdVec, unsigned d)\r
23         {\r
24         IdVec.clear();\r
25         const unsigned ColCount = SIZE(Path);\r
26 \r
27         const byte *Q = QSD.Seq;\r
28         const byte *P = PSD.Seq;\r
29 \r
30         const unsigned QL = QSD.L;\r
31         const unsigned PL = PSD.L;\r
32 \r
33         if (QL <= d)\r
34                 {\r
35                 IdVec.resize(QSD.L, 0);\r
36                 return;\r
37                 }\r
38 \r
39         unsigned QPos = 0;\r
40         unsigned PPos = 0;\r
41 \r
42         vector<bool> SameVec;\r
43         SameVec.reserve(QL);\r
44         for (unsigned Col = 0; Col < ColCount; ++Col)\r
45                 {\r
46                 char c = Path[Col];\r
47 \r
48                 bool Same = false;\r
49                 if (c == 'M')\r
50                         {\r
51                         byte q = Q[QPos];\r
52                         byte p = P[PPos];\r
53                         Same = (toupper(q) == toupper(p));\r
54                         }\r
55 \r
56                 if (c == 'M' || c == 'D')\r
57                         {\r
58                         ++QPos;\r
59                         SameVec.push_back(Same);\r
60                         }\r
61 \r
62                 if (c == 'M' || c == 'I')\r
63                         ++PPos;\r
64                 }\r
65 \r
66         asserta(SIZE(SameVec) == QL);\r
67 \r
68         unsigned n = 0;\r
69         for (unsigned QPos = 0; QPos < d; ++QPos)\r
70                 {\r
71                 if (SameVec[QPos])\r
72                         ++n;\r
73                 IdVec.push_back(n);\r
74                 }\r
75 \r
76         for (unsigned QPos = d; QPos < QL; ++QPos)\r
77                 {\r
78                 if (SameVec[QPos])\r
79                         ++n;\r
80                 IdVec.push_back(n);\r
81                 if (SameVec[QPos-d])\r
82                         --n;\r
83                 }\r
84         asserta(SIZE(IdVec) == QL);\r
85 \r
86 #if     TRACE\r
87         {\r
88         Log("\n");\r
89         Log("GetSmoothedIdVec\n");\r
90         unsigned QPos = 0;\r
91         unsigned PPos = 0;\r
92         Log("Q P  Same       Id\n");\r
93         Log("- -  ----  -------\n");\r
94         for (unsigned Col = 0; Col < ColCount; ++Col)\r
95                 {\r
96                 char c = Path[Col];\r
97 \r
98                 bool Same = false;\r
99                 if (c == 'M')\r
100                         {\r
101                         byte q = Q[QPos];\r
102                         byte p = P[PPos];\r
103                         Same = (toupper(q) == toupper(p));\r
104                         Log("%c %c  %4c  %7d\n", q, p, tof(Same), IdVec[QPos]);\r
105                         }\r
106 \r
107                 if (c == 'M' || c == 'D')\r
108                         ++QPos;\r
109                 if (c == 'M' || c == 'I')\r
110                         ++PPos;\r
111                 }\r
112         }\r
113 #endif\r
114         }\r
115 \r
116 bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
117   const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
118   float MinFractId, ChimeHit2 &Hit)\r
119         {\r
120         Hit.Clear();\r
121         Hit.QLabel = QSD.Label;\r
122 \r
123         if (opt_verbose)\r
124                 {\r
125                 Log("\n");\r
126                 Log("SearchChime()\n");\r
127                 Log("Query>%s\n", QSD.Label);\r
128                 }\r
129 \r
130         vector<unsigned> Parents;\r
131         GetCandidateParents(U, QSD, QAb, Parents);\r
132 \r
133         unsigned ParentCount = SIZE(Parents);\r
134         if (ParentCount <= 1)\r
135                 {\r
136                 if (opt_verbose)\r
137                         Log("%u candidate parents, done.\n", ParentCount);\r
138                 return false;\r
139                 }\r
140 \r
141         if (opt_fastalign)\r
142                 HF.SetA(QSD);\r
143         HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);\r
144 \r
145         unsigned ChunkLength;\r
146         vector<unsigned> ChunkLos;\r
147         GetChunkInfo(QSD.L, ChunkLength, ChunkLos);\r
148         const unsigned ChunkCount = SIZE(ChunkLos);\r
149 \r
150         vector<unsigned> ChunkIndexToBestId(ChunkCount, 0);\r
151         vector<unsigned> ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX);\r
152 \r
153         vector<SeqData> PSDs;\r
154         vector<string> Paths;\r
155         double TopPctId = 0.0;\r
156         unsigned TopParentIndex = UINT_MAX;\r
157         unsigned QL = QSD.L;\r
158         vector<unsigned> MaxIdVec(QL, 0);\r
159         for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
160                 {\r
161                 unsigned ParentSeqIndex = Parents[ParentIndex];\r
162 \r
163                 SeqData PSD;\r
164                 //PSD.Label = U.GetSeedLabel(ParentSeqIndex);\r
165                 //PSD.Seq = U.GetSeedSeq(ParentSeqIndex);\r
166                 //PSD.L = U.GetSeedLength(ParentSeqIndex);\r
167                 //PSD.Index = ParentSeqIndex;\r
168                 U.GetSeqData(ParentSeqIndex, PSD);\r
169                 PSDs.push_back(PSD);\r
170 \r
171                 if (opt_fastalign)\r
172                         HF.SetB(PSD);\r
173 \r
174                 PathData PD;\r
175 \r
176                 float HSPId;\r
177                 bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);\r
178                 if (!Found)\r
179                         {\r
180                         Paths.push_back("");                            \r
181                         continue;\r
182                         }\r
183 \r
184                 double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);\r
185                 if (opt_selfid && PctId == 100.0)\r
186                         {\r
187                         Paths.push_back("");                            \r
188                         continue;\r
189                         }\r
190 \r
191                 if (PctId > TopPctId)\r
192                         {\r
193                         TopParentIndex = ParentIndex;\r
194                         TopPctId = PctId;\r
195                         if (TopPctId >= 100.0 - opt_mindiv)\r
196                                 {\r
197                                 if (opt_verbose)\r
198                                         {\r
199                                         Log("  %.1f%%  >%s\n", TopPctId, PSD.Label);\r
200                                         Log("  Top hit exceeds ctl threshold, done.\n");\r
201                                         return false;\r
202                                         }\r
203                                 }\r
204                         }\r
205 \r
206                 string Path = PD.Start;\r
207                 Paths.push_back(Path);\r
208 \r
209                 vector<unsigned> IdVec;\r
210                 GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
211 \r
212                 for (unsigned QPos = 0; QPos < QL; ++QPos)\r
213                         if (IdVec[QPos] > MaxIdVec[QPos])\r
214                                 MaxIdVec[QPos] = IdVec[QPos];\r
215                 }\r
216 \r
217         vector<unsigned> BestParents;\r
218         for (unsigned k = 0; k < opt_maxp; ++k)\r
219                 {\r
220                 unsigned BestParent = UINT_MAX;\r
221                 unsigned BestCov = 0;\r
222                 for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)\r
223                         {\r
224                         const SeqData &PSD = PSDs[ParentIndex];\r
225                         const string &Path = Paths[ParentIndex];\r
226                         if (Path == "")\r
227                                 continue;\r
228 \r
229                         vector<unsigned> IdVec;\r
230                         GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
231 \r
232                         unsigned Cov = 0;\r
233                         for (unsigned QPos = 0; QPos < QL; ++QPos)\r
234                                 if (IdVec[QPos] == MaxIdVec[QPos])\r
235                                         ++Cov;\r
236 \r
237                         if (Cov > BestCov)\r
238                                 {\r
239                                 BestParent = ParentIndex;\r
240                                 BestCov = Cov;\r
241                                 }\r
242                         }\r
243 \r
244                 if (BestParent == UINT_MAX)\r
245                         break;\r
246 \r
247                 BestParents.push_back(BestParent);\r
248                 vector<unsigned> IdVec;\r
249 \r
250                 const SeqData &PSD = PSDs[BestParent];\r
251                 const string &Path = Paths[BestParent];\r
252                 GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);\r
253                 for (unsigned QPos = 0; QPos < QL; ++QPos)\r
254                         if (IdVec[QPos] == MaxIdVec[QPos])\r
255                                 MaxIdVec[QPos] = UINT_MAX;\r
256                 }\r
257 \r
258         unsigned BestParentCount = SIZE(BestParents);\r
259 \r
260         if (opt_verbose)\r
261                 {\r
262                 Log("%u/%u best parents\n", BestParentCount, ParentCount);\r
263                 for (unsigned k = 0; k < BestParentCount; ++k)\r
264                         {\r
265                         unsigned i = BestParents[k];\r
266                         Log(" %s\n", PSDs[i].Label);\r
267                         }\r
268                 }\r
269 \r
270         bool Found = false;\r
271         for (unsigned k1 = 0; k1 < BestParentCount; ++k1)\r
272                 {\r
273                 unsigned i1 = BestParents[k1];\r
274                 asserta(i1 < ParentCount);\r
275 \r
276                 const SeqData &PSD1 = PSDs[i1];\r
277                 const string &Path1 = Paths[i1];\r
278 \r
279                 for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)\r
280                         {\r
281                         unsigned i2 = BestParents[k2];\r
282                         asserta(i2 < ParentCount);\r
283                         asserta(i2 != i1);\r
284 \r
285                         const SeqData &PSD2 = PSDs[i2];\r
286                         const string &Path2 = Paths[i2];\r
287 \r
288                         ChimeHit2 Hit2;\r
289                         AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);\r
290                         Hit2.PctIdQT = TopPctId;\r
291 \r
292                         if (Hit2.Accept())\r
293                                 Found = true;\r
294 \r
295                         if (Hit2.Score > Hit.Score)\r
296                                 Hit = Hit2;\r
297 \r
298                         if (opt_verbose)\r
299                                 Hit2.LogMe();\r
300                         }\r
301                 }\r
302 \r
303         return Found;\r
304         }\r