]> git.donarmstrong.com Git - mothur.git/blob - fractid.cpp
ad4c1579c43c1b75d41583e700fdf4ac85cc49e9
[mothur.git] / fractid.cpp
1 //uchime by Robert C. Edgar http://drive5.com/uchime This code is donated to the public domain.\r
2 \r
3 #include "myutils.h"\r
4 #include "alpha.h"\r
5 \r
6 //unsigned g_MaxL = 0;\r
7 \r
8 static bool *g_IsChar = g_IsAminoChar;\r
9 \r
10 // Term gaps allowed in query (A) only\r
11 static double GetFractIdGivenPathDerep(const byte *A, const byte *B, const char *Path,\r
12   char *ptrDesc)\r
13         {\r
14         if (*Path == 'D')\r
15                 {\r
16                 if (ptrDesc != 0)\r
17                         sprintf(ptrDesc, "(term gap in Query)");\r
18                 return 0;\r
19                 }\r
20 \r
21         const char *LastM = 0;\r
22         for (const char *p = Path; *p; ++p)\r
23                 if (*p == 'M')\r
24                         LastM = p;\r
25 \r
26         unsigned PosA = 0;\r
27         unsigned PosB = 0;\r
28         unsigned Ids = 0;\r
29         unsigned Diffs = 0;\r
30         unsigned Cols = 0;\r
31         for (const char *p = Path; *p && p != LastM; ++p)\r
32                 {\r
33                 ++Cols;\r
34                 char c = *p;\r
35                 if (c == 'M')\r
36                         {\r
37                         byte a = toupper(A[PosA]);\r
38                         byte b = toupper(B[PosB]);\r
39                         if (g_IsChar[a] && g_IsChar[b])\r
40                                 {\r
41                                 if (a == b)\r
42                                         ++Ids;\r
43                                 else\r
44                                         ++Diffs;\r
45                                 }\r
46                         else\r
47                                 --Cols;\r
48                         }\r
49                 if (c == 'D' || c == 'I')\r
50                         ++Diffs;\r
51                 if (c == 'M' || c == 'D')\r
52                         ++PosA;\r
53                 if (c == 'M' || c == 'I')\r
54                         ++PosB;\r
55                 }\r
56 \r
57         double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols));\r
58         if (ptrDesc != 0)\r
59                 sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
60         return FractId;\r
61         }\r
62 \r
63 static double GetFractIdGivenPathAllDiffs(const byte *A, const byte *B, const char *Path,\r
64   char *ptrDesc)\r
65         {\r
66         unsigned PosA = 0;\r
67         unsigned PosB = 0;\r
68         unsigned Ids = 0;\r
69         unsigned Diffs = 0;\r
70         unsigned Cols = 0;\r
71         for (const char *p = Path; *p; ++p)\r
72                 {\r
73                 ++Cols;\r
74                 char c = *p;\r
75                 if (c == 'M')\r
76                         {\r
77                         byte a = toupper(A[PosA]);\r
78                         byte b = toupper(B[PosB]);\r
79                         if (g_IsChar[a] && g_IsChar[b])\r
80                                 {\r
81                                 if (a == b)\r
82                                         ++Ids;\r
83                                 else\r
84                                         ++Diffs;\r
85                                 }\r
86                         else\r
87                                 --Cols;\r
88                         }\r
89                 if (c == 'D' || c == 'I')\r
90                         ++Diffs;\r
91                 if (c == 'M' || c == 'D')\r
92                         ++PosA;\r
93                 if (c == 'M' || c == 'I')\r
94                         ++PosB;\r
95                 }\r
96 \r
97         double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols));\r
98         if (ptrDesc != 0)\r
99                 sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
100         return FractId;\r
101         }\r
102 \r
103 static double GetFractIdGivenPathInternalDiffs(const byte *A, const byte *B,\r
104   const char *Path, char *ptrDesc)\r
105         {\r
106         unsigned i = 0;\r
107         unsigned FirstM = UINT_MAX;\r
108         unsigned LastM = UINT_MAX;\r
109         for (const char *p = Path; *p; ++p)\r
110                 {\r
111                 if (*p == 'M')\r
112                         {\r
113                         if (FirstM == UINT_MAX)\r
114                                 FirstM = i;\r
115                         LastM = i;\r
116                         }\r
117                 ++i;\r
118                 }\r
119         if (FirstM == UINT_MAX)\r
120                 {\r
121                 if (ptrDesc != 0)\r
122                         strcpy(ptrDesc, "(no matches)");\r
123                 return 0.0;\r
124                 }\r
125 \r
126         unsigned PosA = 0;\r
127         unsigned PosB = 0;\r
128         unsigned Ids = 0;\r
129         unsigned Diffs = 0;\r
130         unsigned Cols = 0;\r
131         for (unsigned i = 0; i < FirstM; ++i)\r
132                 {\r
133                 char c = Path[i];\r
134                 if (c == 'M' || c == 'D')\r
135                         ++PosA;\r
136                 if (c == 'M' || c == 'I')\r
137                         ++PosB;\r
138                 }\r
139 \r
140         for (unsigned i = FirstM; i <= LastM; ++i)\r
141                 {\r
142                 ++Cols;\r
143                 char c = Path[i];\r
144                 if (c == 'M')\r
145                         {\r
146                         byte a = toupper(A[PosA]);\r
147                         byte b = toupper(B[PosB]);\r
148                         if (g_IsChar[a] && g_IsChar[b])\r
149                                 {\r
150                                 if (a == b)\r
151                                         ++Ids;\r
152                                 else\r
153                                         ++Diffs;\r
154                                 }\r
155                         else\r
156                                 --Cols;\r
157                         }\r
158                 if (c == 'D' || c == 'I')\r
159                         ++Diffs;\r
160                 if (c == 'M' || c == 'D')\r
161                         ++PosA;\r
162                 if (c == 'M' || c == 'I')\r
163                         ++PosB;\r
164                 }\r
165 \r
166         double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols));\r
167         if (ptrDesc != 0)\r
168                 sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
169         return FractId;\r
170         }\r
171 \r
172 static double GetFractIdGivenPathMBL(const byte *A, const byte *B, const char *Path,\r
173   char *ptrDesc)\r
174         {\r
175         unsigned PosA = 0;\r
176         unsigned PosB = 0;\r
177         unsigned Mismatches = 0;\r
178         unsigned Gaps = 0;\r
179         for (const char *p = Path; *p; ++p)\r
180                 {\r
181                 char c = *p;\r
182                 if (c == 'M' && toupper(A[PosA]) != toupper(B[PosB]))\r
183                         ++Mismatches;\r
184                 if (c == 'D' || c == 'I' && (p == Path || p[-1] == 'M'))\r
185                         ++Gaps;\r
186                 if (c == 'M' || c == 'D')\r
187                         ++PosA;\r
188                 if (c == 'M' || c == 'I')\r
189                         ++PosB;\r
190                 }\r
191         unsigned Diffs = Gaps + Mismatches;\r
192         double FractDiffs = (PosB == 0 ? 0.0 : double(Diffs)/double(PosB));\r
193         if (ptrDesc != 0)\r
194                 sprintf(ptrDesc, "Gap opens %u, Id=1 - [(diffs=%u)/(target_length=%u)]",\r
195                   Gaps, Diffs, PosB);\r
196         double FractId = 1.0 - FractDiffs;\r
197         if (FractId < 0.0)\r
198                 return 0.0;\r
199         return FractId;\r
200         }\r
201 \r
202 static double GetFractIdGivenPathBLAST(const byte *A, const byte *B, const char *Path,\r
203   char *ptrDesc)\r
204         {\r
205         unsigned PosA = 0;\r
206         unsigned PosB = 0;\r
207         unsigned Ids = 0;\r
208         unsigned Wilds = 0;\r
209         unsigned Cols = 0;\r
210         for (const char *p = Path; *p; ++p)\r
211                 {\r
212                 ++Cols;\r
213                 char c = *p;\r
214                 if (c == 'M')\r
215                         {\r
216                         byte a = toupper(A[PosA]);\r
217                         byte b = toupper(B[PosB]);\r
218                         if (g_IsChar[a] && g_IsChar[b])\r
219                                 {\r
220                                 if (a == b)\r
221                                         ++Ids;\r
222                                 }\r
223                         else\r
224                                 ++Wilds;\r
225                         }\r
226                 if (c == 'M' || c == 'D')\r
227                         ++PosA;\r
228                 if (c == 'M' || c == 'I')\r
229                         ++PosB;\r
230                 }\r
231         asserta(Cols >= Wilds);\r
232         Cols -= Wilds;\r
233         double FractId = Cols == 0 ? 0.0f : float(Ids)/float(Cols);\r
234         if (ptrDesc != 0)\r
235                 sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols);\r
236         return FractId;\r
237         }\r
238 \r
239 static double GetFractIdGivenPathDefault(const byte *A, const byte *B, const char *Path,\r
240   char *ptrDesc)\r
241         {\r
242         unsigned PosA = 0;\r
243         unsigned PosB = 0;\r
244         unsigned Ids = 0;\r
245         unsigned Wilds = 0;\r
246         for (const char *p = Path; *p; ++p)\r
247                 {\r
248                 char c = *p;\r
249                 if (c == 'M')\r
250                         {\r
251                         byte a = toupper(A[PosA]);\r
252                         byte b = toupper(B[PosB]);\r
253                         if (g_IsChar[a] && g_IsChar[b])\r
254                                 {\r
255                                 if (a == b)\r
256                                         ++Ids;\r
257                                 }\r
258                         else\r
259                                 ++Wilds;\r
260                         }\r
261                 if (c == 'M' || c == 'D')\r
262                         ++PosA;\r
263                 if (c == 'M' || c == 'I')\r
264                         ++PosB;\r
265                 }\r
266         unsigned MinLen = min(PosA, PosB) - Wilds;\r
267         double FractId = (MinLen == 0 ? 0.0 : double(Ids)/double(MinLen));\r
268         if (ptrDesc != 0)\r
269                 sprintf(ptrDesc, "(ids=%u/shorter_length=%u)", Ids, MinLen);\r
270         return FractId;\r
271         }\r
272 \r
273 double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path,\r
274   bool Nucleo, char *ptrDesc, unsigned IdDef)\r
275         {\r
276         if (Nucleo)\r
277                 g_IsChar = g_IsACGTU;\r
278         else\r
279                 g_IsChar = g_IsAminoChar;\r
280 \r
281         if (Path == 0)\r
282                 {\r
283                 if (ptrDesc != 0)\r
284                         strcpy(ptrDesc, "(NULL path)");\r
285                 return 0.0;\r
286                 }\r
287 \r
288         unsigned ColCount = (unsigned) strlen(Path);\r
289         if (ColCount == 0)\r
290                 return 0.0;\r
291 \r
292         if (opt_leftjust)\r
293                 {\r
294                 if (Path[0] != 'M' || Path[ColCount-1] == 'D')\r
295                         {\r
296                         if (ptrDesc != 0)\r
297                                 strcpy(ptrDesc, "(leftjust)");\r
298                         return 0.0;\r
299                         }\r
300                 }\r
301 \r
302         if (opt_rightjust)\r
303                 {\r
304                 if (Path[0] == 'D' || Path[ColCount-1] != 'M')\r
305                         {\r
306                         if (ptrDesc != 0)\r
307                                 strcpy(ptrDesc, "(rightjust)");\r
308                         return 0.0;\r
309                         }\r
310                 }\r
311 \r
312         double FractId = 0.0;\r
313         //if (opt_idprefix > 0)\r
314         //      {\r
315         //      for (unsigned i = 0; i < opt_idprefix; ++i)\r
316         //              {\r
317         //              char c = Path[i];\r
318         //              if (c != 'M' || toupper(A[i]) != toupper(B[i]))\r
319         //                      {\r
320         //                      if (ptrDesc != 0)\r
321         //                              sprintf(ptrDesc, "Prefix ids %u < idprefix(%u)",\r
322         //                                i, opt_idprefix);\r
323         //                      return 0.0;\r
324         //                      }\r
325         //              }\r
326         //      }\r
327 \r
328         //if (opt_idsuffix > 0)\r
329         //      {\r
330         //      unsigned Cols = strlen(Path);\r
331         //      for (unsigned i = 0; i < opt_idsuffix && i > Cols; ++i)\r
332         //              {\r
333         //              unsigned k = Cols - 1 - i;\r
334         //              char c = Path[k];\r
335         //              if (c != 'M' || toupper(A[k]) != toupper(B[k]))\r
336         //                      {\r
337         //                      if (ptrDesc != 0)\r
338         //                              sprintf(ptrDesc, "Suffix ids %u < idsuffix(%u)",\r
339         //                                i, opt_idsuffix);\r
340         //                      return 0.0;\r
341         //                      }\r
342         //              }\r
343         //      }\r
344 \r
345         if (opt_maxqgap > 0 || opt_maxtgap > 0)\r
346                 {\r
347                 unsigned L = 0;\r
348                 const char *LastM = 0;\r
349                 for (const char *p = Path; *p; ++p)\r
350                         if (*p == 'M')\r
351                                 LastM = p;\r
352 \r
353 //              g_MaxL = 0;\r
354                 for (const char *p = Path; *p && p != LastM; ++p)\r
355                         {\r
356                         char c = *p;\r
357                         switch (c)\r
358                                 {\r
359                         case 'M':\r
360                                 if (L > 0)\r
361                                         {\r
362                                         if (p[-1] == 'D')\r
363                                                 {\r
364                                                 if (L > opt_maxtgap)\r
365                                                         {\r
366                                                         if (ptrDesc != 0)\r
367                                                                 sprintf(ptrDesc, "(maxtgap)");\r
368                                                         return 0.0;\r
369                                                         }\r
370                                                 }\r
371                                         else if (p[-1] == 'I')\r
372                                                 {\r
373                                                 if (L > opt_maxqgap)\r
374                                                         {\r
375                                                         if (ptrDesc != 0)\r
376                                                                 sprintf(ptrDesc, "(maxqgap)");\r
377                                                         return 0.0;\r
378                                                         }\r
379                                                 }\r
380                                         else\r
381                                                 asserta(false);\r
382                                         }\r
383                                 L = 0;\r
384                                 break;\r
385 \r
386                         case 'D':\r
387                         case 'I':\r
388                                 ++L;\r
389                                 //if (L > g_MaxL)\r
390                                 //      g_MaxL = L;\r
391                                 break;\r
392 \r
393                         default:\r
394                                 asserta(false);\r
395                                 }\r
396                         }\r
397                 }\r
398 \r
399         switch (IdDef)\r
400                 {\r
401         case 0:\r
402                 FractId = GetFractIdGivenPathDefault(A, B, Path, ptrDesc);\r
403                 break;\r
404 \r
405         case 1:\r
406                 FractId = GetFractIdGivenPathAllDiffs(A, B, Path, ptrDesc);\r
407                 break;\r
408 \r
409         case 2:\r
410                 FractId = GetFractIdGivenPathInternalDiffs(A, B, Path, ptrDesc);\r
411                 break;\r
412 \r
413         case 3:\r
414                 FractId = GetFractIdGivenPathMBL(A, B, Path, ptrDesc);\r
415                 break;\r
416 \r
417         case 4:\r
418                 FractId = GetFractIdGivenPathBLAST(A, B, Path, ptrDesc);\r
419                 break;\r
420 \r
421         case 5:\r
422                 FractId = GetFractIdGivenPathDerep(A, B, Path, ptrDesc);\r
423                 break;\r
424 \r
425         default:\r
426                 Die("--iddef %u invalid", opt_iddef);\r
427                 }\r
428 \r
429         return FractId;\r
430         }\r
431 \r
432 double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path,\r
433   bool Nucleo, char *ptrDesc)\r
434         {\r
435         return GetFractIdGivenPath(A, B, Path, Nucleo, ptrDesc, opt_iddef);\r
436         }\r
437 \r
438 double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo)\r
439         {\r
440         return GetFractIdGivenPath(A, B, Path, Nucleo, (char *) 0);\r
441         }\r
442 \r
443 double GetFractIdGivenPath(const byte *A, const byte *B, const string &Path)\r
444         {\r
445         return GetFractIdGivenPath(A, B, Path.c_str(), true);\r
446         }\r
447 \r
448 double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path)\r
449         {\r
450         return GetFractIdGivenPath(A, B, Path, true);\r
451         }\r