]> git.donarmstrong.com Git - mothur.git/blob - uchime_main.cpp
18bd8bb4fbb8cd7cde71c4044901ff4054e55f94
[mothur.git] / uchime_main.cpp
1 //uchime by Robert C. Edgar http://drive5.com/uchime This code is donated to the public domain.\r
2 \r
3 #include "myutils.h"\r
4 #include "chime.h"\r
5 #include "seqdb.h"\r
6 #include "dp.h"\r
7 #include "ultra.h"\r
8 #include "hspfinder.h"\r
9 #include <algorithm>\r
10 #include <set>\r
11 #include "mothurout.h"\r
12 \r
13 bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, \r
14   const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,\r
15   float MinFractId, ChimeHit2 &Hit);\r
16 \r
17 FILE *g_fUChime;\r
18 FILE *g_fUChimeAlns;\r
19 const vector<float> *g_SortVecFloat;\r
20 bool g_UchimeDeNovo = false;\r
21 \r
22 void Usage()\r
23         {\r
24         //printf("\n");\r
25         //printf("UCHIME %s by Robert C. Edgar\n", MY_VERSION);\r
26         //printf("http://www.drive5.com/uchime\n");\r
27         //printf("\n");\r
28         //printf("This software is donated to the public domain\n");\r
29         //printf("\n");\r
30 \r
31         //printf(\r
32 //#include "help.h"\r
33                 //);\r
34         }\r
35 \r
36 void SetBLOSUM62()\r
37         {\r
38         Die("SetBLOSUM62 not implemented");\r
39         }\r
40 \r
41 void ReadSubstMx(const string &/*FileName*/, Mx<float> &/*Mxf*/)\r
42         {\r
43         Die("ReadSubstMx not implemented");\r
44         }\r
45 \r
46 void LogAllocs()\r
47         {\r
48         /*empty*/\r
49         }\r
50 \r
51 static bool CmpDescVecFloat(unsigned i, unsigned j)\r
52         {\r
53         return (*g_SortVecFloat)[i] > (*g_SortVecFloat)[j];\r
54         }\r
55 \r
56 void Range(vector<unsigned> &v, unsigned N)\r
57         {\r
58         v.clear();\r
59         v.reserve(N);\r
60         for (unsigned i = 0; i < N; ++i)\r
61                 v.push_back(i);\r
62         }\r
63 \r
64 void SortDescending(const vector<float> &Values, vector<unsigned> &Order)\r
65         {\r
66         StartTimer(Sort);\r
67         const unsigned N = SIZE(Values);\r
68         Range(Order, N);\r
69         g_SortVecFloat = &Values;\r
70         sort(Order.begin(), Order.end(), CmpDescVecFloat);\r
71         EndTimer(Sort);\r
72         }\r
73 \r
74 float GetAbFromLabel(const string &Label)\r
75         {\r
76         vector<string> Fields;\r
77         Split(Label, Fields, '/');\r
78         const unsigned N = SIZE(Fields);\r
79         for (unsigned i = 0; i < N; ++i)\r
80                 {\r
81                 const string &Field = Fields[i];\r
82                 if (Field.substr(0, 3) == "ab=")\r
83                         {\r
84                         string a = Field.substr(3, string::npos);\r
85                         return (float) atof(a.c_str());\r
86                         }\r
87                 }\r
88         if (g_UchimeDeNovo)\r
89                 Die("Missing abundance /ab=xx/ in label >%s", Label.c_str());\r
90         return 0.0;\r
91         }\r
92 \r
93 int uchime_main(int argc, char *argv[])\r
94         {\r
95         MothurOut* m;\r
96         m = MothurOut::getInstance();\r
97                 \r
98         MyCmdLine(argc, argv);\r
99 \r
100         if (argc < 2)\r
101                 {\r
102                 Usage();\r
103                 return 0;\r
104                 }\r
105 \r
106         if (opt_version)\r
107                 {\r
108                 printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION);\r
109                 return 0;\r
110                 }\r
111 \r
112                 \r
113         if (!optset_w)\r
114                 opt_w = 8;\r
115         \r
116         float MinFractId = 0.95f;\r
117         if (optset_id)\r
118                 MinFractId = (float) opt_id;\r
119 \r
120         Log("%8.2f  minh\n", opt_minh);\r
121         Log("%8.2f  xn\n", opt_xn);\r
122         Log("%8.2f  dn\n", opt_dn);\r
123         Log("%8.2f  xa\n", opt_xa);\r
124         Log("%8.2f  mindiv\n", opt_mindiv);\r
125         Log("%8u  maxp\n", opt_maxp);\r
126 \r
127         if (opt_input == "" && opt_uchime != "")\r
128                 opt_input = opt_uchime;\r
129 \r
130         if (opt_input == "")\r
131                 Die("Missing --input");\r
132 \r
133         g_UchimeDeNovo = (opt_db == "");\r
134 \r
135         if (opt_uchimeout != "")\r
136                 g_fUChime = CreateStdioFile(opt_uchimeout);\r
137 \r
138         if (opt_uchimealns != "")\r
139                 g_fUChimeAlns = CreateStdioFile(opt_uchimealns);\r
140 \r
141         SeqDB Input;\r
142         SeqDB DB;\r
143 \r
144         Input.FromFasta(opt_input);\r
145         if (!Input.IsNucleo())\r
146                 Die("Input contains amino acid sequences");\r
147 \r
148         const unsigned QuerySeqCount = Input.GetSeqCount();\r
149         vector<unsigned> Order;\r
150         for (unsigned i = 0; i < QuerySeqCount; ++i)\r
151                 Order.push_back(i);\r
152 \r
153         if (g_UchimeDeNovo)\r
154                 {\r
155                 vector<float> Abs;\r
156                 for (unsigned i = 0; i < QuerySeqCount; ++i)\r
157                         {\r
158                         const char *Label = Input.GetLabel(i);\r
159                         float Ab = GetAbFromLabel(Label);\r
160                         Abs.push_back(Ab);\r
161                         }\r
162                 SortDescending(Abs, Order);\r
163                 DB.m_IsNucleoSet = true;\r
164                 DB.m_IsNucleo = true;\r
165                 }\r
166         else\r
167                 {\r
168                 DB.FromFasta(opt_db);\r
169                 if (!DB.IsNucleo())\r
170                         Die("Database contains amino acid sequences");\r
171                 }\r
172 \r
173         vector<ChimeHit2> Hits;\r
174         unsigned HitCount = 0;\r
175         for (unsigned i = 0; i < QuerySeqCount; ++i)\r
176                 {\r
177                         \r
178                 if (m->control_pressed) { break; }\r
179                         \r
180                 unsigned QuerySeqIndex = Order[i];\r
181 \r
182                 SeqData QSD;\r
183                 Input.GetSeqData(QuerySeqIndex, QSD);\r
184 \r
185                 float QAb = -1.0;\r
186                 if (g_UchimeDeNovo)\r
187                         QAb = GetAbFromLabel(QSD.Label);\r
188 \r
189                 ChimeHit2 Hit;\r
190                 AlnParams &AP = *(AlnParams *) 0;\r
191                 AlnHeuristics &AH = *(AlnHeuristics *) 0;\r
192                 HSPFinder &HF = *(HSPFinder *) 0;\r
193                 bool Found = SearchChime(DB, QSD, QAb, AP, AH, HF, MinFractId, Hit);\r
194                 if (Found)\r
195                         ++HitCount;\r
196                 else\r
197                         {\r
198                         if (g_UchimeDeNovo)\r
199                                 DB.AddSeq(QSD.Label, QSD.Seq, QSD.L);\r
200                         }\r
201 \r
202                 WriteChimeHit(g_fUChime, Hit);\r
203 \r
204                 //ProgressStep(i, QuerySeqCount, "%u/%u chimeras found (%.1f%%)", HitCount, i, Pct(HitCount, i+1));\r
205                         //report progress\r
206                         if((i+1) % 100 == 0){   m->mothurOut("Processing sequence: " + toString(i+1) + ", " + toString(HitCount) + " chimeras found."); m->mothurOutEndLine();          }\r
207                 }\r
208                 if (!m->control_pressed) { \r
209                         //report progress\r
210                         if((QuerySeqCount) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(QuerySeqCount) + ", " + toString(HitCount) + " chimeras found."); m->mothurOutEndLine();                }\r
211                 }\r
212 \r
213         Log("\n");\r
214         Log("%s: %u/%u chimeras found (%.1f%%)\n",\r
215           opt_input.c_str(), HitCount, QuerySeqCount, Pct(HitCount, QuerySeqCount));\r
216 \r
217         CloseStdioFile(g_fUChime);\r
218         CloseStdioFile(g_fUChimeAlns);\r
219 \r
220         ProgressExit();\r
221         return 0;\r
222         }\r