1 //uchime by Robert C. Edgar http://drive5.com/uchime This code is donated to the public domain.
\r
10 void SeqToFasta(FILE *f, const char *Label, const byte *Seq, unsigned L)
\r
12 const unsigned ROWLEN = 80;
\r
14 fprintf(f, ">%s\n", Label);
\r
15 unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;
\r
16 for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)
\r
18 unsigned From = BlockIndex*ROWLEN;
\r
19 unsigned To = From + ROWLEN;
\r
22 for (unsigned Pos = From; Pos < To; ++Pos)
\r
38 void SeqDB::Clear(bool ctor)
\r
42 for (unsigned i = 0; i < m_SeqCount; ++i)
\r
44 unsigned n = strlen(m_Labels[i]);
\r
45 MYFREE(m_Labels[i], n, SeqDB);
\r
46 MYFREE(m_Seqs[i], m_SeqLengths[i], SeqDB);
\r
48 MYFREE(m_Labels, m_Size, SeqDB);
\r
49 MYFREE(m_Seqs, m_Size, SeqDB);
\r
50 MYFREE(m_SeqLengths, m_Size, SeqDB);
\r
63 m_IsNucleoSet = false;
\r
66 void SeqDB::InitEmpty(bool Nucleo)
\r
69 m_IsNucleo = Nucleo;
\r
70 m_IsNucleoSet = true;
\r
73 void SeqDB::FromFasta(const string &FileName, bool AllowGaps)
\r
76 m_FileName = FileName;
\r
80 SF.m_AllowGaps = AllowGaps;
\r
82 ProgressStep(0, 1000, "Reading %s", FileName.c_str());
\r
85 unsigned QueryPctDoneX10 = SF.GetPctDoneX10();
\r
86 ProgressStep(QueryPctDoneX10, 1000, "Reading %s", FileName.c_str());
\r
87 const byte *Seq = SF.GetNextSeq();
\r
91 const char *Label = SF.GetLabel();
\r
92 unsigned L = SF.GetSeqLength();
\r
93 AddSeq(Label, Seq, L);
\r
95 ProgressStep(999, 1000, "Reading %s", FileName.c_str());
\r
99 Progress("%s sequences\n", IntToStr(GetSeqCount()));
\r
102 void SeqDB::ToFasta(const string &FileName) const
\r
104 FILE *f = CreateStdioFile(FileName);
\r
105 for (unsigned SeqIndex = 0; SeqIndex < GetSeqCount(); ++SeqIndex)
\r
106 ToFasta(f, SeqIndex);
\r
110 void SeqDB::SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel) const
\r
113 fprintf(f, ">%s\n", GetLabel(SeqIndex));
\r
115 const unsigned ROWLEN = 80;
\r
117 unsigned L = GetSeqLength(SeqIndex);
\r
118 const byte *Seq = GetSeq(SeqIndex);
\r
119 unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;
\r
120 for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)
\r
122 unsigned From = BlockIndex*ROWLEN;
\r
123 unsigned To = From + ROWLEN;
\r
126 for (unsigned Pos = From; Pos < To; ++Pos)
\r
127 fputc(Seq[Pos], f);
\r
132 void SeqDB::ToFasta(FILE *f, unsigned SeqIndex) const
\r
134 asserta(SeqIndex < m_SeqCount);
\r
135 fprintf(f, ">%s\n", GetLabel(SeqIndex));
\r
136 SeqToFasta(f, SeqIndex);
\r
139 unsigned SeqDB::GetMaxLabelLength() const
\r
141 const unsigned SeqCount = GetSeqCount();
\r
143 for (unsigned Index = 0; Index < SeqCount; ++Index)
\r
145 unsigned L = (unsigned) strlen(m_Labels[Index]);
\r
152 unsigned SeqDB::GetMaxSeqLength() const
\r
154 const unsigned SeqCount = GetSeqCount();
\r
156 for (unsigned Index = 0; Index < SeqCount; ++Index)
\r
158 unsigned L = m_SeqLengths[Index];
\r
165 void SeqDB::LogMe() const
\r
168 const unsigned SeqCount = GetSeqCount();
\r
169 Log("SeqDB %u seqs, aligned=%c\n", SeqCount, tof(m_Aligned));
\r
173 Log("Index Label Length Seq\n");
\r
174 Log("----- ---------------- ------ ---\n");
\r
175 for (unsigned Index = 0; Index < SeqCount; ++Index)
\r
178 Log(" %16.16s", m_Labels[Index]);
\r
179 unsigned L = m_SeqLengths[Index];
\r
181 Log(" %*.*s", L, L, m_Seqs[Index]);
\r
186 void SeqDB::GetSeqData(unsigned Id, SeqData &Buffer) const
\r
188 asserta(Id < m_SeqCount);
\r
189 Buffer.Seq = m_Seqs[Id];
\r
190 Buffer.Label = m_Labels[Id];
\r
191 Buffer.L = m_SeqLengths[Id];
\r
193 Buffer.ORFParent = 0;
\r
194 Buffer.RevComp = false;
\r
195 Buffer.Nucleo = IsNucleo();
\r
198 void SeqDB::SetIsNucleo()
\r
200 const unsigned SeqCount = GetSeqCount();
\r
202 for (unsigned i = 0; i < 100; ++i)
\r
204 unsigned SeqIndex = unsigned(rand()%SeqCount);
\r
205 const byte *Seq = GetSeq(SeqIndex);
\r
206 unsigned L = GetSeqLength(SeqIndex);
\r
207 const unsigned Pos = unsigned(rand()%L);
\r
210 if (g_IsNucleoChar[c])
\r
213 m_IsNucleo = (N > 80);
\r
214 m_IsNucleoSet = true;
\r
217 unsigned SeqDB::GetTotalLength() const
\r
219 const unsigned SeqCount = GetSeqCount();
\r
220 unsigned TotalLength = 0;
\r
221 for (unsigned Id = 0; Id < SeqCount; ++Id)
\r
222 TotalLength += GetSeqLength(Id);
\r
223 return TotalLength;
\r
226 unsigned SeqDB::AddSeq(const char *Label, const byte *Seq, unsigned L)
\r
228 StartTimer(AddSeq);
\r
229 if (m_SeqCount >= m_Size)
\r
231 unsigned NewSize = unsigned(m_Size*1.5) + 1024;
\r
232 char **NewLabels = MYALLOC(char *, NewSize, SeqDB);
\r
233 byte **NewSeqs = MYALLOC(byte *, NewSize, SeqDB);
\r
234 unsigned *NewSeqLengths = MYALLOC(unsigned, NewSize, SeqDB);
\r
236 for (unsigned i = 0; i < m_SeqCount; ++i)
\r
238 NewLabels[i] = m_Labels[i];
\r
239 NewSeqs[i] = m_Seqs[i];
\r
240 NewSeqLengths[i] = m_SeqLengths[i];
\r
243 MYFREE(m_Labels, m_SeqCount, SeqDB);
\r
244 MYFREE(m_Seqs, m_SeqCount, SeqDB);
\r
245 MYFREE(m_SeqLengths, m_SeqCount, SeqDB);
\r
247 m_Labels = NewLabels;
\r
249 m_SeqLengths = NewSeqLengths;
\r
253 unsigned Index = m_SeqCount++;
\r
254 m_Seqs[Index] = MYALLOC(byte, L, SeqDB);
\r
255 memcpy(m_Seqs[Index], Seq, L);
\r
257 unsigned n = strlen(Label) + 1;
\r
258 m_Labels[Index] = MYALLOC(char, n, SeqDB);
\r
259 memcpy(m_Labels[Index], Label, n);
\r
264 m_Aligned = (m_Aligned && L == m_SeqLengths[0]);
\r
266 m_SeqLengths[Index] = L;
\r
272 unsigned SeqDB::GetIndex(const char *Label) const
\r
274 for (unsigned i = 0; i < m_SeqCount; ++i)
\r
275 if (strcmp(Label, m_Labels[i]) == 0)
\r
277 Die("SeqDB::GetIndex(%s), not found", Label);
\r
281 void SeqDB::MakeLabelToIndex(map<string, unsigned> &LabelToIndex)
\r
283 LabelToIndex.clear();
\r
284 for (unsigned i = 0; i < m_SeqCount; ++i)
\r
286 const string &Label = string(GetLabel(i));
\r
287 if (LabelToIndex.find(Label) != LabelToIndex.end())
\r
288 Die("Duplicate label: %s", Label.c_str());
\r
289 LabelToIndex[Label] = i;
\r