1 //uchime by Robert C. Edgar http://drive5.com/uchime This code is donated to the public domain.
\r
8 static inline bool isgap(byte c)
\r
10 return c == '-' || c == '.';
\r
13 const unsigned BufferSize = 16*1024*1024;
\r
15 static unsigned GetMaxPoly(const byte *Seq, unsigned L)
\r
17 byte CurrChar = Seq[0];
\r
19 unsigned MaxLen = 1;
\r
20 for (unsigned i = 1; i < L; ++i)
\r
23 if (c != CurrChar || i+1 == L)
\r
25 unsigned Len = i - Start;
\r
47 m_TooShortCount = 0;
\r
49 m_ShortestLength = 0;
\r
50 m_LongestLength = 0;
\r
52 m_IsNucleoSet = false;
\r
60 void SFasta::Clear()
\r
62 MYFREE(m_Buffer, m_BufferSize, SFasta);
\r
64 CloseStdioFile(m_File);
\r
76 m_SeqIndex = UINT_MAX;
\r
77 m_AllowGaps = false;
\r
79 m_IsNucleoSet = false;
\r
80 m_TooShortCount = 0;
\r
82 m_ShortestLength = 0;
\r
83 m_LongestLength = 0;
\r
87 void SFasta::LogMe() const
\r
90 Log("SFasta::LogMe()\n");
\r
91 Log("FileName=%s\n", m_FileName.c_str());
\r
92 Log("FileSize=%u\n", (unsigned) m_FileSize);
\r
93 Log("FilePos=%u\n", (unsigned) m_FilePos);
\r
94 Log("BufferSize=%u\n", m_BufferSize);
\r
95 Log("BufferPos=%u\n", m_BufferOffset);
\r
96 Log("BufferBytes=%u\n", m_BufferBytes);
\r
98 Log("Label=NULL\n");
\r
100 Log("Label=%s\n", m_Label);
\r
101 Log("SeqLength=%u\n", m_SeqLength);
\r
104 const byte *SFasta::GetNextSeq()
\r
108 const byte *Seq = GetNextSeqLo();
\r
111 if (m_TooShortCount > 0)
\r
112 Warning("%u short sequences (--minlen %u, shortest %u) discarded from %s",
\r
113 m_TooShortCount, opt_minlen, m_ShortestLength, m_FileName.c_str());
\r
114 if (m_TooLongCount > 0)
\r
115 Warning("%u long sequences (--maxlen %u, longest %u) discarded from %s",
\r
116 m_TooLongCount, opt_maxlen, m_LongestLength, m_FileName.c_str());
\r
117 if (m_TooPolyCount > 0)
\r
118 Warning("%u sequences with long homopolymers discarded (--maxpoly %u)",
\r
119 m_TooPolyCount, opt_maxpoly);
\r
122 if (m_SeqLength < opt_minlen)
\r
125 if (m_ShortestLength == 0 || m_SeqLength < m_ShortestLength)
\r
126 m_ShortestLength = m_SeqLength;
\r
129 if (m_SeqLength > opt_maxlen && opt_maxlen != 0)
\r
131 if (m_LongestLength == 0 || m_SeqLength > m_LongestLength)
\r
132 m_LongestLength = m_SeqLength;
\r
140 const byte *SFasta::GetNextSeqLo()
\r
143 if (m_BufferOffset == m_BufferBytes)
\r
146 if (m_FilePos == m_FileSize)
\r
151 StartTimer(SF_GetNextSeq);
\r
152 asserta(m_Buffer[m_BufferOffset] == '>');
\r
153 m_Label = (char *) (m_Buffer + m_BufferOffset + 1);
\r
155 //// Scan to end-of-line.
\r
156 //// Use dubious library function strchr() in the hope
\r
157 //// that it uses fast machine code.
\r
158 // byte *ptr = (byte *) strchr(m_Label, '\n');
\r
159 // asserta(ptr != 0);
\r
163 for (unsigned i = m_BufferOffset; i < m_BufferSize; ++i)
\r
165 char c = m_Buffer[i];
\r
166 if (c == '\n' || c == '\r')
\r
168 ptr = m_Buffer + i;
\r
174 if (opt_trunclabels)
\r
176 for (char *p = m_Label; *p; ++p)
\r
185 for (char *p = m_Label; *p; ++p)
\r
189 else if (*p == '\r' || *p == '\n')
\r
192 char NextChar = *(p+1);
\r
193 if (NextChar == '\r' || NextChar == '\n')
\r
200 // ptr points to end-of-line.
\r
201 // Move to start of sequence data.
\r
204 // Delete white space in-place
\r
206 m_BufferOffset = (unsigned) (ptr - m_Buffer);
\r
207 while (m_BufferOffset < m_BufferBytes)
\r
209 byte c = m_Buffer[m_BufferOffset];
\r
213 if (m_BufferOffset > 0)
\r
214 prevc = m_Buffer[m_BufferOffset-1];
\r
215 if (prevc == '\n' || prevc == '\r')
\r
219 if (isalpha(c) || (isgap(c) && m_AllowGaps))
\r
221 else if (c == '\n' || c == '\r')
\r
225 const char *Label = (m_Label == 0 ? "" : m_Label);
\r
226 static bool WarningDone = false;
\r
230 //Warning("Ignoring gaps in FASTA file '%s'",
\r
231 //m_FileName.c_str());
\r
233 else if (isprint(c))
\r
234 Warning("Invalid FASTA file '%s', non-letter '%c' in sequence >%s",
\r
235 m_FileName.c_str(), c, Label);
\r
237 Warning("Invalid FASTA file '%s', non-printing byte (hex %02x) in sequence >%s",
\r
238 m_FileName.c_str(), c, Label);
\r
239 WarningDone = true;
\r
244 m_SeqLength = unsigned(To - Seq);
\r
246 if (m_SeqIndex == UINT_MAX)
\r
251 EndTimer(SF_GetNextSeq);
\r
255 void SFasta::Open(const string &FileName)
\r
258 m_FileName = FileName;
\r
259 m_File = OpenStdioFile(FileName);
\r
260 m_BufferSize = BufferSize;
\r
261 //m_Buffer = myalloc<byte>(m_BufferSize);
\r
262 m_Buffer = MYALLOC(byte, m_BufferSize, SFasta);
\r
263 m_FileSize = GetStdioFileSize(m_File);
\r
266 void SFasta::Rewind()
\r
268 m_BufferOffset = 0;
\r
273 bool SFasta::SetIsNucleo()
\r
275 if (m_FilePos != 0)
\r
276 Die("SFasta::IsNucleo, not at BOF");
\r
278 unsigned LetterCount = 0;
\r
279 unsigned NucleoLetterCount = 0;
\r
282 const byte *Seq = GetNextSeq();
\r
285 unsigned L = GetSeqLength();
\r
286 for (unsigned i = 0; i < L; ++i)
\r
287 if (g_IsNucleoChar[Seq[i]])
\r
288 ++NucleoLetterCount;
\r
290 if (LetterCount > 256)
\r
294 if (LetterCount == 0)
\r
296 m_IsNucleoSet = true;
\r
301 // Nucleo if more than 90% nucleo letters AGCTUN
\r
302 m_IsNucleo = double(NucleoLetterCount)/LetterCount > 0.9;
\r
303 m_IsNucleoSet = true;
\r
307 void SFasta::FillCache()
\r
309 StartTimer(SF_FillCache);
\r
310 asserta(m_FilePos < m_FileSize);
\r
312 // off_t may be larger type than unsigned, e.g. 64- vs. 32-bit.
\r
313 off_t otBytesToRead = m_FileSize - m_FilePos;
\r
315 bool FinalBuffer = true;
\r
316 if (otBytesToRead > (off_t) m_BufferSize)
\r
318 FinalBuffer = false;
\r
319 otBytesToRead = m_BufferSize;
\r
322 unsigned BytesToRead = unsigned(otBytesToRead);
\r
323 asserta(BytesToRead > 0);
\r
324 asserta(BytesToRead <= m_BufferSize);
\r
326 SetStdioFilePos(m_File, m_FilePos);
\r
327 ReadStdioFile(m_File, m_Buffer, BytesToRead);
\r
328 if (m_Buffer[0] != '>')
\r
330 if (m_FilePos == 0)
\r
331 Die("Input is not FASTA file");
\r
333 Die("SFasta::FillCache() failed, expected '>'");
\r
336 m_BufferOffset = 0;
\r
338 // If last buffer in file, done
\r
341 m_BufferBytes = BytesToRead;
\r
342 m_FilePos += BytesToRead;
\r
343 EndTimer(SF_FillCache);
\r
347 // If not last buffer, truncate any partial sequence
\r
348 // at end of buffer. Search backwards to find last '>'.
\r
349 byte *ptr = m_Buffer + BytesToRead - 1;
\r
350 while (ptr > m_Buffer)
\r
352 if (ptr[0] == '>' && (ptr[-1] == '\n' || ptr[-1] == '\r'))
\r
357 if (ptr == m_Buffer)
\r
363 // This might techincally be legal FASTA if the entire
\r
364 // buffer is white space, but strange if not the last buffer
\r
365 // in the file, so quit anyway.
\r
366 Die("Failed to find '>' (pos=%u, bytes=%u)",
\r
367 (unsigned) m_FilePos, BytesToRead);
\r
371 // Entire buffer is one sequence which may be truncated.
\r
372 Die("Sequence too long (pos=%u, bytes=%u)",
\r
373 (unsigned) m_FilePos, BytesToRead);
\r
377 asserta(*ptr == '>');
\r
379 m_BufferBytes = unsigned(ptr - m_Buffer);
\r
380 m_FilePos += m_BufferBytes;
\r
382 EndTimer(SF_FillCache);
\r
385 unsigned SFasta::GetPctDoneX10() const
\r
387 if (m_FilePos == 0 || m_FileSize == 0)
\r
390 assert(m_FilePos >= (off_t) m_BufferBytes);
\r
391 off_t BufferStart = m_FilePos - m_BufferBytes;
\r
392 off_t BufferPos = BufferStart + m_BufferOffset;
\r
394 unsigned iPctX10 = unsigned(10.0*double(BufferPos)*100.0/double(m_FileSize));
\r
397 if (iPctX10 >= 999)
\r
402 double SFasta::GetPctDone() const
\r
404 if (m_FilePos == 0 || m_FileSize == 0)
\r
407 assert(m_FilePos >= (off_t) m_BufferBytes);
\r
408 off_t BufferStart = m_FilePos - m_BufferBytes;
\r
409 off_t BufferPos = BufferStart + m_BufferOffset;
\r
411 return double(BufferPos)*100.0/double(m_FileSize);
\r
414 bool SFasta::GetNextSD(SeqData &SD)
\r
416 SD.Seq = GetNextSeq();
\r
420 SD.Label = GetLabel();
\r
421 SD.L = GetSeqLength();
\r
422 SD.Index = GetSeqIndex();
\r
424 SD.Nucleo = GetIsNucleo();
\r
425 SD.RevComp = false;
\r
434 SF.Open(opt_input);
\r
438 Log(" Index Length Label\n");
\r
439 Log("------- ------- -----\n");
\r
442 unsigned Index = 0;
\r
443 unsigned SeqCount = 0;
\r
444 double LetterCount = 0.0;
\r
445 ProgressStep(0, 1000, "Reading");
\r
448 const byte *Seq = SF.GetNextSeq();
\r
451 ProgressStep(SF.GetPctDoneX10(), 1000, "Reading");
\r
452 const char *Label = SF.GetLabel();
\r
453 unsigned L = SF.GetSeqLength();
\r
459 Log(">%7u %7u '%s'\n", Index, L, Label);
\r
460 Log("+%7.7s %7.7s \"%*.*s\"\n", "", "", L, L, Seq);
\r
465 ProgressStep(999, 1000, "Reading");
\r
467 Progress("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));
\r
468 Log("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));
\r