6 static inline bool isgap(byte c)
\r
8 return c == '-' || c == '.';
\r
11 const unsigned BufferSize = 16*1024*1024;
\r
13 static unsigned GetMaxPoly(const byte *Seq, unsigned L)
\r
15 byte CurrChar = Seq[0];
\r
17 unsigned MaxLen = 1;
\r
18 for (unsigned i = 1; i < L; ++i)
\r
21 if (c != CurrChar || i+1 == L)
\r
23 unsigned Len = i - Start;
\r
45 m_TooShortCount = 0;
\r
47 m_ShortestLength = 0;
\r
48 m_LongestLength = 0;
\r
50 m_IsNucleoSet = false;
\r
58 void SFasta::Clear()
\r
60 MYFREE(m_Buffer, m_BufferSize, SFasta);
\r
62 CloseStdioFile(m_File);
\r
74 m_SeqIndex = UINT_MAX;
\r
75 m_AllowGaps = false;
\r
77 m_IsNucleoSet = false;
\r
78 m_TooShortCount = 0;
\r
80 m_ShortestLength = 0;
\r
81 m_LongestLength = 0;
\r
85 void SFasta::LogMe() const
\r
88 Log("SFasta::LogMe()\n");
\r
89 Log("FileName=%s\n", m_FileName.c_str());
\r
90 Log("FileSize=%u\n", (unsigned) m_FileSize);
\r
91 Log("FilePos=%u\n", (unsigned) m_FilePos);
\r
92 Log("BufferSize=%u\n", m_BufferSize);
\r
93 Log("BufferPos=%u\n", m_BufferOffset);
\r
94 Log("BufferBytes=%u\n", m_BufferBytes);
\r
96 Log("Label=NULL\n");
\r
98 Log("Label=%s\n", m_Label);
\r
99 Log("SeqLength=%u\n", m_SeqLength);
\r
102 const byte *SFasta::GetNextSeq()
\r
106 const byte *Seq = GetNextSeqLo();
\r
109 if (m_TooShortCount > 0)
\r
110 Warning("%u short sequences (--minlen %u, shortest %u) discarded from %s",
\r
111 m_TooShortCount, opt_minlen, m_ShortestLength, m_FileName.c_str());
\r
112 if (m_TooLongCount > 0)
\r
113 Warning("%u long sequences (--maxlen %u, longest %u) discarded from %s",
\r
114 m_TooLongCount, opt_maxlen, m_LongestLength, m_FileName.c_str());
\r
115 if (m_TooPolyCount > 0)
\r
116 Warning("%u sequences with long homopolymers discarded (--maxpoly %u)",
\r
117 m_TooPolyCount, opt_maxpoly);
\r
120 if (m_SeqLength < opt_minlen)
\r
123 if (m_ShortestLength == 0 || m_SeqLength < m_ShortestLength)
\r
124 m_ShortestLength = m_SeqLength;
\r
127 if (m_SeqLength > opt_maxlen && opt_maxlen != 0)
\r
129 if (m_LongestLength == 0 || m_SeqLength > m_LongestLength)
\r
130 m_LongestLength = m_SeqLength;
\r
138 const byte *SFasta::GetNextSeqLo()
\r
141 if (m_BufferOffset == m_BufferBytes)
\r
144 if (m_FilePos == m_FileSize)
\r
149 StartTimer(SF_GetNextSeq);
\r
150 asserta(m_Buffer[m_BufferOffset] == '>');
\r
151 m_Label = (char *) (m_Buffer + m_BufferOffset + 1);
\r
153 //// Scan to end-of-line.
\r
154 //// Use dubious library function strchr() in the hope
\r
155 //// that it uses fast machine code.
\r
156 // byte *ptr = (byte *) strchr(m_Label, '\n');
\r
157 // asserta(ptr != 0);
\r
161 for (unsigned i = m_BufferOffset; i < m_BufferSize; ++i)
\r
163 char c = m_Buffer[i];
\r
164 if (c == '\n' || c == '\r')
\r
166 ptr = m_Buffer + i;
\r
172 if (opt_trunclabels)
\r
174 for (char *p = m_Label; *p; ++p)
\r
183 for (char *p = m_Label; *p; ++p)
\r
187 else if (*p == '\r' || *p == '\n')
\r
190 char NextChar = *(p+1);
\r
191 if (NextChar == '\r' || NextChar == '\n')
\r
198 // ptr points to end-of-line.
\r
199 // Move to start of sequence data.
\r
202 // Delete white space in-place
\r
204 m_BufferOffset = (unsigned) (ptr - m_Buffer);
\r
205 while (m_BufferOffset < m_BufferBytes)
\r
207 byte c = m_Buffer[m_BufferOffset];
\r
211 if (m_BufferOffset > 0)
\r
212 prevc = m_Buffer[m_BufferOffset-1];
\r
213 if (prevc == '\n' || prevc == '\r')
\r
217 if (isalpha(c) || (isgap(c) && m_AllowGaps))
\r
219 else if (c == '\n' || c == '\r')
\r
223 const char *Label = (m_Label == 0 ? "" : m_Label);
\r
224 static bool WarningDone = false;
\r
228 Warning("Ignoring gaps in FASTA file '%s'",
\r
229 m_FileName.c_str());
\r
230 else if (isprint(c))
\r
231 Warning("Invalid FASTA file '%s', non-letter '%c' in sequence >%s",
\r
232 m_FileName.c_str(), c, Label);
\r
234 Warning("Invalid FASTA file '%s', non-printing byte (hex %02x) in sequence >%s",
\r
235 m_FileName.c_str(), c, Label);
\r
236 WarningDone = true;
\r
241 m_SeqLength = unsigned(To - Seq);
\r
243 if (m_SeqIndex == UINT_MAX)
\r
248 EndTimer(SF_GetNextSeq);
\r
252 void SFasta::Open(const string &FileName)
\r
255 m_FileName = FileName;
\r
256 m_File = OpenStdioFile(FileName);
\r
257 m_BufferSize = BufferSize;
\r
258 //m_Buffer = myalloc<byte>(m_BufferSize);
\r
259 m_Buffer = MYALLOC(byte, m_BufferSize, SFasta);
\r
260 m_FileSize = GetStdioFileSize(m_File);
\r
263 void SFasta::Rewind()
\r
265 m_BufferOffset = 0;
\r
270 bool SFasta::SetIsNucleo()
\r
272 if (m_FilePos != 0)
\r
273 Die("SFasta::IsNucleo, not at BOF");
\r
275 unsigned LetterCount = 0;
\r
276 unsigned NucleoLetterCount = 0;
\r
279 const byte *Seq = GetNextSeq();
\r
282 unsigned L = GetSeqLength();
\r
283 for (unsigned i = 0; i < L; ++i)
\r
284 if (g_IsNucleoChar[Seq[i]])
\r
285 ++NucleoLetterCount;
\r
287 if (LetterCount > 256)
\r
291 if (LetterCount == 0)
\r
293 m_IsNucleoSet = true;
\r
298 // Nucleo if more than 90% nucleo letters AGCTUN
\r
299 m_IsNucleo = double(NucleoLetterCount)/LetterCount > 0.9;
\r
300 m_IsNucleoSet = true;
\r
304 void SFasta::FillCache()
\r
306 StartTimer(SF_FillCache);
\r
307 asserta(m_FilePos < m_FileSize);
\r
309 // off_t may be larger type than unsigned, e.g. 64- vs. 32-bit.
\r
310 off_t otBytesToRead = m_FileSize - m_FilePos;
\r
312 bool FinalBuffer = true;
\r
313 if (otBytesToRead > (off_t) m_BufferSize)
\r
315 FinalBuffer = false;
\r
316 otBytesToRead = m_BufferSize;
\r
319 unsigned BytesToRead = unsigned(otBytesToRead);
\r
320 asserta(BytesToRead > 0);
\r
321 asserta(BytesToRead <= m_BufferSize);
\r
323 SetStdioFilePos(m_File, m_FilePos);
\r
324 ReadStdioFile(m_File, m_Buffer, BytesToRead);
\r
325 if (m_Buffer[0] != '>')
\r
327 if (m_FilePos == 0)
\r
328 Die("Input is not FASTA file");
\r
330 Die("SFasta::FillCache() failed, expected '>'");
\r
333 m_BufferOffset = 0;
\r
335 // If last buffer in file, done
\r
338 m_BufferBytes = BytesToRead;
\r
339 m_FilePos += BytesToRead;
\r
340 EndTimer(SF_FillCache);
\r
344 // If not last buffer, truncate any partial sequence
\r
345 // at end of buffer. Search backwards to find last '>'.
\r
346 byte *ptr = m_Buffer + BytesToRead - 1;
\r
347 while (ptr > m_Buffer)
\r
349 if (ptr[0] == '>' && (ptr[-1] == '\n' || ptr[-1] == '\r'))
\r
354 if (ptr == m_Buffer)
\r
360 // This might techincally be legal FASTA if the entire
\r
361 // buffer is white space, but strange if not the last buffer
\r
362 // in the file, so quit anyway.
\r
363 Die("Failed to find '>' (pos=%u, bytes=%u)",
\r
364 (unsigned) m_FilePos, BytesToRead);
\r
368 // Entire buffer is one sequence which may be truncated.
\r
369 Die("Sequence too long (pos=%u, bytes=%u)",
\r
370 (unsigned) m_FilePos, BytesToRead);
\r
374 asserta(*ptr == '>');
\r
376 m_BufferBytes = unsigned(ptr - m_Buffer);
\r
377 m_FilePos += m_BufferBytes;
\r
379 EndTimer(SF_FillCache);
\r
382 unsigned SFasta::GetPctDoneX10() const
\r
384 if (m_FilePos == 0 || m_FileSize == 0)
\r
387 assert(m_FilePos >= (off_t) m_BufferBytes);
\r
388 off_t BufferStart = m_FilePos - m_BufferBytes;
\r
389 off_t BufferPos = BufferStart + m_BufferOffset;
\r
391 unsigned iPctX10 = unsigned(10.0*double(BufferPos)*100.0/double(m_FileSize));
\r
394 if (iPctX10 >= 999)
\r
399 double SFasta::GetPctDone() const
\r
401 if (m_FilePos == 0 || m_FileSize == 0)
\r
404 assert(m_FilePos >= (off_t) m_BufferBytes);
\r
405 off_t BufferStart = m_FilePos - m_BufferBytes;
\r
406 off_t BufferPos = BufferStart + m_BufferOffset;
\r
408 return double(BufferPos)*100.0/double(m_FileSize);
\r
411 bool SFasta::GetNextSD(SeqData &SD)
\r
413 SD.Seq = GetNextSeq();
\r
417 SD.Label = GetLabel();
\r
418 SD.L = GetSeqLength();
\r
419 SD.Index = GetSeqIndex();
\r
421 SD.Nucleo = GetIsNucleo();
\r
422 SD.RevComp = false;
\r
431 SF.Open(opt_input);
\r
435 Log(" Index Length Label\n");
\r
436 Log("------- ------- -----\n");
\r
439 unsigned Index = 0;
\r
440 unsigned SeqCount = 0;
\r
441 double LetterCount = 0.0;
\r
442 ProgressStep(0, 1000, "Reading");
\r
445 const byte *Seq = SF.GetNextSeq();
\r
448 ProgressStep(SF.GetPctDoneX10(), 1000, "Reading");
\r
449 const char *Label = SF.GetLabel();
\r
450 unsigned L = SF.GetSeqLength();
\r
456 Log(">%7u %7u '%s'\n", Index, L, Label);
\r
457 Log("+%7.7s %7.7s \"%*.*s\"\n", "", "", L, L, Seq);
\r
462 ProgressStep(999, 1000, "Reading");
\r
464 Progress("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));
\r
465 Log("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount));
\r