8 void SeqToFasta(FILE *f, const char *Label, const byte *Seq, unsigned L)
\r
10 const unsigned ROWLEN = 80;
\r
12 fprintf(f, ">%s\n", Label);
\r
13 unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;
\r
14 for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)
\r
16 unsigned From = BlockIndex*ROWLEN;
\r
17 unsigned To = From + ROWLEN;
\r
20 for (unsigned Pos = From; Pos < To; ++Pos)
\r
36 void SeqDB::Clear(bool ctor)
\r
40 for (unsigned i = 0; i < m_SeqCount; ++i)
\r
42 unsigned n = strlen(m_Labels[i]);
\r
43 MYFREE(m_Labels[i], n, SeqDB);
\r
44 MYFREE(m_Seqs[i], m_SeqLengths[i], SeqDB);
\r
46 MYFREE(m_Labels, m_Size, SeqDB);
\r
47 MYFREE(m_Seqs, m_Size, SeqDB);
\r
48 MYFREE(m_SeqLengths, m_Size, SeqDB);
\r
61 m_IsNucleoSet = false;
\r
64 void SeqDB::InitEmpty(bool Nucleo)
\r
67 m_IsNucleo = Nucleo;
\r
68 m_IsNucleoSet = true;
\r
71 void SeqDB::FromFasta(const string &FileName, bool AllowGaps)
\r
74 m_FileName = FileName;
\r
78 SF.m_AllowGaps = AllowGaps;
\r
80 ProgressStep(0, 1000, "Reading %s", FileName.c_str());
\r
83 unsigned QueryPctDoneX10 = SF.GetPctDoneX10();
\r
84 ProgressStep(QueryPctDoneX10, 1000, "Reading %s", FileName.c_str());
\r
85 const byte *Seq = SF.GetNextSeq();
\r
89 const char *Label = SF.GetLabel();
\r
90 unsigned L = SF.GetSeqLength();
\r
91 AddSeq(Label, Seq, L);
\r
93 ProgressStep(999, 1000, "Reading %s", FileName.c_str());
\r
97 Progress("%s sequences\n", IntToStr(GetSeqCount()));
\r
100 void SeqDB::ToFasta(const string &FileName) const
\r
102 FILE *f = CreateStdioFile(FileName);
\r
103 for (unsigned SeqIndex = 0; SeqIndex < GetSeqCount(); ++SeqIndex)
\r
104 ToFasta(f, SeqIndex);
\r
108 void SeqDB::SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel) const
\r
111 fprintf(f, ">%s\n", GetLabel(SeqIndex));
\r
113 const unsigned ROWLEN = 80;
\r
115 unsigned L = GetSeqLength(SeqIndex);
\r
116 const byte *Seq = GetSeq(SeqIndex);
\r
117 unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN;
\r
118 for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex)
\r
120 unsigned From = BlockIndex*ROWLEN;
\r
121 unsigned To = From + ROWLEN;
\r
124 for (unsigned Pos = From; Pos < To; ++Pos)
\r
125 fputc(Seq[Pos], f);
\r
130 void SeqDB::ToFasta(FILE *f, unsigned SeqIndex) const
\r
132 asserta(SeqIndex < m_SeqCount);
\r
133 fprintf(f, ">%s\n", GetLabel(SeqIndex));
\r
134 SeqToFasta(f, SeqIndex);
\r
137 unsigned SeqDB::GetMaxLabelLength() const
\r
139 const unsigned SeqCount = GetSeqCount();
\r
141 for (unsigned Index = 0; Index < SeqCount; ++Index)
\r
143 unsigned L = (unsigned) strlen(m_Labels[Index]);
\r
150 unsigned SeqDB::GetMaxSeqLength() const
\r
152 const unsigned SeqCount = GetSeqCount();
\r
154 for (unsigned Index = 0; Index < SeqCount; ++Index)
\r
156 unsigned L = m_SeqLengths[Index];
\r
163 void SeqDB::LogMe() const
\r
166 const unsigned SeqCount = GetSeqCount();
\r
167 Log("SeqDB %u seqs, aligned=%c\n", SeqCount, tof(m_Aligned));
\r
171 Log("Index Label Length Seq\n");
\r
172 Log("----- ---------------- ------ ---\n");
\r
173 for (unsigned Index = 0; Index < SeqCount; ++Index)
\r
176 Log(" %16.16s", m_Labels[Index]);
\r
177 unsigned L = m_SeqLengths[Index];
\r
179 Log(" %*.*s", L, L, m_Seqs[Index]);
\r
184 void SeqDB::GetSeqData(unsigned Id, SeqData &Buffer) const
\r
186 asserta(Id < m_SeqCount);
\r
187 Buffer.Seq = m_Seqs[Id];
\r
188 Buffer.Label = m_Labels[Id];
\r
189 Buffer.L = m_SeqLengths[Id];
\r
191 Buffer.ORFParent = 0;
\r
192 Buffer.RevComp = false;
\r
193 Buffer.Nucleo = IsNucleo();
\r
196 void SeqDB::SetIsNucleo()
\r
198 const unsigned SeqCount = GetSeqCount();
\r
200 for (unsigned i = 0; i < 100; ++i)
\r
202 unsigned SeqIndex = unsigned(rand()%SeqCount);
\r
203 const byte *Seq = GetSeq(SeqIndex);
\r
204 unsigned L = GetSeqLength(SeqIndex);
\r
205 const unsigned Pos = unsigned(rand()%L);
\r
208 if (g_IsNucleoChar[c])
\r
211 m_IsNucleo = (N > 80);
\r
212 m_IsNucleoSet = true;
\r
215 unsigned SeqDB::GetTotalLength() const
\r
217 const unsigned SeqCount = GetSeqCount();
\r
218 unsigned TotalLength = 0;
\r
219 for (unsigned Id = 0; Id < SeqCount; ++Id)
\r
220 TotalLength += GetSeqLength(Id);
\r
221 return TotalLength;
\r
224 unsigned SeqDB::AddSeq(const char *Label, const byte *Seq, unsigned L)
\r
226 StartTimer(AddSeq);
\r
227 if (m_SeqCount >= m_Size)
\r
229 unsigned NewSize = unsigned(m_Size*1.5) + 1024;
\r
230 char **NewLabels = MYALLOC(char *, NewSize, SeqDB);
\r
231 byte **NewSeqs = MYALLOC(byte *, NewSize, SeqDB);
\r
232 unsigned *NewSeqLengths = MYALLOC(unsigned, NewSize, SeqDB);
\r
234 for (unsigned i = 0; i < m_SeqCount; ++i)
\r
236 NewLabels[i] = m_Labels[i];
\r
237 NewSeqs[i] = m_Seqs[i];
\r
238 NewSeqLengths[i] = m_SeqLengths[i];
\r
241 MYFREE(m_Labels, m_SeqCount, SeqDB);
\r
242 MYFREE(m_Seqs, m_SeqCount, SeqDB);
\r
243 MYFREE(m_SeqLengths, m_SeqCount, SeqDB);
\r
245 m_Labels = NewLabels;
\r
247 m_SeqLengths = NewSeqLengths;
\r
251 unsigned Index = m_SeqCount++;
\r
252 m_Seqs[Index] = MYALLOC(byte, L, SeqDB);
\r
253 memcpy(m_Seqs[Index], Seq, L);
\r
255 unsigned n = strlen(Label) + 1;
\r
256 m_Labels[Index] = MYALLOC(char, n, SeqDB);
\r
257 memcpy(m_Labels[Index], Label, n);
\r
262 m_Aligned = (m_Aligned && L == m_SeqLengths[0]);
\r
264 m_SeqLengths[Index] = L;
\r
270 unsigned SeqDB::GetIndex(const char *Label) const
\r
272 for (unsigned i = 0; i < m_SeqCount; ++i)
\r
273 if (strcmp(Label, m_Labels[i]) == 0)
\r
275 Die("SeqDB::GetIndex(%s), not found", Label);
\r
279 void SeqDB::MakeLabelToIndex(map<string, unsigned> &LabelToIndex)
\r
281 LabelToIndex.clear();
\r
282 for (unsigned i = 0; i < m_SeqCount; ++i)
\r
284 const string &Label = string(GetLabel(i));
\r
285 if (LabelToIndex.find(Label) != LabelToIndex.end())
\r
286 Die("Duplicate label: %s", Label.c_str());
\r
287 LabelToIndex[Label] = i;
\r