1 /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
7 /* Byte array for fast convertion of binary blocks to DNA. */
8 /* Binary blocks holds four nucleotides encoded in 2 bits: */
9 /* A=00 T=11 C=01 G=10 */
10 char *bin2dna[256] = {
11 "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT",
12 "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT",
13 "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT",
14 "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT",
15 "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT",
16 "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT",
17 "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT",
18 "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT",
19 "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT",
20 "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT",
21 "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT",
22 "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT",
23 "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT",
24 "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT",
25 "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT",
26 "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT",
27 "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT",
28 "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT",
29 "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT",
30 "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT",
31 "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT",
32 "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT",
33 "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT",
34 "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT",
35 "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT",
36 "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT",
37 "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT",
38 "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT",
39 "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT",
40 "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT",
41 "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT",
42 "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT"
46 seq_entry *seq_new( size_t max_seq_name, size_t max_seq )
48 /* Martin A. Hansen, August 2008 */
50 /* Initialize a new sequence entry. */
52 seq_entry *entry = NULL;
53 entry = mem_get( sizeof( seq_entry ) );
54 entry->seq_name = mem_get( max_seq_name );
55 entry->seq = mem_get( max_seq );
62 void seq_destroy( seq_entry *entry )
64 /* Martin A. Hansen, August 2008 */
66 /* Destroy a sequence entry. */
68 mem_free( &entry->seq_name );
69 mem_free( &entry->seq );
74 void seq_uppercase( char *seq )
76 /* Martin A. Hansen, May 2008 */
78 /* Uppercase a sequence in place. */
82 for ( i = 0; seq[ i ]; i++ ) {
83 seq[ i ] = toupper( seq[ i ] );
88 void lowercase_seq( char *seq )
90 /* Martin A. Hansen, May 2008 */
92 /* Lowercase a sequence in place. */
96 for ( i = 0; seq[ i ]; i++ ) {
97 seq[ i ] = tolower( seq[ i ] );
102 void revcomp_dna( char *seq )
104 /* Martin A. Hansen, May 2008 */
106 /* Reverse complement a DNA sequence in place. */
108 complement_dna( seq );
113 void revcomp_rna( char *seq )
115 /* Martin A. Hansen, May 2008 */
117 /* Reverse complement a RNA sequence in place. */
119 complement_rna( seq );
124 void revcomp_nuc( char *seq )
126 /* Martin A. Hansen, May 2008 */
128 /* Reverse complements a nucleotide sequence in place. */
130 complement_nuc( seq );
135 void complement_nuc( char *seq )
137 /* Martin A. Hansen, May 2008 */
139 /* Complements a nucleotide sequence, */
140 /* after guess the type. */
142 if ( is_dna( seq ) ) {
143 complement_dna( seq );
144 } else if ( is_rna( seq ) ) {
145 complement_rna( seq );
152 void complement_dna( char *seq )
154 /* Martin A. Hansen, May 2008 */
156 /* Complements a DNA sequence including */
157 /* ambiguity coded nucleotides. */;
161 for ( i = 0; seq[ i ]; i++ )
165 case 'a': seq[ i ] = 't'; break;
166 case 'A': seq[ i ] = 'T'; break;
167 case 'c': seq[ i ] = 'g'; break;
168 case 'C': seq[ i ] = 'G'; break;
169 case 'g': seq[ i ] = 'c'; break;
170 case 'G': seq[ i ] = 'C'; break;
171 case 't': seq[ i ] = 'a'; break;
172 case 'u': seq[ i ] = 'a'; break;
173 case 'T': seq[ i ] = 'A'; break;
174 case 'U': seq[ i ] = 'A'; break;
175 case 'm': seq[ i ] = 'k'; break;
176 case 'M': seq[ i ] = 'K'; break;
177 case 'r': seq[ i ] = 'y'; break;
178 case 'R': seq[ i ] = 'Y'; break;
179 case 'w': seq[ i ] = 'w'; break;
180 case 'W': seq[ i ] = 'W'; break;
181 case 's': seq[ i ] = 'S'; break;
182 case 'S': seq[ i ] = 'S'; break;
183 case 'y': seq[ i ] = 'r'; break;
184 case 'Y': seq[ i ] = 'R'; break;
185 case 'k': seq[ i ] = 'm'; break;
186 case 'K': seq[ i ] = 'M'; break;
187 case 'b': seq[ i ] = 'v'; break;
188 case 'B': seq[ i ] = 'V'; break;
189 case 'd': seq[ i ] = 'h'; break;
190 case 'D': seq[ i ] = 'H'; break;
191 case 'h': seq[ i ] = 'd'; break;
192 case 'H': seq[ i ] = 'D'; break;
193 case 'v': seq[ i ] = 'b'; break;
194 case 'V': seq[ i ] = 'B'; break;
195 case 'n': seq[ i ] = 'n'; break;
196 case 'N': seq[ i ] = 'N'; break;
203 void complement_rna( char *seq )
205 /* Martin A. Hansen, May 2008 */
207 /* Complements an RNA sequence including */
208 /* ambiguity coded nucleotides. */;
212 for ( i = 0; seq[ i ]; i++ )
216 case 'a': seq[ i ] = 'u'; break;
217 case 'A': seq[ i ] = 'U'; break;
218 case 'c': seq[ i ] = 'g'; break;
219 case 'C': seq[ i ] = 'G'; break;
220 case 'g': seq[ i ] = 'c'; break;
221 case 'G': seq[ i ] = 'C'; break;
222 case 't': seq[ i ] = 'a'; break;
223 case 'u': seq[ i ] = 'a'; break;
224 case 'T': seq[ i ] = 'A'; break;
225 case 'U': seq[ i ] = 'A'; break;
226 case 'm': seq[ i ] = 'k'; break;
227 case 'M': seq[ i ] = 'K'; break;
228 case 'r': seq[ i ] = 'y'; break;
229 case 'R': seq[ i ] = 'Y'; break;
230 case 'w': seq[ i ] = 'w'; break;
231 case 'W': seq[ i ] = 'W'; break;
232 case 's': seq[ i ] = 'S'; break;
233 case 'S': seq[ i ] = 'S'; break;
234 case 'y': seq[ i ] = 'r'; break;
235 case 'Y': seq[ i ] = 'R'; break;
236 case 'k': seq[ i ] = 'm'; break;
237 case 'K': seq[ i ] = 'M'; break;
238 case 'b': seq[ i ] = 'v'; break;
239 case 'B': seq[ i ] = 'V'; break;
240 case 'd': seq[ i ] = 'h'; break;
241 case 'D': seq[ i ] = 'H'; break;
242 case 'h': seq[ i ] = 'd'; break;
243 case 'H': seq[ i ] = 'D'; break;
244 case 'v': seq[ i ] = 'b'; break;
245 case 'V': seq[ i ] = 'B'; break;
246 case 'n': seq[ i ] = 'n'; break;
247 case 'N': seq[ i ] = 'N'; break;
254 void reverse( char *string )
256 /* Martin A. Hansen, May 2008 */
258 /* Reverses a string in place. */
265 j = strlen( string ) - 1;
271 string[ i ] = string[ j ];
280 void seq2nuc_simple( char *seq )
282 /* Martin A. Hansen, May 2008 */
284 /* Uppercases all DNA letters, while transforming */
285 /* all non-DNA letters in sequence to Ns. */
289 for ( i = 0; seq[ i ]; i++ )
299 case 'a': seq[ i ] = 'A'; break;
300 case 't': seq[ i ] = 'T'; break;
301 case 'c': seq[ i ] = 'C'; break;
302 case 'g': seq[ i ] = 'G'; break;
303 case 'u': seq[ i ] = 'U'; break;
304 default: seq[ i ] = 'N';
310 void dna2rna( char *seq )
312 /* Martin A. Hansen, May 2008 */
314 /* Converts a DNA sequence to RNA by changing T and t to U and u. */
318 for ( i = 0; seq[ i ]; i++ )
322 case 't': seq[ i ] = 'u'; break;
323 case 'T': seq[ i ] = 'U'; break;
330 void rna2dna( char *seq )
332 /* Martin A. Hansen, May 2008 */
334 /* Converts a RNA sequence to RNA by changing T and u to T and t. */
338 for ( i = 0; seq[ i ]; i++ )
342 case 'u': seq[ i ] = 't'; break;
343 case 'U': seq[ i ] = 'T'; break;
350 bool is_dna( char *seq )
352 /* Martin A. Hansen, May 2008 */
354 /* Determines if a given sequence is DNA, */
355 /* from inspection of the first 100 residues. */
359 for ( i = 0; seq[ i ]; i++ )
363 case 'A': case 'a': break;
364 case 'G': case 'g': break;
365 case 'C': case 'c': break;
366 case 'T': case 't': break;
367 case 'R': case 'r': break;
368 case 'Y': case 'y': break;
369 case 'W': case 'w': break;
370 case 'S': case 's': break;
371 case 'M': case 'm': break;
372 case 'K': case 'k': break;
373 case 'H': case 'h': break;
374 case 'D': case 'd': break;
375 case 'V': case 'v': break;
376 case 'B': case 'b': break;
377 case 'N': case 'n': break;
382 default: return FALSE;
394 bool is_rna( char *seq )
396 /* Martin A. Hansen, May 2008 */
398 /* Determines if a given sequence is RNA, */
399 /* from inspection of the first 100 residues. */
403 for ( i = 0; seq[ i ]; i++ )
407 case 'A': case 'a': break;
408 case 'G': case 'g': break;
409 case 'C': case 'c': break;
410 case 'U': case 'u': break;
411 case 'R': case 'r': break;
412 case 'Y': case 'y': break;
413 case 'W': case 'w': break;
414 case 'S': case 's': break;
415 case 'M': case 'm': break;
416 case 'K': case 'k': break;
417 case 'H': case 'h': break;
418 case 'D': case 'd': break;
419 case 'V': case 'v': break;
420 case 'B': case 'b': break;
421 case 'N': case 'n': break;
426 default: return FALSE;
438 bool is_protein( char *seq )
440 /* Martin A. Hansen, May 2008 */
442 /* Determines if a given sequence is protein, */
443 /* from inspection of the first 100 residues. */
447 for ( i = 0; seq[ i ]; i++ )
451 case 'K': case 'k': break;
452 case 'R': case 'r': break;
453 case 'H': case 'h': break;
454 case 'D': case 'd': break;
455 case 'E': case 'e': break;
456 case 'S': case 's': break;
457 case 'T': case 't': break;
458 case 'N': case 'n': break;
459 case 'Q': case 'q': break;
460 case 'A': case 'a': break;
461 case 'V': case 'v': break;
462 case 'I': case 'i': break;
463 case 'L': case 'l': break;
464 case 'M': case 'm': break;
465 case 'F': case 'f': break;
466 case 'Y': case 'y': break;
467 case 'W': case 'w': break;
468 case 'C': case 'c': break;
469 case 'G': case 'g': break;
470 case 'P': case 'p': break;
471 case 'Z': case 'z': break;
472 case 'B': case 'b': break;
473 case 'X': case 'x': break;
479 default: return FALSE;
491 char *seq_guess_type( char *seq )
493 /* Martin A. Hansen, May 2008 */
495 /* Guess the type of a given sequnce, */
496 /* which is returned as a pointer to a string. */
502 if ( is_dna( seq ) ) {
504 } else if ( is_rna( seq ) ) {
506 } else if ( is_protein( seq ) ) {
516 bool contain_N( char *seq )
518 /* Martin A. Hansen, May 2008 */
520 /* Check if a sequence contain N or n residues. */
524 for ( i = 0; seq[ i ]; i++ )
528 case 'N': case 'n': return TRUE;
537 int oligo2bin( char *oligo )
539 /* Martin A. Hansen, August 2004 */
541 /* Pack a max 15 nucleotide long oligo into a four byte integer. */
546 if ( strlen( oligo ) > 15 ) {
552 for ( i = 0; oligo[ i ]; i++ )
556 switch ( oligo[ i ] )
558 case 'A': case 'a': bin |= 0; break;
559 case 'N': case 'n': bin |= 0; break;
560 case 'T': case 't': bin |= 1; break;
561 case 'U': case 'u': bin |= 1; break;
562 case 'C': case 'c': bin |= 2; break;
563 case 'G': case 'g': bin |= 3; break;