code_c/Maasha/src/lib/seq.c

   1 /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
   2
   3 #include "common.h"
   4 #include "mem.h"
   5 #include "seq.h"
   6
   7 /* Byte array for fast convertion of binary blocks to DNA. */
   8 /* Binary blocks holds four nucleotides encoded in 2 bits: */
   9 /* A=00 T=11 C=01 G=10 */
  10 char *bin2dna[256] = {
  11     "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT",
  12     "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT",
  13     "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT",
  14     "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT",
  15     "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT",
  16     "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT",
  17     "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT",
  18     "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT",
  19     "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT",
  20     "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT",
  21     "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT",
  22     "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT",
  23     "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT",
  24     "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT",
  25     "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT",
  26     "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT",
  27     "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT",
  28     "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT",
  29     "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT",
  30     "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT",
  31     "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT",
  32     "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT",
  33     "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT",
  34     "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT",
  35     "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT",
  36     "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT",
  37     "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT",
  38     "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT",
  39     "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT",
  40     "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT",
  41     "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT",
  42     "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT"
  43 };
  44
  45
  46 seq_entry *seq_new( size_t max_seq_name, size_t max_seq )
  47 {
  48     /* Martin A. Hansen, August 2008 */
  49
  50     /* Initialize a new sequence entry. */
  51
  52     seq_entry *entry = NULL;
  53     entry            = mem_get( sizeof( seq_entry ) );
  54     entry->seq_name  = mem_get( max_seq_name );
  55     entry->seq       = mem_get( max_seq );
  56     entry->seq_len   = 0;
  57
  58     return entry;
  59 }
  60
  61
  62 void seq_destroy( seq_entry *entry )
  63 {
  64     /* Martin A. Hansen, August 2008 */
  65
  66     /* Destroy a sequence entry. */
  67
  68     mem_free( &entry->seq_name );
  69     mem_free( &entry->seq );
  70     mem_free( &entry );
  71 }
  72
  73
  74 void seq_uppercase( char *seq )
  75 {
  76     /* Martin A. Hansen, May 2008 */
  77
  78     /* Uppercase a sequence in place. */
  79
  80     size_t i;
  81
  82     for ( i = 0; seq[ i ]; i++ ) {
  83         seq[ i ] = toupper( seq[ i ] );
  84     }
  85 }
  86
  87
  88 void lowercase_seq( char *seq )
  89 {
  90     /* Martin A. Hansen, May 2008 */
  91
  92     /* Lowercase a sequence in place. */
  93
  94     size_t i;
  95
  96     for ( i = 0; seq[ i ]; i++ ) {
  97         seq[ i ] = tolower( seq[ i ] );
  98     }
  99 }
 100
 101
 102 void revcomp_dna( char *seq )
 103 {
 104     /* Martin A. Hansen, May 2008 */
 105
 106     /* Reverse complement a DNA sequence in place. */
 107
 108     complement_dna( seq );
 109     reverse( seq );
 110 }
 111
 112
 113 void revcomp_rna( char *seq )
 114 {
 115     /* Martin A. Hansen, May 2008 */
 116
 117     /* Reverse complement a RNA sequence in place. */
 118
 119     complement_rna( seq );
 120     reverse( seq );
 121 }
 122
 123
 124 void revcomp_nuc( char *seq )
 125 {
 126     /* Martin A. Hansen, May 2008 */
 127
 128     /* Reverse complements a nucleotide sequence in place. */
 129
 130     complement_nuc( seq );
 131     reverse( seq );
 132 }
 133
 134
 135 void complement_nuc( char *seq )
 136 {
 137     /* Martin A. Hansen, May 2008 */
 138
 139     /* Complements a nucleotide sequence, */
 140     /* after guess the type. */
 141
 142     if ( is_dna( seq ) ) {
 143         complement_dna( seq );
 144     } else if ( is_rna( seq ) ) {
 145         complement_rna( seq );
 146     } else {
 147         abort();
 148     }
 149 }
 150
 151
 152 void complement_dna( char *seq )
 153 {
 154     /* Martin A. Hansen, May 2008 */
 155
 156     /* Complements a DNA sequence including */
 157     /* ambiguity coded nucleotides. */;
 158
 159     size_t i;
 160
 161     for ( i = 0; seq[ i ]; i++ )
 162     {
 163         switch ( seq[ i ] )
 164         {
 165             case 'a': seq[ i ] = 't'; break;
 166             case 'A': seq[ i ] = 'T'; break;
 167             case 'c': seq[ i ] = 'g'; break;
 168             case 'C': seq[ i ] = 'G'; break;
 169             case 'g': seq[ i ] = 'c'; break;
 170             case 'G': seq[ i ] = 'C'; break;
 171             case 't': seq[ i ] = 'a'; break;
 172             case 'u': seq[ i ] = 'a'; break;
 173             case 'T': seq[ i ] = 'A'; break;
 174             case 'U': seq[ i ] = 'A'; break;
 175             case 'm': seq[ i ] = 'k'; break;
 176             case 'M': seq[ i ] = 'K'; break;
 177             case 'r': seq[ i ] = 'y'; break;
 178             case 'R': seq[ i ] = 'Y'; break;
 179             case 'w': seq[ i ] = 'w'; break;
 180             case 'W': seq[ i ] = 'W'; break;
 181             case 's': seq[ i ] = 'S'; break;
 182             case 'S': seq[ i ] = 'S'; break;
 183             case 'y': seq[ i ] = 'r'; break;
 184             case 'Y': seq[ i ] = 'R'; break;
 185             case 'k': seq[ i ] = 'm'; break;
 186             case 'K': seq[ i ] = 'M'; break;
 187             case 'b': seq[ i ] = 'v'; break;
 188             case 'B': seq[ i ] = 'V'; break;
 189             case 'd': seq[ i ] = 'h'; break;
 190             case 'D': seq[ i ] = 'H'; break;
 191             case 'h': seq[ i ] = 'd'; break;
 192             case 'H': seq[ i ] = 'D'; break;
 193             case 'v': seq[ i ] = 'b'; break;
 194             case 'V': seq[ i ] = 'B'; break;
 195             case 'n': seq[ i ] = 'n'; break;
 196             case 'N': seq[ i ] = 'N'; break;
 197             default: break;
 198         }
 199     }
 200 }
 201
 202
 203 void complement_rna( char *seq )
 204 {
 205     /* Martin A. Hansen, May 2008 */
 206
 207     /* Complements an RNA sequence including */
 208     /* ambiguity coded nucleotides. */;
 209
 210     size_t i;
 211
 212     for ( i = 0; seq[ i ]; i++ )
 213     {
 214         switch ( seq[ i ] )
 215         {
 216             case 'a': seq[ i ] = 'u'; break;
 217             case 'A': seq[ i ] = 'U'; break;
 218             case 'c': seq[ i ] = 'g'; break;
 219             case 'C': seq[ i ] = 'G'; break;
 220             case 'g': seq[ i ] = 'c'; break;
 221             case 'G': seq[ i ] = 'C'; break;
 222             case 't': seq[ i ] = 'a'; break;
 223             case 'u': seq[ i ] = 'a'; break;
 224             case 'T': seq[ i ] = 'A'; break;
 225             case 'U': seq[ i ] = 'A'; break;
 226             case 'm': seq[ i ] = 'k'; break;
 227             case 'M': seq[ i ] = 'K'; break;
 228             case 'r': seq[ i ] = 'y'; break;
 229             case 'R': seq[ i ] = 'Y'; break;
 230             case 'w': seq[ i ] = 'w'; break;
 231             case 'W': seq[ i ] = 'W'; break;
 232             case 's': seq[ i ] = 'S'; break;
 233             case 'S': seq[ i ] = 'S'; break;
 234             case 'y': seq[ i ] = 'r'; break;
 235             case 'Y': seq[ i ] = 'R'; break;
 236             case 'k': seq[ i ] = 'm'; break;
 237             case 'K': seq[ i ] = 'M'; break;
 238             case 'b': seq[ i ] = 'v'; break;
 239             case 'B': seq[ i ] = 'V'; break;
 240             case 'd': seq[ i ] = 'h'; break;
 241             case 'D': seq[ i ] = 'H'; break;
 242             case 'h': seq[ i ] = 'd'; break;
 243             case 'H': seq[ i ] = 'D'; break;
 244             case 'v': seq[ i ] = 'b'; break;
 245             case 'V': seq[ i ] = 'B'; break;
 246             case 'n': seq[ i ] = 'n'; break;
 247             case 'N': seq[ i ] = 'N'; break;
 248             default: break;
 249         }
 250     }
 251 }
 252
 253
 254 void reverse( char *string )
 255 {
 256     /* Martin A. Hansen, May 2008 */
 257
 258     /* Reverses a string in place. */
 259
 260     char   c;
 261     size_t i;
 262     size_t j;
 263
 264     i = 0;
 265     j = strlen( string ) - 1;
 266
 267     while ( i <= j )
 268     {
 269         c = string[ i ];
 270
 271         string[ i ] = string[ j ];
 272         string[ j ] = c;
 273
 274         i++;
 275         j--;
 276     }
 277 }
 278
 279
 280 void seq2nuc_simple( char *seq )
 281 {
 282     /* Martin A. Hansen, May 2008 */
 283
 284     /* Uppercases all DNA letters, while transforming */
 285     /* all non-DNA letters in sequence to Ns. */
 286
 287     size_t i;
 288
 289     for ( i = 0; seq[ i ]; i++ )
 290     {
 291         switch ( seq[ i ] )
 292         {
 293             case 'A': break;
 294             case 'T': break;
 295             case 'C': break;
 296             case 'G': break;
 297             case 'U': break;
 298             case 'N': break;
 299             case 'a': seq[ i ] = 'A'; break;
 300             case 't': seq[ i ] = 'T'; break;
 301             case 'c': seq[ i ] = 'C'; break;
 302             case 'g': seq[ i ] = 'G'; break;
 303             case 'u': seq[ i ] = 'U'; break;
 304             default:  seq[ i ] = 'N';
 305         }
 306     }
 307 }
 308
 309
 310 void dna2rna( char *seq )
 311 {
 312     /* Martin A. Hansen, May 2008 */
 313
 314     /* Converts a DNA sequence to RNA by changing T and t to U and u. */
 315
 316     size_t i;
 317
 318     for ( i = 0; seq[ i ]; i++ )
 319     {
 320         switch ( seq[ i ] )
 321         {
 322             case 't': seq[ i ] = 'u'; break;
 323             case 'T': seq[ i ] = 'U'; break;
 324             default: break;
 325         }
 326     }
 327 }
 328
 329
 330 void rna2dna( char *seq )
 331 {
 332     /* Martin A. Hansen, May 2008 */
 333
 334     /* Converts a RNA sequence to RNA by changing T and u to T and t. */
 335
 336     size_t i;
 337
 338     for ( i = 0; seq[ i ]; i++ )
 339     {
 340         switch ( seq[ i ] )
 341         {
 342             case 'u': seq[ i ] = 't'; break;
 343             case 'U': seq[ i ] = 'T'; break;
 344             default: break;
 345         }
 346     }
 347 }
 348
 349
 350 bool is_dna( char *seq )
 351 {
 352     /* Martin A. Hansen, May 2008 */
 353
 354     /* Determines if a given sequence is DNA, */
 355     /* from inspection of the first 100 residues. */
 356
 357     size_t i;
 358
 359     for ( i = 0; seq[ i ]; i++ )
 360     {
 361         switch ( seq[ i ] )
 362         {
 363             case 'A': case 'a': break;
 364             case 'G': case 'g': break;
 365             case 'C': case 'c': break;
 366             case 'T': case 't': break;
 367             case 'R': case 'r': break;
 368             case 'Y': case 'y': break;
 369             case 'W': case 'w': break;
 370             case 'S': case 's': break;
 371             case 'M': case 'm': break;
 372             case 'K': case 'k': break;
 373             case 'H': case 'h': break;
 374             case 'D': case 'd': break;
 375             case 'V': case 'v': break;
 376             case 'B': case 'b': break;
 377             case 'N': case 'n': break;
 378             case '-': break;
 379             case '~': break;
 380             case '_': break;
 381             case '.': break;
 382             default: return FALSE;
 383         }
 384
 385         if ( i == 100 ) {
 386             break;
 387         }
 388     }
 389
 390     return TRUE;
 391 }
 392
 393
 394 bool is_rna( char *seq )
 395 {
 396     /* Martin A. Hansen, May 2008 */
 397
 398     /* Determines if a given sequence is RNA, */
 399     /* from inspection of the first 100 residues. */
 400
 401     size_t i;
 402
 403     for ( i = 0; seq[ i ]; i++ )
 404     {
 405         switch ( seq[ i ] )
 406         {
 407             case 'A': case 'a': break;
 408             case 'G': case 'g': break;
 409             case 'C': case 'c': break;
 410             case 'U': case 'u': break;
 411             case 'R': case 'r': break;
 412             case 'Y': case 'y': break;
 413             case 'W': case 'w': break;
 414             case 'S': case 's': break;
 415             case 'M': case 'm': break;
 416             case 'K': case 'k': break;
 417             case 'H': case 'h': break;
 418             case 'D': case 'd': break;
 419             case 'V': case 'v': break;
 420             case 'B': case 'b': break;
 421             case 'N': case 'n': break;
 422             case '-': break;
 423             case '~': break;
 424             case '_': break;
 425             case '.': break;
 426             default: return FALSE;
 427         }
 428
 429         if ( i == 100 ) {
 430             break;
 431         }
 432     }
 433
 434     return TRUE;
 435 }
 436
 437
 438 bool is_protein( char *seq )
 439 {
 440     /* Martin A. Hansen, May 2008 */
 441
 442     /* Determines if a given sequence is protein, */
 443     /* from inspection of the first 100 residues. */
 444
 445     size_t i;
 446
 447     for ( i = 0; seq[ i ]; i++ )
 448     {
 449         switch ( seq[ i ] )
 450         {
 451             case 'K': case 'k': break;
 452             case 'R': case 'r': break;
 453             case 'H': case 'h': break;
 454             case 'D': case 'd': break;
 455             case 'E': case 'e': break;
 456             case 'S': case 's': break;
 457             case 'T': case 't': break;
 458             case 'N': case 'n': break;
 459             case 'Q': case 'q': break;
 460             case 'A': case 'a': break;
 461             case 'V': case 'v': break;
 462             case 'I': case 'i': break;
 463             case 'L': case 'l': break;
 464             case 'M': case 'm': break;
 465             case 'F': case 'f': break;
 466             case 'Y': case 'y': break;
 467             case 'W': case 'w': break;
 468             case 'C': case 'c': break;
 469             case 'G': case 'g': break;
 470             case 'P': case 'p': break;
 471             case 'Z': case 'z': break;
 472             case 'B': case 'b': break;
 473             case 'X': case 'x': break;
 474             case '*': break;
 475             case '-': break;
 476             case '~': break;
 477             case '_': break;
 478             case '.': break;
 479             default: return FALSE;
 480         }
 481
 482         if ( i == 100 ) {
 483             break;
 484         }
 485     }
 486
 487     return TRUE;
 488 }
 489
 490
 491 char *seq_guess_type( char *seq )
 492 {
 493     /* Martin A. Hansen, May 2008 */
 494
 495     /* Guess the type of a given sequnce, */
 496     /* which is returned as a pointer to a string. */
 497
 498     char *type;
 499
 500     type = mem_get( 8 );
 501
 502     if ( is_dna( seq ) ) {
 503         type = "DNA";
 504     } else if ( is_rna( seq ) ) {
 505         type = "RNA";
 506     } else if ( is_protein( seq ) ) {
 507         type = "PROTEIN";
 508     } else {
 509         abort();
 510     }
 511
 512     return type;
 513 }
 514
 515
 516 bool contain_N( char *seq )
 517 {
 518     /* Martin A. Hansen, May 2008 */
 519
 520     /* Check if a sequence contain N or n residues. */
 521
 522     size_t i;
 523
 524     for ( i = 0; seq[ i ]; i++ )
 525     {
 526         switch ( seq[ i ] )
 527         {
 528             case 'N': case 'n': return TRUE;
 529             default: break;
 530         }
 531     }
 532
 533     return FALSE;
 534 }
 535
 536
 537 int oligo2bin( char *oligo )
 538 {
 539     /* Martin A. Hansen, August 2004 */
 540
 541     /* Pack a max 15 nucleotide long oligo into a four byte integer. */
 542
 543     int i;
 544     int bin;
 545
 546     if ( strlen( oligo ) > 15 ) {
 547         abort();
 548     }
 549
 550     bin = 0;
 551
 552     for ( i = 0; oligo[ i ]; i++ )
 553     {
 554         bin <<= 2;
 555
 556         switch ( oligo[ i ] )
 557         {
 558             case 'A': case 'a': bin |= 0; break;
 559             case 'N': case 'n': bin |= 0; break;
 560             case 'T': case 't': bin |= 1; break;
 561             case 'U': case 'u': bin |= 1; break;
 562             case 'C': case 'c': bin |= 2; break;
 563             case 'G': case 'g': bin |= 3; break;
 564             default: abort();
 565         }
 566     }
 567
 568     return bin;
 569 }