bp_doc/biotools_cookbook.lyx

   1 #LyX 1.5.1 created this file. For more info see http://www.lyx.org/
   2 \lyxformat 276
   3 \begin_document
   4 \begin_header
   5 \textclass scrartcl
   6 \begin_preamble
   7 \usepackage[colorlinks=true, urlcolor=blue, linkcolor=black]{hyperref}
   8 \end_preamble
   9 \language english
  10 \inputencoding auto
  11 \font_roman default
  12 \font_sans default
  13 \font_typewriter default
  14 \font_default_family default
  15 \font_sc false
  16 \font_osf false
  17 \font_sf_scale 100
  18 \font_tt_scale 100
  19 \graphics default
  20 \paperfontsize default
  21 \spacing single
  22 \papersize default
  23 \use_geometry false
  24 \use_amsmath 1
  25 \use_esint 1
  26 \cite_engine basic
  27 \use_bibtopic false
  28 \paperorientation portrait
  29 \secnumdepth 3
  30 \tocdepth 3
  31 \paragraph_separation skip
  32 \defskip medskip
  33 \quotes_language english
  34 \papercolumns 1
  35 \papersides 1
  36 \paperpagestyle default
  37 \tracking_changes false
  38 \output_changes false
  39 \author ""
  40 \author ""
  41 \end_header
  42
  43 \begin_body
  44
  45 \begin_layout Title
  46 Biotools Cookbook
  47 \end_layout
  48
  49 \begin_layout Author
  50 Martin Asser Hansen
  51 \end_layout
  52
  53 \begin_layout Publishers
  54 John Mattick Group
  55 \newline
  56 Institute for Molecular Bioscience
  57 \newline
  58 University of Queensland
  59 \newline
  60 Aust
  61 ralia
  62 \newline
  63 E-mail: mail@maasha.dk
  64 \end_layout
  65
  66 \begin_layout Standard
  67 \begin_inset ERT
  68 status open
  69
  70 \begin_layout Standard
  71
  72
  73 \backslash
  74 thispagestyle{empty}
  75 \end_layout
  76
  77 \end_inset
  78
  79
  80 \end_layout
  81
  82 \begin_layout Standard
  83
  84 \newpage
  85
  86 \end_layout
  87
  88 \begin_layout Standard
  89 \begin_inset LatexCommand tableofcontents
  90
  91 \end_inset
  92
  93
  94 \end_layout
  95
  96 \begin_layout Standard
  97 \begin_inset FloatList figure
  98
  99 \end_inset
 100
 101
 102 \end_layout
 103
 104 \begin_layout Standard
 105
 106 \newpage
 107
 108 \end_layout
 109
 110 \begin_layout Section
 111 Introduction
 112 \end_layout
 113
 114 \begin_layout Standard
 115 Biotools is a selection of simple tools that can be linked together (piped
 116  as we shall call it) in a very flexible manner to perform both simple and
 117  complex tasks.
 118  The fundamental idea is that biotools work on a data stream that will only
 119  terminate at the end of an analysis and that this data stream can be passed
 120  through several different biotools, each performing one specific task.
 121  The advantage of this approach is that a user can perform simple and complex
 122  tasks without having to write advanced code.
 123  Moreover, since the data format used to pass data between biotools is text
 124  based, biotools can be written by different developers in their favorite
 125  programming language --- and still the biotools will be able to work together.
 126 \end_layout
 127
 128 \begin_layout Standard
 129 In the most simple form bioools can be piped together on the command line
 130  like this (using the pipe character '|'):
 131 \end_layout
 132
 133 \begin_layout LyX-Code
 134 read_data | calculate_something | write_result
 135 \end_layout
 136
 137 \begin_layout Standard
 138 However, a more comprehensive analysis could be composed:
 139 \end_layout
 140
 141 \begin_layout LyX-Code
 142 read_data | select_entries | convert_entries | search_database
 143 \end_layout
 144
 145 \begin_layout LyX-Code
 146 evaluate_results | plot_diagram | plot_another_diagram |
 147 \end_layout
 148
 149 \begin_layout LyX-Code
 150 load_to_database
 151 \end_layout
 152
 153 \begin_layout Standard
 154 The data stream that is piped through the biotools consists of records of
 155  key/value pairs in the same way a hash does in order to keep as simple
 156  a structure as possible.
 157  An example record can be seen below:
 158 \end_layout
 159
 160 \begin_layout LyX-Code
 161
 162 \end_layout
 163
 164 \begin_layout LyX-Code
 165
 166 \size scriptsize
 167 REC_TYPE: PATSCAN
 168 \end_layout
 169
 170 \begin_layout LyX-Code
 171
 172 \size scriptsize
 173 MATCH: AGATCAAGTG
 174 \end_layout
 175
 176 \begin_layout LyX-Code
 177
 178 \size scriptsize
 179 S_BEG: 7
 180 \end_layout
 181
 182 \begin_layout LyX-Code
 183
 184 \size scriptsize
 185 S_END: 16
 186 \end_layout
 187
 188 \begin_layout LyX-Code
 189
 190 \size scriptsize
 191 ALIGN_LEN: 9
 192 \end_layout
 193
 194 \begin_layout LyX-Code
 195
 196 \size scriptsize
 197 S_ID: piR-t6
 198 \end_layout
 199
 200 \begin_layout LyX-Code
 201
 202 \size scriptsize
 203 STRAND: +
 204 \end_layout
 205
 206 \begin_layout LyX-Code
 207
 208 \size scriptsize
 209 PATTERN: AGATCAAGTG
 210 \end_layout
 211
 212 \begin_layout LyX-Code
 213
 214 \size scriptsize
 215 ---
 216 \end_layout
 217
 218 \begin_layout Standard
 219 The '
 220 \begin_inset ERT
 221 status collapsed
 222
 223 \begin_layout Standard
 224
 225 -
 226 \backslash
 227 /-
 228 \backslash
 229 /-
 230 \end_layout
 231
 232 \end_inset
 233
 234 ' denotes the delimiter of the records, and each key is a word followed
 235  by a ':' and a white-space and then the value.
 236  By convention the biotools only uses upper case keys (a list of used keys
 237  can be seen in Appendix\InsetSpace ~
 238
 239 \begin_inset LatexCommand ref
 240 reference "sec:Keys"
 241
 242 \end_inset
 243
 244 ).
 245  Since the records basically are hash structures this mean that the order
 246  of the keys in the stream is unordered, and in the above example it is
 247  pure coincidence that HIT_BEG is displayed before HIT_END, however, when
 248  the order of the keys is importent, the biotools will automagically see
 249  to that.
 250 \end_layout
 251
 252 \begin_layout Standard
 253 All of the biotools are able to read and write a data stream to and from
 254  file as long as the records are in the biotools format.
 255  This means that if you are undertaking a lengthy analysis where one of
 256  the steps is time consuming, you may save the stream after this step, and
 257  subsequently start one or more analysis from that last step
 258 \begin_inset Foot
 259 status collapsed
 260
 261 \begin_layout Standard
 262 It is a goal that the biotools at some point will be able to dump the data
 263  stream to file in case one of the tools fail critically.
 264 \end_layout
 265
 266 \end_inset
 267
 268 .
 269  If you are running a lengthy analysis it is highly recommended that you
 270  create a small test sample of the data and run that through the pipeline
 271  --- and once you are satisfied with the result proceed with the full data
 272  set (see\InsetSpace ~
 273
 274 \begin_inset LatexCommand ref
 275 reference "sub:How-to-select-a-few-records"
 276
 277 \end_inset
 278
 279 ).
 280 \end_layout
 281
 282 \begin_layout Standard
 283 All of the biotools can be supplied with long arguments prefixed with
 284 \begin_inset ERT
 285 status collapsed
 286
 287 \begin_layout Standard
 288
 289 -
 290 \backslash
 291 /-
 292 \end_layout
 293
 294 \end_inset
 295
 296  switches or single character arguments prefixed with - switches that can
 297  be grouped together (e.g.
 298  -xok).
 299  In this cookbook only the long switches are used to emphasize what these
 300  switches do.
 301 \end_layout
 302
 303 \begin_layout Section
 304 Setup
 305 \end_layout
 306
 307 \begin_layout Standard
 308 In order to get the biotools to work, you need to add environment settings
 309  to include the code binaries, scripts, and modules that constitute the
 310  biotools package.
 311  Assuming that you are using bash, add the following to your '~/.bashrc'
 312  file using your favorite editor.
 313  After the changes has been saved you need to either run 'source ~/.bashrc'
 314  or relogin.
 315 \end_layout
 316
 317 \begin_layout LyX-Code
 318
 319 \size scriptsize
 320 if [ -f "/home/m.hansen/maasha/conf/bashrc" ]; then
 321 \end_layout
 322
 323 \begin_layout LyX-Code
 324
 325 \size scriptsize
 326     source "/home/m.hansen/maasha/conf/bashrc"
 327 \end_layout
 328
 329 \begin_layout LyX-Code
 330
 331 \size scriptsize
 332 fi
 333 \end_layout
 334
 335 \begin_layout Section
 336 Getting Started
 337 \end_layout
 338
 339 \begin_layout Standard
 340 The biotool
 341 \series bold
 342 list_biotools
 343 \series default
 344  lists all the biotools along with a description:
 345 \end_layout
 346
 347 \begin_layout LyX-Code
 348
 349 \size scriptsize
 350 list_biotools
 351 \end_layout
 352
 353 \begin_layout LyX-Code
 354
 355 \size scriptsize
 356 align_seq             Align sequences in stream using Muscle.
 357 \end_layout
 358
 359 \begin_layout LyX-Code
 360
 361 \size scriptsize
 362 analyze_seq           Analysis the residue composition of each sequence
 363  in stream.
 364 \end_layout
 365
 366 \begin_layout LyX-Code
 367
 368 \size scriptsize
 369 analyze_vals          Determine type, count, min, max, sum and mean for
 370  values in stream.
 371 \end_layout
 372
 373 \begin_layout LyX-Code
 374
 375 \size scriptsize
 376 blast_seq             BLAST sequences in stream against a specified database.
 377 \end_layout
 378
 379 \begin_layout LyX-Code
 380
 381 \size scriptsize
 382 blat_seq              BLAT sequences in stream against a specified genome.
 383 \end_layout
 384
 385 \begin_layout LyX-Code
 386
 387 \size scriptsize
 388 complement_seq        Complement sequences in stream.
 389 \end_layout
 390
 391 \begin_layout LyX-Code
 392
 393 \size scriptsize
 394 count_records         Count the number of records in stream.
 395 \end_layout
 396
 397 \begin_layout LyX-Code
 398
 399 \size scriptsize
 400 count_seq             Count sequences in stream.
 401 \end_layout
 402
 403 \begin_layout LyX-Code
 404
 405 \size scriptsize
 406 count_vals            Count the number of times values of given keys exists
 407  in stream.
 408 \end_layout
 409
 410 \begin_layout LyX-Code
 411
 412 \size scriptsize
 413 create_blast_db       Create a BLAST database from sequences in stream for
 414  use with BLAST.
 415 \end_layout
 416
 417 \begin_layout LyX-Code
 418
 419 \size scriptsize
 420 ...
 421 \end_layout
 422
 423 \begin_layout Standard
 424 To list the biotools for writing different formats, you can use unix's grep
 425  like this:
 426 \end_layout
 427
 428 \begin_layout LyX-Code
 429
 430 \size scriptsize
 431 list_biotools | grep write
 432 \end_layout
 433
 434 \begin_layout LyX-Code
 435
 436 \size scriptsize
 437 write_align           Write aligned sequences in pretty alignment format.
 438 \end_layout
 439
 440 \begin_layout LyX-Code
 441
 442 \size scriptsize
 443 write_bed             Write records from stream as BED lines.
 444 \end_layout
 445
 446 \begin_layout LyX-Code
 447
 448 \size scriptsize
 449 write_blast           Write BLAST records from stream in BLAST tabular format
 450  (-m8 and 9).
 451 \end_layout
 452
 453 \begin_layout LyX-Code
 454
 455 \size scriptsize
 456 write_fasta           Write sequences in FASTA format.
 457 \end_layout
 458
 459 \begin_layout LyX-Code
 460
 461 \size scriptsize
 462 write_psl             Write records from stream in PSL format.
 463 \end_layout
 464
 465 \begin_layout LyX-Code
 466
 467 \size scriptsize
 468 write_tab             Write records from stream as tab separated table.
 469 \end_layout
 470
 471 \begin_layout Standard
 472 In order to find out how a specific biotool works, you just type the program
 473  name without any arguments and press return and the usage of the biotool
 474  will be displayed.
 475  E.g.
 476
 477 \series bold
 478 read_fasta
 479 \series default
 480  <return>:
 481 \end_layout
 482
 483 \begin_layout Standard
 484 \begin_inset Box Frameless
 485 position "t"
 486 hor_pos "c"
 487 has_inner_box 1
 488 inner_pos "t"
 489 use_parbox 0
 490 width "100col%"
 491 special "none"
 492 height "1in"
 493 height_special "totalheight"
 494 status open
 495
 496 \begin_layout LyX-Code
 497
 498 \size scriptsize
 499 Program name:   read_fasta
 500 \end_layout
 501
 502 \begin_layout LyX-Code
 503
 504 \size scriptsize
 505 Author:         Martin Asser Hansen - Copyright (C) - All rights reserved
 506 \end_layout
 507
 508 \begin_layout LyX-Code
 509
 510 \size scriptsize
 511 Contact:        mail@maasha.dk
 512 \end_layout
 513
 514 \begin_layout LyX-Code
 515
 516 \size scriptsize
 517 Date:           August 2007
 518 \end_layout
 519
 520 \begin_layout LyX-Code
 521
 522 \size scriptsize
 523 License:        GNU General Public License version 2 (http://www.gnu.org/copyleft/
 524 gpl.html)
 525 \end_layout
 526
 527 \begin_layout LyX-Code
 528
 529 \size scriptsize
 530 Description:    Read FASTA entries.
 531 \end_layout
 532
 533 \begin_layout LyX-Code
 534
 535 \size scriptsize
 536 Usage:          read_fasta [options] -i <FASTA file(s)>
 537 \end_layout
 538
 539 \begin_layout LyX-Code
 540
 541 \size scriptsize
 542 Options:
 543 \end_layout
 544
 545 \begin_layout LyX-Code
 546
 547 \size scriptsize
 548    [-i <file(s)> | --data_in=<file(s)>]  -  Comma separated list of files
 549  to read.
 550 \end_layout
 551
 552 \begin_layout LyX-Code
 553
 554 \size scriptsize
 555    [-n <int>     | --num=<int>]          -  Limit number of records to read.
 556 \end_layout
 557
 558 \begin_layout LyX-Code
 559
 560 \size scriptsize
 561    [-I <file>    | --stream_in=<file>]   -  Read input stream from file
 562   -  Default=STDIN
 563 \end_layout
 564
 565 \begin_layout LyX-Code
 566
 567 \size scriptsize
 568    [-O <file>    | --stream_out=<file>]  -  Write output stream to file
 569   -  Default=STDOUT
 570 \end_layout
 571
 572 \begin_layout LyX-Code
 573
 574 \end_layout
 575
 576 \begin_layout LyX-Code
 577
 578 \size scriptsize
 579 Examples:
 580 \end_layout
 581
 582 \begin_layout LyX-Code
 583
 584 \size scriptsize
 585    read_fasta -i test.fna             -  Read FASTA entries from file.
 586 \end_layout
 587
 588 \begin_layout LyX-Code
 589
 590 \size scriptsize
 591    read_fasta -i test1.fna,test2,fna  -  Read FASTA entries from files.
 592 \end_layout
 593
 594 \begin_layout LyX-Code
 595
 596 \size scriptsize
 597    read_fasta -i '*.fna'              -  Read FASTA entries from files.
 598 \end_layout
 599
 600 \begin_layout LyX-Code
 601
 602 \size scriptsize
 603    read_fasta -i test.fna -n 10       -  Read first 10 FASTA entries from
 604  file.
 605 \end_layout
 606
 607 \end_inset
 608
 609
 610 \end_layout
 611
 612 \begin_layout Section
 613 The Data Stream
 614 \end_layout
 615
 616 \begin_layout Subsection
 617 How to read the data stream from file?
 618 \begin_inset LatexCommand label
 619 name "sub:How-to-read-stream"
 620
 621 \end_inset
 622
 623
 624 \end_layout
 625
 626 \begin_layout Standard
 627 You want to read a data stream that you previously have saved to file in
 628  biotools format.
 629  This can be done implicetly or explicitly.
 630  The implicit way uses the 'stdout' stream of the Unix terminal:
 631 \end_layout
 632
 633 \begin_layout LyX-Code
 634 cat | <biotool>
 635 \end_layout
 636
 637 \begin_layout Standard
 638 cat is the Unix command that reads a file and output the result to 'stdout'
 639  --- which in this case is piped to any biotool represented by the <biotool>.
 640  It is also possible to read the data stream using '<' to direct the 'stdout'
 641  stream into the biotool like this:
 642 \end_layout
 643
 644 \begin_layout LyX-Code
 645 <biotool> < <file>
 646 \end_layout
 647
 648 \begin_layout Standard
 649 However, that will not work if you pipe more biotools together.
 650  Then it is much safer to read the stream from a file explicitly like this:
 651 \end_layout
 652
 653 \begin_layout LyX-Code
 654 <biotool> --stream_in=<file>
 655 \end_layout
 656
 657 \begin_layout Standard
 658 Here the filename <file> is explicetly given to the biotool <biotool> with
 659  the switch
 660 \begin_inset ERT
 661 status collapsed
 662
 663 \begin_layout Standard
 664
 665 -
 666 \backslash
 667 /-
 668 \end_layout
 669
 670 \end_inset
 671
 672 stream_in.
 673  This switch works with all biotools.
 674  It is also possible to read in data from multiple sources by repeating
 675  the explicit read step:
 676 \end_layout
 677
 678 \begin_layout LyX-Code
 679 <biotool> --stream_in=<file1> | <biotool> --stream_in=<file2>
 680 \end_layout
 681
 682 \begin_layout Subsection
 683 How to write the data stream to file?
 684 \begin_inset LatexCommand label
 685 name "sub:How-to-write-stream"
 686
 687 \end_inset
 688
 689
 690 \end_layout
 691
 692 \begin_layout Standard
 693 In order to save the output stream from a biotool to file, so you can read
 694  in the stream again at a later time, you can do one of two things:
 695 \end_layout
 696
 697 \begin_layout LyX-Code
 698 <biotool> > <file>
 699 \end_layout
 700
 701 \begin_layout Standard
 702 All, the biotools write the data stream to 'stdout' by default which can
 703  be written to a file by redirecting 'stdout' to file using '>' , however,
 704  if one of the biotools for writing other formats is used then the both
 705  the biotools records as well as the result output will go to 'stdout' in
 706  a mixture causing havock! To avoid this you must use the switch
 707 \begin_inset ERT
 708 status collapsed
 709
 710 \begin_layout Standard
 711
 712 -
 713 \backslash
 714 /-
 715 \end_layout
 716
 717 \end_inset
 718
 719 stream_out that explictly tells the biotool to write the output stream to
 720  file:
 721 \end_layout
 722
 723 \begin_layout LyX-Code
 724 <biotool> --stream_out=<file>
 725 \end_layout
 726
 727 \begin_layout Standard
 728 The
 729 \begin_inset ERT
 730 status collapsed
 731
 732 \begin_layout Standard
 733
 734 -
 735 \backslash
 736 /-
 737 \end_layout
 738
 739 \end_inset
 740
 741 stream_out switch works with all biotools.
 742 \end_layout
 743
 744 \begin_layout Subsection
 745 How to terminate the data stream?
 746 \end_layout
 747
 748 \begin_layout Standard
 749 The data stream is never stops unless the user want to save the stream or
 750  by supplying the
 751 \begin_inset ERT
 752 status collapsed
 753
 754 \begin_layout Standard
 755
 756 -
 757 \backslash
 758 /-
 759 \end_layout
 760
 761 \end_inset
 762
 763 no_stream switch that will terminate the stream:
 764 \end_layout
 765
 766 \begin_layout LyX-Code
 767 <biotool> --no_stream
 768 \end_layout
 769
 770 \begin_layout Standard
 771 The
 772 \begin_inset ERT
 773 status collapsed
 774
 775 \begin_layout Standard
 776
 777 -
 778 \backslash
 779 /-
 780 \end_layout
 781
 782 \end_inset
 783
 784 no_stream switch only works with those biotools where it makes sense that
 785  the user might want to terminale the data stream,
 786 \emph on
 787 i.e
 788 \emph default
 789 .
 790  after an analysis step where the user wants to output the result, but not
 791  the data stream.
 792 \end_layout
 793
 794 \begin_layout Subsection
 795 How to write my results to file?
 796 \begin_inset LatexCommand label
 797 name "sub:How-to-write-result"
 798
 799 \end_inset
 800
 801
 802 \end_layout
 803
 804 \begin_layout Standard
 805 Saving the result of an analysis to file can be done implicitly or explicitly.
 806  The implicit way:
 807 \end_layout
 808
 809 \begin_layout LyX-Code
 810 <biotool> --no_stream > <file>
 811 \end_layout
 812
 813 \begin_layout Standard
 814 If you use '>' to redirect 'stdout' to file then it is important to use
 815  the
 816 \begin_inset ERT
 817 status collapsed
 818
 819 \begin_layout Standard
 820
 821 -
 822 \backslash
 823 /-
 824 \end_layout
 825
 826 \end_inset
 827
 828 no_stream switch to avoid writing a mix of biotools records and result to
 829  the same file causing havock.
 830  The safe way is to use the
 831 \begin_inset ERT
 832 status open
 833
 834 \begin_layout Standard
 835
 836 -
 837 \backslash
 838 /-
 839 \end_layout
 840
 841 \end_inset
 842
 843 result_out switch which explicetly tells the biotool to write the result
 844  to a given file:
 845 \end_layout
 846
 847 \begin_layout LyX-Code
 848 <biotool> --result_out=<file>
 849 \end_layout
 850
 851 \begin_layout Standard
 852 Using the above method will not terminate the stream, so it is possible
 853  to pipe that into another biotool generating different results:
 854 \end_layout
 855
 856 \begin_layout LyX-Code
 857 <biotool1> --result_out=<file1> | <biotool2> --result_out=<file2>
 858 \end_layout
 859
 860 \begin_layout Standard
 861 And still the data stream will continue unless terminated with
 862 \begin_inset ERT
 863 status collapsed
 864
 865 \begin_layout Standard
 866
 867 -
 868 \backslash
 869 /-
 870 \end_layout
 871
 872 \end_inset
 873
 874 no_stream:
 875 \end_layout
 876
 877 \begin_layout LyX-Code
 878 <biotool> --result_out=<file> --no_stream
 879 \end_layout
 880
 881 \begin_layout Standard
 882 Or written to file using implicitly or explicity
 883 \begin_inset LatexCommand eqref
 884 reference "sub:How-to-write-result"
 885
 886 \end_inset
 887
 888 .
 889  The explicit way:
 890 \end_layout
 891
 892 \begin_layout LyX-Code
 893 <biotool> --result_out=<file1> --stream_out=<file2>
 894 \end_layout
 895
 896 \begin_layout Subsection
 897 How to read data from multiple sources?
 898 \end_layout
 899
 900 \begin_layout Standard
 901 To read multiple data sources, with the same type or different type of data
 902  do:
 903 \end_layout
 904
 905 \begin_layout LyX-Code
 906 <biotool1> --data_in=<file1> | <biotool2> --data_in=<file2>
 907 \end_layout
 908
 909 \begin_layout Standard
 910 where type is the data type a specific biotool reads.
 911 \end_layout
 912
 913 \begin_layout Section
 914 Reading input
 915 \end_layout
 916
 917 \begin_layout Subsection
 918 How to read biotools input?
 919 \end_layout
 920
 921 \begin_layout Standard
 922 See
 923 \begin_inset LatexCommand eqref
 924 reference "sub:How-to-read-stream"
 925
 926 \end_inset
 927
 928 .
 929 \end_layout
 930
 931 \begin_layout Subsection
 932 How to read in data?
 933 \end_layout
 934
 935 \begin_layout Standard
 936 Data in different formats can be read with the appropriate biotool for that
 937  format.
 938  The biotools are typicalled named 'read_<data type>' such as
 939 \series bold
 940 read_fasta
 941 \series default
 942 ,
 943 \series bold
 944 read_bed
 945 \series default
 946 ,
 947 \series bold
 948 read_tab
 949 \series default
 950 , etc., and all behave in a similar manner.
 951  Data can be read by supplying the
 952 \begin_inset ERT
 953 status collapsed
 954
 955 \begin_layout Standard
 956
 957 -
 958 \backslash
 959 /-
 960 \end_layout
 961
 962 \end_inset
 963
 964 data_in switch and a file name to the file containing the data:
 965 \end_layout
 966
 967 \begin_layout LyX-Code
 968 <biotool> --data_in=<file>
 969 \end_layout
 970
 971 \begin_layout Standard
 972 It is also possible to read in a saved biotools stream (see
 973 \begin_inset LatexCommand ref
 974 reference "sub:How-to-read-stream"
 975
 976 \end_inset
 977
 978 ) as well as reading data in one go:
 979 \end_layout
 980
 981 \begin_layout LyX-Code
 982 <biotool> --stream_in=<file1> --data_in=<file2>
 983 \end_layout
 984
 985 \begin_layout Standard
 986 If you want to read data from several files you can do this:
 987 \end_layout
 988
 989 \begin_layout LyX-Code
 990 <biotool> --data_in=<file1> | <biotool> --data_in=<file2>
 991 \end_layout
 992
 993 \begin_layout Standard
 994 If you have several data files you can read in all explicitly with a comma
 995  separated list:
 996 \end_layout
 997
 998 \begin_layout LyX-Code
 999 <biotool> --data_in=file1,file2,file3
1000 \end_layout
1001
1002 \begin_layout Standard
1003 And it is also possible to use file globbing
1004 \begin_inset Foot
1005 status open
1006
1007 \begin_layout Standard
1008 using the short option will only work if you quote the argument -i '*.fna'
1009 \end_layout
1010
1011 \end_inset
1012
1013 :
1014 \end_layout
1015
1016 \begin_layout LyX-Code
1017 <biotool> --data_in=*.fna
1018 \end_layout
1019
1020 \begin_layout Standard
1021 Or in a combination:
1022 \end_layout
1023
1024 \begin_layout LyX-Code
1025 <biotool> --data_in=file1,/dir/*.fna
1026 \end_layout
1027
1028 \begin_layout Standard
1029 Finally, it is possible to read in data in different formats using the appropria
1030 te biotool for each format:
1031 \end_layout
1032
1033 \begin_layout LyX-Code
1034 <biotool1> --data_in=<file1> | <biotool2> --data_in=<file2> ...
1035 \end_layout
1036
1037 \begin_layout Subsection
1038 How to read FASTA input?
1039 \end_layout
1040
1041 \begin_layout Standard
1042 Sequences in FASTA format can be read explicitly using
1043 \series bold
1044 read_fasta
1045 \series default
1046 :
1047 \end_layout
1048
1049 \begin_layout LyX-Code
1050 read_fasta --data_in=<file>
1051 \end_layout
1052
1053 \begin_layout Subsection
1054 How to read alignment input?
1055 \end_layout
1056
1057 \begin_layout Standard
1058 If your alignment if FASTA formatted then you can
1059 \series bold
1060 read_align
1061 \series default
1062 .
1063  It is also possible to use
1064 \series bold
1065 read_fasta
1066 \series default
1067  since the data is FASTA formatted, however, with
1068 \series bold
1069 read_fasta
1070 \series default
1071  the key ALIGN will be omitted.
1072  The ALIGN key is used to determine which sequences belong to what alignment
1073  which is required for
1074 \series bold
1075 write_align
1076 \series default
1077 .
1078 \end_layout
1079
1080 \begin_layout LyX-Code
1081 read_align --data_in=<file>
1082 \end_layout
1083
1084 \begin_layout Subsection
1085 How to read tabular input?
1086 \begin_inset LatexCommand label
1087 name "sub:How-to-read-table"
1088
1089 \end_inset
1090
1091
1092 \end_layout
1093
1094 \begin_layout Standard
1095 Tabular input can be read with
1096 \series bold
1097 read_tab
1098 \series default
1099  which will read in all rows and chosen columns (separated by a given delimter)
1100  from a table in text format.
1101 \end_layout
1102
1103 \begin_layout Standard
1104 The table below:
1105 \end_layout
1106
1107 \begin_layout Standard
1108 \noindent
1109 \align center
1110 \begin_inset Tabular
1111 <lyxtabular version="3" rows="4" columns="3">
1112 <features>
1113 <column alignment="left" valignment="top" width="0">
1114 <column alignment="left" valignment="top" width="0">
1115 <column alignment="left" valignment="top" width="0">
1116 <row>
1117 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1118 \begin_inset Text
1119
1120 \begin_layout Standard
1121 Human
1122 \end_layout
1123
1124 \end_inset
1125 </cell>
1126 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1127 \begin_inset Text
1128
1129 \begin_layout Standard
1130 ATACGTCAG
1131 \end_layout
1132
1133 \end_inset
1134 </cell>
1135 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
1136 \begin_inset Text
1137
1138 \begin_layout Standard
1139 23524
1140 \end_layout
1141
1142 \end_inset
1143 </cell>
1144 </row>
1145 <row>
1146 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1147 \begin_inset Text
1148
1149 \begin_layout Standard
1150 Dog
1151 \end_layout
1152
1153 \end_inset
1154 </cell>
1155 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1156 \begin_inset Text
1157
1158 \begin_layout Standard
1159 AGCATGAC
1160 \end_layout
1161
1162 \end_inset
1163 </cell>
1164 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
1165 \begin_inset Text
1166
1167 \begin_layout Standard
1168 2442
1169 \end_layout
1170
1171 \end_inset
1172 </cell>
1173 </row>
1174 <row>
1175 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1176 \begin_inset Text
1177
1178 \begin_layout Standard
1179 Mouse
1180 \end_layout
1181
1182 \end_inset
1183 </cell>
1184 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1185 \begin_inset Text
1186
1187 \begin_layout Standard
1188 GACTG
1189 \end_layout
1190
1191 \end_inset
1192 </cell>
1193 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
1194 \begin_inset Text
1195
1196 \begin_layout Standard
1197 234
1198 \end_layout
1199
1200 \end_inset
1201 </cell>
1202 </row>
1203 <row>
1204 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1205 \begin_inset Text
1206
1207 \begin_layout Standard
1208 Cat
1209 \end_layout
1210
1211 \end_inset
1212 </cell>
1213 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1214 \begin_inset Text
1215
1216 \begin_layout Standard
1217 AAATGCA
1218 \end_layout
1219
1220 \end_inset
1221 </cell>
1222 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
1223 \begin_inset Text
1224
1225 \begin_layout Standard
1226 2342
1227 \end_layout
1228
1229 \end_inset
1230 </cell>
1231 </row>
1232 </lyxtabular>
1233
1234 \end_inset
1235
1236
1237 \end_layout
1238
1239 \begin_layout Standard
1240 Can be read using the command:
1241 \end_layout
1242
1243 \begin_layout LyX-Code
1244 read_tab --data_in=<file>
1245 \end_layout
1246
1247 \begin_layout Standard
1248 Which will result in four records, one for each row, where the keys V0,
1249  V1, V2 are the default keys for the organism, sequence, and count, respectively.
1250  It is possible to select a subset of colums to read by using the
1251 \begin_inset ERT
1252 status collapsed
1253
1254 \begin_layout Standard
1255
1256 -
1257 \backslash
1258 /-
1259 \end_layout
1260
1261 \end_inset
1262
1263 cols switch which takes a comma separated list of columns numbers (first
1264  column is designated 0) as argument.
1265  So to read in only the sequence and the count so that the count comes before
1266  the sequence do:
1267 \end_layout
1268
1269 \begin_layout LyX-Code
1270 read_tab --data_in=<file> --cols=2,1
1271 \end_layout
1272
1273 \begin_layout Standard
1274 It is also possible to name the columns with the
1275 \begin_inset ERT
1276 status collapsed
1277
1278 \begin_layout Standard
1279
1280 -
1281 \backslash
1282 /-
1283 \end_layout
1284
1285 \end_inset
1286
1287 keys switch:
1288 \end_layout
1289
1290 \begin_layout LyX-Code
1291 read_tab --data_in=<file> --cols=2,1 --keys=COUNT,SEQ
1292 \end_layout
1293
1294 \begin_layout Subsection
1295 How to read BED input?
1296 \end_layout
1297
1298 \begin_layout Standard
1299 The BED (Browser Extensible Data
1300 \begin_inset Foot
1301 status open
1302
1303 \begin_layout Standard
1304 \begin_inset LatexCommand url
1305 target "http://genome.ucsc.edu/FAQ/FAQformat"
1306
1307 \end_inset
1308
1309
1310 \end_layout
1311
1312 \end_inset
1313
1314 ) format is a tabular format for data pertaining to one of the Eukaryotic
1315  genomes in the UCSC genome brower
1316 \begin_inset Foot
1317 status collapsed
1318
1319 \begin_layout Standard
1320 \begin_inset LatexCommand url
1321 target "http://genome.ucsc.edu/"
1322
1323 \end_inset
1324
1325
1326 \end_layout
1327
1328 \end_inset
1329
1330 .
1331  The BED format consists of up to 12 columns, where the first three are
1332  mandatory CHR, CHR_BEG, and CHR_END.
1333  The mandatory columns and any of the optional columns can all be read in
1334  easily with the
1335 \series bold
1336 read_bed
1337 \series default
1338  biotool.
1339 \end_layout
1340
1341 \begin_layout LyX-Code
1342 read_bed --data_in=<file>
1343 \end_layout
1344
1345 \begin_layout Standard
1346 It is also possible to read the BED file with
1347 \series bold
1348 read_tab
1349 \series default
1350  (see\InsetSpace ~
1351
1352 \begin_inset LatexCommand ref
1353 reference "sub:How-to-read-table"
1354
1355 \end_inset
1356
1357 ), however, that will be more cumbersome because you need to specify the
1358  keys:
1359 \end_layout
1360
1361 \begin_layout LyX-Code
1362 read_tab --data_in=<file> --keys=CHR,CHR_BEG,CHR_END ...
1363 \end_layout
1364
1365 \begin_layout Subsection
1366 How to read PSL input?
1367 \end_layout
1368
1369 \begin_layout Standard
1370 The PSL format is the output from BLAT and contains 21 mandatory fields
1371  that can be read with
1372 \series bold
1373 read_psl
1374 \series default
1375 :
1376 \end_layout
1377
1378 \begin_layout LyX-Code
1379 read_psl --data_in=<file>
1380 \end_layout
1381
1382 \begin_layout Section
1383 Writing output
1384 \end_layout
1385
1386 \begin_layout Standard
1387 All result output can be written explicitly to file using the
1388 \begin_inset ERT
1389 status collapsed
1390
1391 \begin_layout Standard
1392
1393 -
1394 \backslash
1395 /-
1396 \end_layout
1397
1398 \end_inset
1399
1400 result_out switch which all result generating biotools have.
1401  It is also possible to write the result to file implicetly by directing
1402  'stdout' to file using '>', however, that requires the
1403 \begin_inset ERT
1404 status collapsed
1405
1406 \begin_layout Standard
1407
1408 -
1409 \backslash
1410 /-
1411 \end_layout
1412
1413 \end_inset
1414
1415 no_stream swich to prevent a mixture of data stream and results in the file.
1416  The explicit (and safe) way:
1417 \end_layout
1418
1419 \begin_layout LyX-Code
1420 ...
1421  | <biotool> --result_out=<file>
1422 \end_layout
1423
1424 \begin_layout Standard
1425 The implicit way:
1426 \end_layout
1427
1428 \begin_layout LyX-Code
1429 ...
1430  | <biotool> --no_stream > <file>
1431 \end_layout
1432
1433 \begin_layout Subsection
1434 How to write biotools output?
1435 \end_layout
1436
1437 \begin_layout Standard
1438 See
1439 \begin_inset LatexCommand eqref
1440 reference "sub:How-to-write-stream"
1441
1442 \end_inset
1443
1444 .
1445 \end_layout
1446
1447 \begin_layout Subsection
1448 How to write FASTA output?
1449 \begin_inset LatexCommand label
1450 name "sub:How-to-write-fasta"
1451
1452 \end_inset
1453
1454
1455 \end_layout
1456
1457 \begin_layout Standard
1458 FASTA output can be written with
1459 \series bold
1460 write_fasta
1461 \series default
1462 .
1463 \end_layout
1464
1465 \begin_layout LyX-Code
1466 ...
1467  | write_fasta --result_out=<file>
1468 \end_layout
1469
1470 \begin_layout Standard
1471 It is also possible to wrap the sequences to a given width using the
1472 \begin_inset ERT
1473 status collapsed
1474
1475 \begin_layout Standard
1476
1477 -
1478 \backslash
1479 /-
1480 \end_layout
1481
1482 \end_inset
1483
1484 wrap switch allthough wrapping of sequence is generally an evil thing:
1485 \end_layout
1486
1487 \begin_layout LyX-Code
1488 ...
1489  | write_fasta --no_stream --wrap=80
1490 \end_layout
1491
1492 \begin_layout Subsection
1493 How to write alignment output?
1494 \begin_inset LatexCommand label
1495 name "sub:How-to-write-alignment"
1496
1497 \end_inset
1498
1499
1500 \end_layout
1501
1502 \begin_layout Standard
1503 Pretty alignments with ruler
1504 \begin_inset Foot
1505 status collapsed
1506
1507 \begin_layout Standard
1508 '.' for every 10 residues, ':' for every 50, and '|' for every 100
1509 \end_layout
1510
1511 \end_inset
1512
1513  and consensus sequence
1514 \begin_inset Note Note
1515 status collapsed
1516
1517 \begin_layout Standard
1518 which reminds me to make that an option.
1519 \end_layout
1520
1521 \end_inset
1522
1523  can be created with
1524 \series bold
1525 write_align
1526 \series default
1527 , what also have the optional
1528 \begin_inset ERT
1529 status collapsed
1530
1531 \begin_layout Standard
1532
1533 -
1534 \backslash
1535 /-
1536 \end_layout
1537
1538 \end_inset
1539
1540 wrap switch to break the alignment into blocks of a given width:
1541 \end_layout
1542
1543 \begin_layout LyX-Code
1544 ...
1545  | write_align --result_out=<file> --wrap=80
1546 \end_layout
1547
1548 \begin_layout Standard
1549 If the number of sequnces in the alignment is 2 then a pairwise alignment
1550  will be output otherwise a multiple alignment.
1551  And if the sequence type, determined automagically, is protein, then residues
1552  and symbols (+,\InsetSpace ~
1553 :,\InsetSpace ~
1554 .) will be used to show consensus according to the Blosum62
1555  matrix.
1556 \end_layout
1557
1558 \begin_layout Subsection
1559 How to write tabular output?
1560 \begin_inset LatexCommand label
1561 name "sub:How-to-write-tab"
1562
1563 \end_inset
1564
1565
1566 \end_layout
1567
1568 \begin_layout Standard
1569 Outputting the data stream as a table can be done with
1570 \series bold
1571 write_tab
1572 \series default
1573 , which will write generate one row per record with the values as columns.
1574  If you supply the optional
1575 \begin_inset ERT
1576 status collapsed
1577
1578 \begin_layout Standard
1579
1580 -
1581 \backslash
1582 /-
1583 \end_layout
1584
1585 \end_inset
1586
1587 comment switch, when the first row in the table will be a 'comment' line
1588  prefixed with a '#':
1589 \end_layout
1590
1591 \begin_layout LyX-Code
1592 ...
1593  | write_tab --result_out=<file> --comment
1594 \end_layout
1595
1596 \begin_layout Standard
1597 You can also change the delimiter from the default (tab) to
1598 \emph on
1599 e.g.
1600
1601 \emph default
1602  ',':
1603 \end_layout
1604
1605 \begin_layout LyX-Code
1606 ...
1607  | write_tab --result_out=<file> --delimit=','
1608 \end_layout
1609
1610 \begin_layout Standard
1611 If you want the values output in a specific order you have to supply a comma
1612  separated list using the
1613 \begin_inset ERT
1614 status collapsed
1615
1616 \begin_layout Standard
1617
1618 -
1619 \backslash
1620 /-
1621 \end_layout
1622
1623 \end_inset
1624
1625 keys switch that will print only those keys in that order:
1626 \end_layout
1627
1628 \begin_layout LyX-Code
1629 ...
1630  | write_tab --result_out=<file> --keys=SEQ_NAME,COUNT
1631 \end_layout
1632
1633 \begin_layout Standard
1634 Alternatively, if you have some keys that you don't want in the tabular
1635  output, use the
1636 \begin_inset ERT
1637 status collapsed
1638
1639 \begin_layout Standard
1640
1641 -
1642 \backslash
1643 /-
1644 \end_layout
1645
1646 \end_inset
1647
1648 no_keys switch.
1649  So to print all keys except SEQ and SEQ_TYPE do:
1650 \end_layout
1651
1652 \begin_layout LyX-Code
1653 ...
1654  | write_tab --result_out=<file> --no_keys=SEQ,SEQ_TYPE
1655 \end_layout
1656
1657 \begin_layout Standard
1658 Finally, if you have a stream containing a mix of different records types,
1659
1660 \emph on
1661 e.g.
1662
1663 \emph default
1664  records with sequences and records with matches, then you can use
1665 \series bold
1666 write_tab
1667 \series default
1668  to output all the records in tabluar format, however, the
1669 \begin_inset ERT
1670 status collapsed
1671
1672 \begin_layout Standard
1673
1674 -
1675 \backslash
1676 /-
1677 \end_layout
1678
1679 \end_inset
1680
1681 comment,
1682 \begin_inset ERT
1683 status collapsed
1684
1685 \begin_layout Standard
1686
1687 -
1688 \backslash
1689 /-
1690 \end_layout
1691
1692 \end_inset
1693
1694 keys, and
1695 \begin_inset ERT
1696 status collapsed
1697
1698 \begin_layout Standard
1699
1700 -
1701 \backslash
1702 /-
1703 \end_layout
1704
1705 \end_inset
1706
1707 no_keys switches will only respond to records of the first type encountered.
1708  The reason is that outputting mixed records is probably not what you want
1709  anyway, and you should remove all the unwanted records from the stream
1710  before outputting the table:
1711 \series bold
1712 grab
1713 \series default
1714  is your friend (see\InsetSpace ~
1715
1716 \begin_inset LatexCommand ref
1717 reference "sub:How-to-grab"
1718
1719 \end_inset
1720
1721 ).
1722 \end_layout
1723
1724 \begin_layout Subsection
1725 How to write a BED output?
1726 \begin_inset LatexCommand label
1727 name "sub:How-to-write-BED"
1728
1729 \end_inset
1730
1731
1732 \end_layout
1733
1734 \begin_layout Standard
1735 Data in BED format can be output if the records contain the mandatory keys
1736  CHR, CHR_BEG, and CHR_END using
1737 \series bold
1738 write_bed
1739 \series default
1740 .
1741  If the optional keys are also present, they will be output as well:
1742 \end_layout
1743
1744 \begin_layout LyX-Code
1745 write_bed --result_out=<file>
1746 \end_layout
1747
1748 \begin_layout Subsection
1749 How to write PSL output?
1750 \begin_inset LatexCommand label
1751 name "sub:How-to-write-PSL"
1752
1753 \end_inset
1754
1755
1756 \end_layout
1757
1758 \begin_layout Standard
1759 Data in PSL format can be output using
1760 \series bold
1761 write_psl:
1762 \end_layout
1763
1764 \begin_layout LyX-Code
1765 write_psl --result_out=<file>
1766 \end_layout
1767
1768 \begin_layout Section
1769 Manipulating Records
1770 \end_layout
1771
1772 \begin_layout Subsection
1773 How to select a few records?
1774 \begin_inset LatexCommand label
1775 name "sub:How-to-select-a-few-records"
1776
1777 \end_inset
1778
1779
1780 \end_layout
1781
1782 \begin_layout Standard
1783 To quickly get an overview of your data you can limit the data stream to
1784  show a few records.
1785  This also very useful to test the pipeline with a few records if you are
1786  setting up a complex analysis using several biotools.
1787  That way you can inspect that all goes well before analyzing and waiting
1788  for the full data set.
1789  All of the read_<type> biotools have the
1790 \begin_inset ERT
1791 status collapsed
1792
1793 \begin_layout Standard
1794
1795 -
1796 \backslash
1797 /-
1798 \end_layout
1799
1800 \end_inset
1801
1802 num switch which will take a number as argument and only that number of
1803  records will be read.
1804  So to read in the first 10 FASTA entries from a file:
1805 \end_layout
1806
1807 \begin_layout LyX-Code
1808 read_fasta --data_in=test.fna --num=10
1809 \end_layout
1810
1811 \begin_layout Standard
1812 Another way of doing this is to use
1813 \series bold
1814 head_records
1815 \series default
1816  will limit the stream to show the first 10 records (default):
1817 \end_layout
1818
1819 \begin_layout LyX-Code
1820 ...
1821  | head_records
1822 \end_layout
1823
1824 \begin_layout Standard
1825 Using
1826 \series bold
1827 head_records
1828 \series default
1829  directly after one of the read_<type> biotools will be a lot slower than
1830  using the
1831 \begin_inset ERT
1832 status collapsed
1833
1834 \begin_layout Standard
1835
1836 -
1837 \backslash
1838 /-
1839 \end_layout
1840
1841 \end_inset
1842
1843 num switch with the read_<type> biotools, however,
1844 \series bold
1845 head_records
1846 \series default
1847  can also be used to limit the output from all the other biotools.
1848  It is also possible to give
1849 \series bold
1850 head_records
1851 \series default
1852  a number of records to show using the
1853 \begin_inset ERT
1854 status collapsed
1855
1856 \begin_layout Standard
1857
1858 -
1859 \backslash
1860 /-
1861 \end_layout
1862
1863 \end_inset
1864
1865 num switch.
1866  So to display the first 100 records do:
1867 \end_layout
1868
1869 \begin_layout LyX-Code
1870 ...
1871  | head_records --num=100
1872 \end_layout
1873
1874 \begin_layout Subsection
1875 How to select random records?
1876 \begin_inset LatexCommand label
1877 name "sub:How-to-select-random-records"
1878
1879 \end_inset
1880
1881
1882 \end_layout
1883
1884 \begin_layout Standard
1885 If you want to inspect a number of random records from the stream this can
1886  be done with the
1887 \series bold
1888 random_records
1889 \series default
1890  biotool.
1891  So if you have 1 mio records in the stream and you want to select 1000
1892  random records do:
1893 \end_layout
1894
1895 \begin_layout LyX-Code
1896 ...
1897  | random_records --num=1000
1898 \end_layout
1899
1900 \begin_layout Subsection
1901 How to count all records in the data stream?
1902 \end_layout
1903
1904 \begin_layout Standard
1905 To count all the records in the data stream use
1906 \series bold
1907 count_records
1908 \series default
1909 , which adds one record (which is not included in the count) to the data
1910  stream.
1911  So to count the number of sequences in a FASTA file you can do this:
1912 \end_layout
1913
1914 \begin_layout LyX-Code
1915 cat test.fna | read_fasta | count_records --no_stream
1916 \end_layout
1917
1918 \begin_layout Standard
1919 Which will write the last record containing the count to 'stdout':
1920 \end_layout
1921
1922 \begin_layout LyX-Code
1923
1924 \size scriptsize
1925 count_records: 630
1926 \end_layout
1927
1928 \begin_layout LyX-Code
1929
1930 \size scriptsize
1931 ---
1932 \end_layout
1933
1934 \begin_layout Standard
1935 It is also possible to write the count to file using the
1936 \begin_inset ERT
1937 status collapsed
1938
1939 \begin_layout Standard
1940
1941 -
1942 \backslash
1943 /-
1944 \end_layout
1945
1946 \end_inset
1947
1948 result_out switch.
1949 \end_layout
1950
1951 \begin_layout Subsection
1952 How to get the length of record values?
1953 \begin_inset LatexCommand label
1954 name "sub:How-to-get-value_length"
1955
1956 \end_inset
1957
1958
1959 \end_layout
1960
1961 \begin_layout Standard
1962 Use the
1963 \series bold
1964 length_vals
1965 \series default
1966  biotool to get the length of each value for a comma separated list of keys:
1967 \end_layout
1968
1969 \begin_layout LyX-Code
1970 ...
1971  | length_vals --keys=HIT,PATTERN
1972 \end_layout
1973
1974 \begin_layout Subsection
1975 How to grab specific records?
1976 \begin_inset LatexCommand label
1977 name "sub:How-to-grab"
1978
1979 \end_inset
1980
1981
1982 \end_layout
1983
1984 \begin_layout Standard
1985 The biotool
1986 \series bold
1987 grab
1988 \series default
1989  is related to the Unix grep and locates records based on matching keys
1990  and/or values using either a pattern, a Perl regex, or a numerical evaluation.
1991  To easily
1992 \series bold
1993 grab
1994 \series default
1995  all records in the stream that has any mentioning of the pattern 'human'
1996  just pipe the data stream through
1997 \series bold
1998 grab
1999 \series default
2000  like this:
2001 \end_layout
2002
2003 \begin_layout LyX-Code
2004 ...
2005  | grab --pattern=human
2006 \end_layout
2007
2008 \begin_layout Standard
2009 This will search for the pattern 'human' in all keys and all values.
2010  The
2011 \begin_inset ERT
2012 status collapsed
2013
2014 \begin_layout Standard
2015
2016 -
2017 \backslash
2018 /-
2019 \end_layout
2020
2021 \end_inset
2022
2023 pattern switch takes a comma separated list of patterns, so in order to
2024  match multiple patterns do:
2025 \end_layout
2026
2027 \begin_layout LyX-Code
2028 ...
2029  | grab --pattern=human,mouse
2030 \end_layout
2031
2032 \begin_layout Standard
2033 It is also possible to use the
2034 \begin_inset ERT
2035 status collapsed
2036
2037 \begin_layout Standard
2038
2039 -
2040 \backslash
2041 /-
2042 \end_layout
2043
2044 \end_inset
2045
2046 pattern_in switch instead of
2047 \begin_inset ERT
2048 status collapsed
2049
2050 \begin_layout Standard
2051
2052 -
2053 \backslash
2054 /-
2055 \end_layout
2056
2057 \end_inset
2058
2059 pattern.
2060
2061 \begin_inset ERT
2062 status collapsed
2063
2064 \begin_layout Standard
2065
2066 -
2067 \backslash
2068 /-
2069 \end_layout
2070
2071 \end_inset
2072
2073 pattern_in is used to read a file with one pattern per line:
2074 \end_layout
2075
2076 \begin_layout LyX-Code
2077 ...
2078  | grab --pattern_in=patterns.txt
2079 \end_layout
2080
2081 \begin_layout Standard
2082 If you want the opposite result --- to find all records that does not match
2083  the patterns, add the
2084 \begin_inset ERT
2085 status collapsed
2086
2087 \begin_layout Standard
2088
2089 -
2090 \backslash
2091 /-
2092 \end_layout
2093
2094 \end_inset
2095
2096 invert switch, which not only works with the
2097 \begin_inset ERT
2098 status collapsed
2099
2100 \begin_layout Standard
2101
2102 -
2103 \backslash
2104 /-
2105 \end_layout
2106
2107 \end_inset
2108
2109 pattern switch, but also with
2110 \begin_inset ERT
2111 status collapsed
2112
2113 \begin_layout Standard
2114
2115 -
2116 \backslash
2117 /-
2118 \end_layout
2119
2120 \end_inset
2121
2122 regex and
2123 \begin_inset ERT
2124 status collapsed
2125
2126 \begin_layout Standard
2127
2128 -
2129 \backslash
2130 /-
2131 \end_layout
2132
2133 \end_inset
2134
2135 eval:
2136 \end_layout
2137
2138 \begin_layout LyX-Code
2139 ...
2140  | grab --pattern=human --invert
2141 \end_layout
2142
2143 \begin_layout Standard
2144 If you want to search the record keys only,
2145 \emph on
2146 e.g.
2147
2148 \emph default
2149  to find all records containing the key SEQ you can add the
2150 \begin_inset ERT
2151 status collapsed
2152
2153 \begin_layout Standard
2154
2155 -
2156 \backslash
2157 /-
2158 \end_layout
2159
2160 \end_inset
2161
2162 keys_only switch.
2163  This will prevent matching of SEQ in any record value, and in fact SEQ
2164  is a not uncommon peptide sequence you could get an unwanted record.
2165  Also, this will give an increase in speed since only the keys are searched:
2166 \end_layout
2167
2168 \begin_layout LyX-Code
2169 ...
2170  | grab --pattern=SEQ --keys_only
2171 \end_layout
2172
2173 \begin_layout Standard
2174 However, if you are interested in finding the peptide sequence SEQ and not
2175  the SEQ key, just add the
2176 \begin_inset ERT
2177 status collapsed
2178
2179 \begin_layout Standard
2180
2181 -
2182 \backslash
2183 /-
2184 \end_layout
2185
2186 \end_inset
2187
2188 vals_only switch instead:
2189 \end_layout
2190
2191 \begin_layout LyX-Code
2192 ...
2193  | grab --pattern=SEQ --vals_only
2194 \end_layout
2195
2196 \begin_layout Standard
2197 Also, if you want to grab for certain key/value pairs you can supply a comma
2198  separated list of keys whos values will then be searched using the
2199 \begin_inset ERT
2200 status collapsed
2201
2202 \begin_layout Standard
2203
2204 -
2205 \backslash
2206 /-
2207 \end_layout
2208
2209 \end_inset
2210
2211 keys switch.
2212  This is handy if your records contain large genomic sequences and you dont
2213  want to search the entire sequence for
2214 \emph on
2215 e.g.
2216
2217 \emph default
2218  the organism name --- it is much faster to tell
2219 \series bold
2220 grab
2221 \series default
2222  which keys to search the value for:
2223 \end_layout
2224
2225 \begin_layout LyX-Code
2226 ...
2227  | grab --pattern=human --keys=SEQ_NAME
2228 \end_layout
2229
2230 \begin_layout LyX-Code
2231
2232 \end_layout
2233
2234 \begin_layout Standard
2235 It is also possible to invoke flexible matching using regex (regular expressions
2236 ) instead of simple pattern matching.
2237  In
2238 \series bold
2239 grab
2240 \series default
2241  the regex engine is Perl based and allows use of different type of wild
2242  cards, alternatives,
2243 \emph on
2244 etc
2245 \emph default
2246
2247 \begin_inset Foot
2248 status open
2249
2250 \begin_layout Standard
2251 \begin_inset LatexCommand url
2252 target "http://perldoc.perl.org/perlreref.html"
2253
2254 \end_inset
2255
2256
2257 \end_layout
2258
2259 \end_inset
2260
2261 .
2262  If you want to
2263 \series bold
2264 grab
2265 \series default
2266  records withs the sequence ATCG or GCTA you can do this:
2267 \end_layout
2268
2269 \begin_layout LyX-Code
2270 ...
2271  | grab --regex='ATCG|GCTA'
2272 \end_layout
2273
2274 \begin_layout Standard
2275 Or if you want to find sequences beginning with ATCG:
2276 \end_layout
2277
2278 \begin_layout LyX-Code
2279 ...
2280  | grab --regex='^ATCG'
2281 \end_layout
2282
2283 \begin_layout Standard
2284 You can also use
2285 \series bold
2286 grab
2287 \series default
2288  to locate records that fulfill a numerical property using the
2289 \begin_inset ERT
2290 status collapsed
2291
2292 \begin_layout Standard
2293
2294 -
2295 \backslash
2296 /-
2297 \end_layout
2298
2299 \end_inset
2300
2301 eval switch witch takes an expression in three parts.
2302  The first part is the key that holds the value we want to evaluate, the
2303  second part holds one the six operators:
2304 \end_layout
2305
2306 \begin_layout Enumerate
2307 Greater than: >
2308 \end_layout
2309
2310 \begin_layout Enumerate
2311 Greater than or equal to: >=
2312 \end_layout
2313
2314 \begin_layout Enumerate
2315 Less than: <
2316 \end_layout
2317
2318 \begin_layout Enumerate
2319 Less than or equal to: <=
2320 \end_layout
2321
2322 \begin_layout Enumerate
2323 Equal to: =
2324 \end_layout
2325
2326 \begin_layout Enumerate
2327 Not equal to: !=
2328 \end_layout
2329
2330 \begin_layout Enumerate
2331 String wise equal to: eq
2332 \end_layout
2333
2334 \begin_layout Enumerate
2335 String wise not equal to: ne
2336 \end_layout
2337
2338 \begin_layout Standard
2339 And finally comes the number used in the evaluation.
2340  So to
2341 \series bold
2342 grab
2343 \series default
2344  all records with a sequence length greater than 30:
2345 \end_layout
2346
2347 \begin_layout LyX-Code
2348 ...
2349  length_seq | grab --eval='SEQ_LEN > 30'
2350 \end_layout
2351
2352 \begin_layout Standard
2353 If you want to locate all records containing the pattern 'human' and where
2354  the sequence length is greater that 30, you do this by running the stream
2355  through
2356 \series bold
2357 grab
2358 \series default
2359  twice:
2360 \end_layout
2361
2362 \begin_layout LyX-Code
2363 ...
2364  | grab --pattern='human' | length_seq | grab --eval='SEQ_LEN > 30'
2365 \end_layout
2366
2367 \begin_layout Standard
2368 Finally, it is possible to do fast matching of expressions from a file using
2369  the
2370 \begin_inset ERT
2371 status collapsed
2372
2373 \begin_layout Standard
2374
2375 -
2376 \backslash
2377 /-
2378 \end_layout
2379
2380 \end_inset
2381
2382 exact switch.
2383  Each of these expressions has to be matched exactly over the entrie length,
2384  which if useful if you have a file with accession numbers, that you want
2385  to locate in the stream:
2386 \end_layout
2387
2388 \begin_layout LyX-Code
2389 ...
2390  | grab --exact acc_no.txt | ...
2391 \end_layout
2392
2393 \begin_layout Standard
2394 Using
2395 \begin_inset ERT
2396 status collapsed
2397
2398 \begin_layout Standard
2399
2400 -
2401 \backslash
2402 /-
2403 \end_layout
2404
2405 \end_inset
2406
2407 exact is much faster than using
2408 \begin_inset ERT
2409 status collapsed
2410
2411 \begin_layout Standard
2412
2413 -
2414 \backslash
2415 /-
2416 \end_layout
2417
2418 \end_inset
2419
2420 pattern_in, because with
2421 \begin_inset ERT
2422 status collapsed
2423
2424 \begin_layout Standard
2425
2426 -
2427 \backslash
2428 /-
2429 \end_layout
2430
2431 \end_inset
2432
2433 exact the expression has to be complete matches, where
2434 \begin_inset ERT
2435 status collapsed
2436
2437 \begin_layout Standard
2438
2439 -
2440 \backslash
2441 /-
2442 \end_layout
2443
2444 \end_inset
2445
2446 pattern_in looks for subpatterns.
2447 \end_layout
2448
2449 \begin_layout Standard
2450 NB! To get the best speed performance, use the most restrictive
2451 \series bold
2452 grab
2453 \series default
2454  first.
2455 \end_layout
2456
2457 \begin_layout Subsection
2458 How to remove keys from records?
2459 \end_layout
2460
2461 \begin_layout Standard
2462 To remove one or more specific keys from all records in the data stream
2463  use
2464 \series bold
2465 remove_keys
2466 \series default
2467  like this:
2468 \end_layout
2469
2470 \begin_layout LyX-Code
2471 ...
2472  | remove_keys --keys=SEQ,SEQ_NAME
2473 \end_layout
2474
2475 \begin_layout Standard
2476 In the above example SEQ and SEQ_NAME will be removed from all records if
2477  they exists in these.
2478  If all keys are removed from a record, then the record will be removed.
2479 \end_layout
2480
2481 \begin_layout Subsection
2482 How to rename keys in records?
2483 \end_layout
2484
2485 \begin_layout Standard
2486 Sometimes you want to rename a record key,
2487 \emph on
2488 e.g.
2489
2490 \emph default
2491  if you have read in a two column table with sequence name and sequence
2492  in each column (see
2493 \begin_inset LatexCommand ref
2494 reference "sub:How-to-read-table"
2495
2496 \end_inset
2497
2498 ) without specifying the key names, then the sequence name will be called
2499  V0 and the sequence V1 as default in the
2500 \series bold
2501 read_tab
2502 \series default
2503  biotool.
2504  To rename the V0 and V1 keys we need to run the stream through
2505 \series bold
2506 rename_keys
2507 \series default
2508  twice (one for each key to rename):
2509 \end_layout
2510
2511 \begin_layout LyX-Code
2512 ...
2513  | rename_keys --keys=V0,SEQ_NAME | rename_keys --keys=V1,SEQ
2514 \end_layout
2515
2516 \begin_layout Standard
2517 The first instance of
2518 \series bold
2519 rename_keys
2520 \series default
2521  replaces all the V0 keys with SEQ_NAME, and the second instance of
2522 \series bold
2523 rename_keys
2524 \series default
2525  replaces all the V1 keys with SEQ.
2526
2527 \emph on
2528 Et viola
2529 \emph default
2530  the data can now be used in the biotools that requires these keys.
2531 \end_layout
2532
2533 \begin_layout Section
2534 Manipulating Sequences
2535 \end_layout
2536
2537 \begin_layout Subsection
2538 How to get sequence lengths?
2539 \end_layout
2540
2541 \begin_layout Standard
2542 The length for sequences in records can be determined with
2543 \series bold
2544 length_seq
2545 \series default
2546 , which adds the key SEQ_LEN to each record with the sequence length as
2547  the value.
2548  It also generates an extra record that is emitted last with the key TOTAL_SEQ_L
2549 EN showing the total length of all the sequences.
2550 \end_layout
2551
2552 \begin_layout LyX-Code
2553 read_fasta --data_in=<file> | length_seq
2554 \end_layout
2555
2556 \begin_layout Standard
2557 It is also possible to determine the sequence length using the generic tool
2558
2559 \series bold
2560 length_vals
2561 \series default
2562
2563 \begin_inset LatexCommand eqref
2564 reference "sub:How-to-get-value_length"
2565
2566 \end_inset
2567
2568 , which determines the length of the values for a given list of keys:
2569 \end_layout
2570
2571 \begin_layout LyX-Code
2572 read_fasta --data_in=<file> | length_vals --keys=SEQ
2573 \end_layout
2574
2575 \begin_layout Standard
2576 To obtain the total length of all sequences use
2577 \series bold
2578 sum_vals
2579 \series default
2580  like this:
2581 \end_layout
2582
2583 \begin_layout LyX-Code
2584 read_fasta --data_in=<file> | length_vals --keys=SEQ
2585 \end_layout
2586
2587 \begin_layout LyX-Code
2588 | sum_vals --keys=SEQ_LEN
2589 \end_layout
2590
2591 \begin_layout Standard
2592 The biotool
2593 \series bold
2594 analyze_seq
2595 \series default
2596  will also determine the length of each sequence (see\InsetSpace ~
2597
2598 \begin_inset LatexCommand ref
2599 reference "sub:How-to-analyze"
2600
2601 \end_inset
2602
2603 ).
2604 \end_layout
2605
2606 \begin_layout Subsection
2607 How to analyze sequence composition?
2608 \begin_inset LatexCommand label
2609 name "sub:How-to-analyze"
2610
2611 \end_inset
2612
2613
2614 \end_layout
2615
2616 \begin_layout Standard
2617 If you want to find out the sequence type, composition, length, as well
2618  as GC content, indel content and proportions of soft and hard masked sequence,
2619  then use
2620 \series bold
2621 analyze_seq
2622 \series default
2623 .
2624  This handy biotool will determine all these things per sequence from which
2625  it is easy to get an overview using the
2626 \series bold
2627 write_tab
2628 \series default
2629  biotool to output a table (see\InsetSpace ~
2630
2631 \begin_inset LatexCommand ref
2632 reference "sub:How-to-write-tab"
2633
2634 \end_inset
2635
2636 ).
2637  So in order to determine the sequence composition of a FASTA file with
2638  just one entry containing the sequence 'ATCG' we just read the data with
2639
2640 \series bold
2641 read_fasta
2642 \series default
2643  and run the output through
2644 \series bold
2645 analyze_seq
2646 \series default
2647  which will add the analysis to the record like this:
2648 \end_layout
2649
2650 \begin_layout LyX-Code
2651 read_fasta --data_in=test.fna | analyze_seq ...
2652 \end_layout
2653
2654 \begin_layout LyX-Code
2655
2656 \end_layout
2657
2658 \begin_layout LyX-Code
2659
2660 \size scriptsize
2661 RES:D: 0
2662 \end_layout
2663
2664 \begin_layout LyX-Code
2665
2666 \size scriptsize
2667 MIX_INDEX: 0.55
2668 \end_layout
2669
2670 \begin_layout LyX-Code
2671
2672 \size scriptsize
2673 RES:W: 0
2674 \end_layout
2675
2676 \begin_layout LyX-Code
2677
2678 \size scriptsize
2679 RES:G: 16
2680 \end_layout
2681
2682 \begin_layout LyX-Code
2683
2684 \size scriptsize
2685 SOFT_MASK%: 63.75
2686 \end_layout
2687
2688 \begin_layout LyX-Code
2689
2690 \size scriptsize
2691 RES:B: 0
2692 \end_layout
2693
2694 \begin_layout LyX-Code
2695
2696 \size scriptsize
2697 RES:V: 0
2698 \end_layout
2699
2700 \begin_layout LyX-Code
2701
2702 \size scriptsize
2703 HARD_MASK%: 0.00
2704 \end_layout
2705
2706 \begin_layout LyX-Code
2707
2708 \size scriptsize
2709 RES:H: 0
2710 \end_layout
2711
2712 \begin_layout LyX-Code
2713
2714 \size scriptsize
2715 RES:S: 0
2716 \end_layout
2717
2718 \begin_layout LyX-Code
2719
2720 \size scriptsize
2721 RES:N: 0
2722 \end_layout
2723
2724 \begin_layout LyX-Code
2725
2726 \size scriptsize
2727 RES:.: 0
2728 \end_layout
2729
2730 \begin_layout LyX-Code
2731
2732 \size scriptsize
2733 GC%: 35.00
2734 \end_layout
2735
2736 \begin_layout LyX-Code
2737
2738 \size scriptsize
2739 RES:A: 8
2740 \end_layout
2741
2742 \begin_layout LyX-Code
2743
2744 \size scriptsize
2745 RES:Y: 0
2746 \end_layout
2747
2748 \begin_layout LyX-Code
2749
2750 \size scriptsize
2751 RES:M: 0
2752 \end_layout
2753
2754 \begin_layout LyX-Code
2755
2756 \size scriptsize
2757 RES:T: 44
2758 \end_layout
2759
2760 \begin_layout LyX-Code
2761
2762 \size scriptsize
2763 SEQ_TYPE: DNA
2764 \end_layout
2765
2766 \begin_layout LyX-Code
2767
2768 \size scriptsize
2769 RES:K: 0
2770 \end_layout
2771
2772 \begin_layout LyX-Code
2773
2774 \size scriptsize
2775 RES:~: 0
2776 \end_layout
2777
2778 \begin_layout LyX-Code
2779
2780 \size scriptsize
2781 SEQ: TTTCAGTTTGGGACGGAGTAAGGCCTTCCtttttttttttttttttttttttttttttgagaccgagtcttgctc
2782 tgtcg
2783 \end_layout
2784
2785 \begin_layout LyX-Code
2786
2787 \size scriptsize
2788 SEQ_LEN:
2789 \end_layout
2790
2791 \begin_layout LyX-Code
2792
2793 \size scriptsize
2794 80 RES:R: 0
2795 \end_layout
2796
2797 \begin_layout LyX-Code
2798
2799 \size scriptsize
2800 RES:C: 12
2801 \end_layout
2802
2803 \begin_layout LyX-Code
2804
2805 \size scriptsize
2806 RES:-: 0
2807 \end_layout
2808
2809 \begin_layout LyX-Code
2810
2811 \size scriptsize
2812 RES:U: 0
2813 \end_layout
2814
2815 \begin_layout LyX-Code
2816
2817 \size scriptsize
2818 ---
2819 \end_layout
2820
2821 \begin_layout LyX-Code
2822
2823 \end_layout
2824
2825 \begin_layout Standard
2826 Now to make a table of how may As, Ts, Cs, and Gs you can add the following:
2827 \end_layout
2828
2829 \begin_layout LyX-Code
2830 ...
2831  | analyze_seq | write_tab --keys=RES:A,RES:T,RES:C,RES:G
2832 \end_layout
2833
2834 \begin_layout Standard
2835 Or if you want to see the proportions of hard and soft masked sequence:
2836 \end_layout
2837
2838 \begin_layout LyX-Code
2839 ...
2840  | analyse_seq | write_tab --keys=HARD_MASK%,SOFT_MASK%
2841 \end_layout
2842
2843 \begin_layout Standard
2844 If you have a stack of sequences in one file and you want to determine the
2845  mean GC content you can do it using the
2846 \series bold
2847 mean_vals
2848 \series default
2849  biotool:
2850 \end_layout
2851
2852 \begin_layout LyX-Code
2853 read_fasta --data_in=test.fna | analyze_seq | mean_vals --keys=GC%
2854 \end_layout
2855
2856 \begin_layout Standard
2857 Or if you want the total count of Ns you can use
2858 \series bold
2859 sum_vals
2860 \series default
2861  like this:
2862 \end_layout
2863
2864 \begin_layout LyX-Code
2865 read_fasta --data_in=test.fna | analyze_seq | sum_vals --keys=RES:N
2866 \end_layout
2867
2868 \begin_layout Standard
2869 The MIX_INDEX key is calculated as the count of the most common residue
2870  over the sequence length, and can be used as a cut-off for removing sequence
2871  tags consisting of mostly one nucleotide:
2872 \end_layout
2873
2874 \begin_layout LyX-Code
2875 read_fasta --data_in=test.fna | analyze_seq | grab --eval='MIX_INDEX<0.85'
2876 \end_layout
2877
2878 \begin_layout Subsection
2879 How to extract subsequences?
2880 \begin_inset LatexCommand label
2881 name "sub:How-to-extract"
2882
2883 \end_inset
2884
2885
2886 \end_layout
2887
2888 \begin_layout Standard
2889 In order to extract a subsequence from a longer sequence use the biotool
2890  extract_seq, which will replace the sequence in the record with the subsequence
2891  (this behaviour should probably be modified to be dependant of a
2892 \begin_inset ERT
2893 status collapsed
2894
2895 \begin_layout Standard
2896
2897 -
2898 \backslash
2899 /-
2900 \end_layout
2901
2902 \end_inset
2903
2904 replace or a
2905 \begin_inset ERT
2906 status collapsed
2907
2908 \begin_layout Standard
2909
2910 -
2911 \backslash
2912 /-
2913 \end_layout
2914
2915 \end_inset
2916
2917 no_replace switch
2918 \begin_inset Note Note
2919 status collapsed
2920
2921 \begin_layout Standard
2922 also in split_seq
2923 \end_layout
2924
2925 \end_inset
2926
2927 ).
2928  So to extract the first 20 residues from all sequences do (first residue
2929  is designated 1):
2930 \end_layout
2931
2932 \begin_layout LyX-Code
2933 ...
2934  | extract_seq --beg=1 --len=20
2935 \end_layout
2936
2937 \begin_layout Standard
2938 You can also specify a begin and end coordinate set:
2939 \end_layout
2940
2941 \begin_layout LyX-Code
2942 ...
2943  | extract_seq --beg=20 --end=40
2944 \end_layout
2945
2946 \begin_layout Standard
2947 If you want the subsequences from position 20 to the sequence end do:
2948 \end_layout
2949
2950 \begin_layout LyX-Code
2951 ...
2952  | extract_seq --beg=20
2953 \end_layout
2954
2955 \begin_layout Standard
2956 If you want to extract subsequences a given distance from the sequence end
2957  you can do this by reversing the sequence with the biotool
2958 \series bold
2959 reverse_seq
2960 \series default
2961
2962 \begin_inset LatexCommand eqref
2963 reference "sub:How-to-reverse-seq"
2964
2965 \end_inset
2966
2967 , followed by
2968 \series bold
2969 extract_seq
2970 \series default
2971  to get the subsequence, and then
2972 \series bold
2973 reverse_seq
2974 \series default
2975  again to get the subsequence back in the original orientation:
2976 \end_layout
2977
2978 \begin_layout LyX-Code
2979 read_fasta --data_in=test.fna | reverse_seq
2980 \end_layout
2981
2982 \begin_layout LyX-Code
2983 | extract_seq --beg=10 --len=10 | reverse_seq
2984 \end_layout
2985
2986 \begin_layout Subsection
2987 How to get genomic sequence?
2988 \begin_inset LatexCommand label
2989 name "sub:How-to-get-genomic-sequence"
2990
2991 \end_inset
2992
2993
2994 \end_layout
2995
2996 \begin_layout Standard
2997 The biotool
2998 \series bold
2999 get_genomic_seq
3000 \series default
3001  can extract subsequences for a given genome specified with the
3002 \begin_inset ERT
3003 status collapsed
3004
3005 \begin_layout Standard
3006
3007 -
3008 \backslash
3009 /-
3010 \end_layout
3011
3012 \end_inset
3013
3014 genome switch explicitly using the
3015 \begin_inset ERT
3016 status collapsed
3017
3018 \begin_layout Standard
3019
3020 -
3021 \backslash
3022 /-
3023 \end_layout
3024
3025 \end_inset
3026
3027 beg and
3028 \begin_inset ERT
3029 status collapsed
3030
3031 \begin_layout Standard
3032
3033 -
3034 \backslash
3035 /-
3036 \end_layout
3037
3038 \end_inset
3039
3040 end/
3041 \begin_inset ERT
3042 status collapsed
3043
3044 \begin_layout Standard
3045
3046 -
3047 \backslash
3048 /-
3049 \end_layout
3050
3051 \end_inset
3052
3053 len switches:
3054 \end_layout
3055
3056 \begin_layout LyX-Code
3057 get_genome_seq --genome=<genome> --beg=1 --len=100
3058 \end_layout
3059
3060 \begin_layout Standard
3061 Alternatively,
3062 \series bold
3063 get_genome_seq
3064 \series default
3065  can be used to append the corresponding sequence to BED, PSL, and BLAST
3066  records:
3067 \end_layout
3068
3069 \begin_layout LyX-Code
3070 read_bed --data_in=<BED file> | get_genome_seq --genome=<genome>
3071 \end_layout
3072
3073 \begin_layout Standard
3074 It is also possible to include flaking sequence using the
3075 \begin_inset ERT
3076 status collapsed
3077
3078 \begin_layout Standard
3079
3080 -
3081 \backslash
3082 /-
3083 \end_layout
3084
3085 \end_inset
3086
3087 flank switch.
3088  So to include 50 nucleotides upstream and 50 nucleotides downstream for
3089  each BED entry do:
3090 \end_layout
3091
3092 \begin_layout LyX-Code
3093 read_bed --data_in=<BED file> | get_genome_seq --genome=<genome> --flank=50
3094 \end_layout
3095
3096 \begin_layout Subsection
3097 How to upper-case sequences?
3098 \end_layout
3099
3100 \begin_layout Standard
3101 Sequences can be shifted from lower case to upper case using
3102 \series bold
3103 uppercase_seq
3104 \series default
3105 :
3106 \end_layout
3107
3108 \begin_layout LyX-Code
3109 ...
3110  | uppercase_seq
3111 \end_layout
3112
3113 \begin_layout Subsection
3114 How to reverse sequences?
3115 \begin_inset LatexCommand label
3116 name "sub:How-to-reverse-seq"
3117
3118 \end_inset
3119
3120
3121 \end_layout
3122
3123 \begin_layout Standard
3124 The order of residues in a sequence can be reversed using reverse_seq:
3125 \end_layout
3126
3127 \begin_layout LyX-Code
3128 ...
3129  | reverse_seq
3130 \end_layout
3131
3132 \begin_layout Standard
3133 Note that in order to reverse/complement a sequence you also need the
3134 \series bold
3135 complement_seq
3136 \series default
3137  biotool (see\InsetSpace ~
3138
3139 \begin_inset LatexCommand ref
3140 reference "sub:How-to-complement"
3141
3142 \end_inset
3143
3144 ).
3145 \end_layout
3146
3147 \begin_layout Subsection
3148 How to complement sequences?
3149 \begin_inset LatexCommand label
3150 name "sub:How-to-complement"
3151
3152 \end_inset
3153
3154
3155 \end_layout
3156
3157 \begin_layout Standard
3158 DNA and RNA sequences can be complemented with
3159 \series bold
3160 complement_seq
3161 \series default
3162 , which automagically determines the sequence type:
3163 \end_layout
3164
3165 \begin_layout LyX-Code
3166 ...
3167  | complement_seq
3168 \end_layout
3169
3170 \begin_layout Standard
3171 Note that in order to reverse/complement a sequence you also need the
3172 \series bold
3173 reverse_seq
3174 \series default
3175  biotool (see\InsetSpace ~
3176
3177 \begin_inset LatexCommand ref
3178 reference "sub:How-to-reverse-seq"
3179
3180 \end_inset
3181
3182 ).
3183 \end_layout
3184
3185 \begin_layout Subsection
3186 How to remove indels from sequnces?
3187 \end_layout
3188
3189 \begin_layout Standard
3190 Indels can be removed from sequences with the
3191 \series bold
3192 remove_indels
3193 \series default
3194  biotool.
3195  This is useful if you have aligned some sequences (see\InsetSpace ~
3196
3197 \begin_inset LatexCommand ref
3198 reference "sub:How-to-align"
3199
3200 \end_inset
3201
3202 ) and extracted (see\InsetSpace ~
3203
3204 \begin_inset LatexCommand ref
3205 reference "sub:How-to-extract"
3206
3207 \end_inset
3208
3209 ) a block of subsequences from the alignment and you want to use these sequence
3210  in a search where you need to remove the indels first.
3211  '-', '~', and '.' are considered indels:
3212 \end_layout
3213
3214 \begin_layout LyX-Code
3215 ...
3216  | remove_indels
3217 \end_layout
3218
3219 \begin_layout Subsection
3220 How to shuffle sequences?
3221 \end_layout
3222
3223 \begin_layout Standard
3224 All residues in sequences in the stream can be shuffled to random positions
3225  using the
3226 \series bold
3227 shuffle_seq
3228 \series default
3229  biotool:
3230 \end_layout
3231
3232 \begin_layout LyX-Code
3233 ...
3234  | shuffle_seq
3235 \end_layout
3236
3237 \begin_layout Subsection
3238 How to split sequences into overlapping subsequences?
3239 \end_layout
3240
3241 \begin_layout Standard
3242 Sequences can be slit into overlapping subsequences with the
3243 \series bold
3244 split_seq
3245 \series default
3246  biotool.
3247 \end_layout
3248
3249 \begin_layout LyX-Code
3250 ...
3251  | split_seq --word_size=20 --uniq
3252 \end_layout
3253
3254 \begin_layout Subsection
3255 How to determine the oligo frequency?
3256 \end_layout
3257
3258 \begin_layout Standard
3259 In order to determine if any oligo usage is over represented in one or more
3260  sequences you can determine the frequency of oligos of a given size with
3261
3262 \series bold
3263 oligo_freq
3264 \series default
3265 :
3266 \end_layout
3267
3268 \begin_layout LyX-Code
3269 ...
3270  | oligo_freq --word_size=4
3271 \end_layout
3272
3273 \begin_layout Standard
3274 And if you have more than one sequence and want to accumulate the frequences
3275  you need the
3276 \begin_inset ERT
3277 status collapsed
3278
3279 \begin_layout Standard
3280
3281 -
3282 \backslash
3283 /-
3284 \end_layout
3285
3286 \end_inset
3287
3288 all switch:
3289 \end_layout
3290
3291 \begin_layout LyX-Code
3292 ...
3293  | oligo_freq --word_size=4 --all
3294 \end_layout
3295
3296 \begin_layout Standard
3297 To get a meaningful result you need to write the resulting frequencies as
3298  a table with
3299 \series bold
3300 write_tab
3301 \series default
3302  (see\InsetSpace ~
3303
3304 \begin_inset LatexCommand ref
3305 reference "sub:How-to-write-tab"
3306
3307 \end_inset
3308
3309 ), but first it is important to
3310 \series bold
3311 grab
3312 \series default
3313  (see\InsetSpace ~
3314
3315 \begin_inset LatexCommand ref
3316 reference "sub:How-to-grab"
3317
3318 \end_inset
3319
3320 ) the records with the frequencies to avoid full length sequences in the
3321  table:
3322 \end_layout
3323
3324 \begin_layout LyX-Code
3325 ...
3326  | oligo_freq --word_size=4 --all | grab --pattern=OLIGO --keys_only
3327 \end_layout
3328
3329 \begin_layout LyX-Code
3330 | write_tab --no_stream
3331 \end_layout
3332
3333 \begin_layout Standard
3334 And the resulting frequency table can be sorted with Unix sort (man sort).
3335 \end_layout
3336
3337 \begin_layout Subsection
3338 How to search for sequences in genomes?
3339 \end_layout
3340
3341 \begin_layout Standard
3342 See the following biotool:
3343 \end_layout
3344
3345 \begin_layout Itemize
3346
3347 \series bold
3348 patscan_seq
3349 \series default
3350
3351 \begin_inset LatexCommand eqref
3352 reference "sub:How-to-use-patscan"
3353
3354 \end_inset
3355
3356
3357 \end_layout
3358
3359 \begin_layout Itemize
3360
3361 \series bold
3362 blat_seq
3363 \series default
3364
3365 \begin_inset LatexCommand eqref
3366 reference "sub:How-to-use-BLAT"
3367
3368 \end_inset
3369
3370
3371 \end_layout
3372
3373 \begin_layout Itemize
3374
3375 \series bold
3376 blast_seq
3377 \series default
3378
3379 \begin_inset LatexCommand eqref
3380 reference "sub:How-to-use-BLAST"
3381
3382 \end_inset
3383
3384
3385 \end_layout
3386
3387 \begin_layout Itemize
3388
3389 \series bold
3390 vmatch_seq
3391 \series default
3392
3393 \begin_inset LatexCommand eqref
3394 reference "sub:How-to-use-Vmatch"
3395
3396 \end_inset
3397
3398
3399 \end_layout
3400
3401 \begin_layout Subsection
3402 How to search sequences for a pattern?
3403 \begin_inset LatexCommand label
3404 name "sub:How-to-use-patscan"
3405
3406 \end_inset
3407
3408
3409 \end_layout
3410
3411 \begin_layout Standard
3412 It is possible to search sequences in the data stream for patterns using
3413  the
3414 \series bold
3415 patscan_seq
3416 \series default
3417  biotool which utilizes the powerful scan_for_matches engine.
3418  Consult the documentation for scan_for_matches in order to learn how to
3419  define patterns (the documentation is included in Appendix\InsetSpace ~
3420
3421 \begin_inset LatexCommand ref
3422 reference "sec:scan_for_matches-README"
3423
3424 \end_inset
3425
3426 ).
3427 \end_layout
3428
3429 \begin_layout Standard
3430 To search all sequences for a simple pattern consisting of the sequence
3431  ATCGATCG allowing for 3 mismatches, 2 insertions and 1 deletion:
3432 \end_layout
3433
3434 \begin_layout LyX-Code
3435 read_fasta --data_in=<file> | patscan_seq --pattern='ATCGATCG[3,2,1]'
3436 \end_layout
3437
3438 \begin_layout Standard
3439 The
3440 \begin_inset ERT
3441 status collapsed
3442
3443 \begin_layout Standard
3444
3445 -
3446 \backslash
3447 /-
3448 \end_layout
3449
3450 \end_inset
3451
3452 pattern switch takes a comma seperated list of patterns, so if you want
3453  to search for more that one pattern do:
3454 \end_layout
3455
3456 \begin_layout LyX-Code
3457 ...
3458  | patscan_seq --pattern='ATCGATCG[3,2,1],GCTAGCTA[3,2,1]'
3459 \end_layout
3460
3461 \begin_layout Standard
3462 It is also possible to have a list of different patterns to search for in
3463  a file with one pattern per line.
3464  In order to get
3465 \series bold
3466 patscan_seq
3467 \series default
3468  to read these patterns use the
3469 \begin_inset ERT
3470 status collapsed
3471
3472 \begin_layout Standard
3473
3474 -
3475 \backslash
3476 /-
3477 \end_layout
3478
3479 \end_inset
3480
3481 pattern_in switch:
3482 \end_layout
3483
3484 \begin_layout LyX-Code
3485 ...
3486  | patscan_seq --pattern_in=<file>
3487 \end_layout
3488
3489 \begin_layout Standard
3490 To also scan the complementary strand in nucleotide sequences (
3491 \series bold
3492 patscan_seq
3493 \series default
3494  automagically determines the sequence type) you need to add the
3495 \begin_inset ERT
3496 status collapsed
3497
3498 \begin_layout Standard
3499
3500 -
3501 \backslash
3502 /-
3503 \end_layout
3504
3505 \end_inset
3506
3507 comp switch:
3508 \end_layout
3509
3510 \begin_layout LyX-Code
3511 ...
3512  | patscan_seq --pattern=<pattern> --comp
3513 \end_layout
3514
3515 \begin_layout Standard
3516 It is also possible to use
3517 \series bold
3518 patscan_seq
3519 \series default
3520  to output those records that does not contain a certain pattern by using
3521  the
3522 \begin_inset ERT
3523 status collapsed
3524
3525 \begin_layout Standard
3526
3527 -
3528 \backslash
3529 /-
3530 \end_layout
3531
3532 \end_inset
3533
3534 invert switch:
3535 \end_layout
3536
3537 \begin_layout LyX-Code
3538 ...
3539  | patscan_seq --pattern=<pattern> --invert
3540 \end_layout
3541
3542 \begin_layout Standard
3543 Finally,
3544 \series bold
3545 patscan_seq
3546 \series default
3547  can also scan for patterns in a given genome sequence, instead of sequences
3548  in the stream, using the
3549 \begin_inset ERT
3550 status open
3551
3552 \begin_layout Standard
3553
3554 -
3555 \backslash
3556 /-
3557 \end_layout
3558
3559 \end_inset
3560
3561 genome switch:
3562 \end_layout
3563
3564 \begin_layout LyX-Code
3565 patscan --pattern=<pattern> --genome=<genome>
3566 \end_layout
3567
3568 \begin_layout Subsection
3569 How to use BLAT for sequence search?
3570 \begin_inset LatexCommand label
3571 name "sub:How-to-use-BLAT"
3572
3573 \end_inset
3574
3575
3576 \end_layout
3577
3578 \begin_layout Standard
3579 Sequences in the data stream can be matched against supported genomes using
3580
3581 \series bold
3582 blat_seq
3583 \series default
3584  which is a biotool using BLAT as the name might suggest.
3585  Currently only Mouse and Human genomes are available and it is not possible
3586  to use OOC files since there is still a need for a local repository for
3587  genome files.
3588  Otherwise it is just:
3589 \end_layout
3590
3591 \begin_layout LyX-Code
3592 read_fasta --data_in=<file> | blat_seq --genome=<genome>
3593 \end_layout
3594
3595 \begin_layout Standard
3596 The search results can then be written to file with
3597 \series bold
3598 write_psl
3599 \series default
3600  (see\InsetSpace ~
3601
3602 \begin_inset LatexCommand ref
3603 reference "sub:How-to-write-PSL"
3604
3605 \end_inset
3606
3607 ) or
3608 \series bold
3609 write_bed
3610 \series default
3611  (see\InsetSpace ~
3612
3613 \begin_inset LatexCommand ref
3614 reference "sub:How-to-write-BED"
3615
3616 \end_inset
3617
3618 ) allthough with
3619 \series bold
3620 write_bed
3621 \series default
3622  some information will be lost).
3623  It is also possible to plot chromosome distribution of the search results
3624  using
3625 \series bold
3626 plot_chrdist
3627 \series default
3628  (see\InsetSpace ~
3629
3630 \begin_inset LatexCommand ref
3631 reference "sub:How-to-plot-chrdist"
3632
3633 \end_inset
3634
3635 ) or the distribution of the match lengths using
3636 \series bold
3637 plot_lendist
3638 \series default
3639  (see\InsetSpace ~
3640
3641 \begin_inset LatexCommand ref
3642 reference "sub:How-to-plot-lendist"
3643
3644 \end_inset
3645
3646 ) or a karyogram with the hits using
3647 \series bold
3648 plot_karyogram
3649 \series default
3650  (see\InsetSpace ~
3651
3652 \begin_inset LatexCommand ref
3653 reference "sub:How-to-plot-karyogram"
3654
3655 \end_inset
3656
3657 ).
3658 \end_layout
3659
3660 \begin_layout Subsection
3661 How to use BLAST for sequence search?
3662 \begin_inset LatexCommand label
3663 name "sub:How-to-use-BLAST"
3664
3665 \end_inset
3666
3667
3668 \end_layout
3669
3670 \begin_layout Standard
3671 Two biotools exist for blasting sequences:
3672 \series bold
3673 create_blast_db
3674 \series default
3675  is used to create the BLAST database required for BLAST which is queried
3676  using the biotool
3677 \series bold
3678 blast_seq
3679 \series default
3680 .
3681  So in order to create a BLAST database from sequences in the data stream
3682  you simple run:
3683 \end_layout
3684
3685 \begin_layout LyX-Code
3686 ...
3687  | create_blast_db --database=my_database --no_stream
3688 \end_layout
3689
3690 \begin_layout Standard
3691 The type of sequence to use for the database is automagically determined
3692  by
3693 \series bold
3694 create_blast_db
3695 \series default
3696 , but don't have a mixture of peptide and nucleic acids sequences in the
3697  stream.
3698  The
3699 \begin_inset ERT
3700 status collapsed
3701
3702 \begin_layout Standard
3703
3704 -
3705 \backslash
3706 /-
3707 \end_layout
3708
3709 \end_inset
3710
3711 database switch takes a path as argument, but will default to 'blastdb_<time_sta
3712 mp> if not set.
3713 \end_layout
3714
3715 \begin_layout Standard
3716 The resulting database can now be queried with sequences in another data
3717  stream using
3718 \series bold
3719 blast_seq
3720 \series default
3721 :
3722 \end_layout
3723
3724 \begin_layout LyX-Code
3725 ...
3726  | blast_seq --database=my_database
3727 \end_layout
3728
3729 \begin_layout Standard
3730 Again, the sequence type is determined automagically and the appropriate
3731  BLAST program is guessed (see below table), however, the program name can
3732  be overruled with the
3733 \begin_inset ERT
3734 status collapsed
3735
3736 \begin_layout Standard
3737
3738 -
3739 \backslash
3740 /-
3741 \end_layout
3742
3743 \end_inset
3744
3745 program switch.
3746 \end_layout
3747
3748 \begin_layout Standard
3749 \noindent
3750 \align center
3751 \begin_inset Tabular
3752 <lyxtabular version="3" rows="5" columns="3">
3753 <features>
3754 <column alignment="center" valignment="top" width="0">
3755 <column alignment="center" valignment="top" width="0">
3756 <column alignment="center" valignment="top" width="0">
3757 <row bottomline="true">
3758 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3759 \begin_inset Text
3760
3761 \begin_layout Standard
3762 Subject sequence
3763 \end_layout
3764
3765 \end_inset
3766 </cell>
3767 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3768 \begin_inset Text
3769
3770 \begin_layout Standard
3771 Query sequence
3772 \end_layout
3773
3774 \end_inset
3775 </cell>
3776 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3777 \begin_inset Text
3778
3779 \begin_layout Standard
3780 Program guess
3781 \end_layout
3782
3783 \end_inset
3784 </cell>
3785 </row>
3786 <row>
3787 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3788 \begin_inset Text
3789
3790 \begin_layout Standard
3791 Nucleotide
3792 \end_layout
3793
3794 \end_inset
3795 </cell>
3796 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3797 \begin_inset Text
3798
3799 \begin_layout Standard
3800 Nucleotide
3801 \end_layout
3802
3803 \end_inset
3804 </cell>
3805 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3806 \begin_inset Text
3807
3808 \begin_layout Standard
3809 blastn
3810 \end_layout
3811
3812 \end_inset
3813 </cell>
3814 </row>
3815 <row>
3816 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3817 \begin_inset Text
3818
3819 \begin_layout Standard
3820 Protein
3821 \end_layout
3822
3823 \end_inset
3824 </cell>
3825 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3826 \begin_inset Text
3827
3828 \begin_layout Standard
3829 Protein
3830 \end_layout
3831
3832 \end_inset
3833 </cell>
3834 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3835 \begin_inset Text
3836
3837 \begin_layout Standard
3838 blastp
3839 \end_layout
3840
3841 \end_inset
3842 </cell>
3843 </row>
3844 <row>
3845 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3846 \begin_inset Text
3847
3848 \begin_layout Standard
3849 Protein
3850 \end_layout
3851
3852 \end_inset
3853 </cell>
3854 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3855 \begin_inset Text
3856
3857 \begin_layout Standard
3858 Nucleotide
3859 \end_layout
3860
3861 \end_inset
3862 </cell>
3863 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3864 \begin_inset Text
3865
3866 \begin_layout Standard
3867 blastx
3868 \end_layout
3869
3870 \end_inset
3871 </cell>
3872 </row>
3873 <row>
3874 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3875 \begin_inset Text
3876
3877 \begin_layout Standard
3878 Nucleotide
3879 \end_layout
3880
3881 \end_inset
3882 </cell>
3883 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3884 \begin_inset Text
3885
3886 \begin_layout Standard
3887 Protein
3888 \end_layout
3889
3890 \end_inset
3891 </cell>
3892 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3893 \begin_inset Text
3894
3895 \begin_layout Standard
3896 tblastn
3897 \end_layout
3898
3899 \end_inset
3900 </cell>
3901 </row>
3902 </lyxtabular>
3903
3904 \end_inset
3905
3906
3907 \end_layout
3908
3909 \begin_layout Standard
3910 Finally, it is also possible to use
3911 \series bold
3912 blast_seq
3913 \series default
3914  for blasting sequences agains a preformatted genome using the
3915 \begin_inset ERT
3916 status collapsed
3917
3918 \begin_layout Standard
3919
3920 -
3921 \backslash
3922 /-
3923 \end_layout
3924
3925 \end_inset
3926
3927 genome switch instead of the
3928 \begin_inset ERT
3929 status collapsed
3930
3931 \begin_layout Standard
3932
3933 -
3934 \backslash
3935 /-
3936 \end_layout
3937
3938 \end_inset
3939
3940 database switch:
3941 \end_layout
3942
3943 \begin_layout LyX-Code
3944 ...
3945  | blast_seq --genome=<genome>
3946 \end_layout
3947
3948 \begin_layout Subsection
3949 How to use Vmatch for sequence search?
3950 \begin_inset LatexCommand label
3951 name "sub:How-to-use-Vmatch"
3952
3953 \end_inset
3954
3955
3956 \end_layout
3957
3958 \begin_layout Standard
3959 The powerful suffix array software package Vmatch
3960 \begin_inset Foot
3961 status collapsed
3962
3963 \begin_layout Standard
3964 \begin_inset LatexCommand url
3965 target "http://www.vmatch.de/"
3966
3967 \end_inset
3968
3969
3970 \end_layout
3971
3972 \end_inset
3973
3974  can be used for exact mapping of sequences against indexed genomes using
3975  the biotool
3976 \series bold
3977 vmatch_seq
3978 \series default
3979 , which will e.g.
3980  map 700000 ESTs to the human genome locating all 160 mio hits in less than
3981  an hour.
3982  Only nucleotide sequences and sequences longer than 11 nucleotides will
3983  be mapped.
3984  It is recommended that sequences consisting of mostly one nucleotide type
3985  are removed.
3986  This can be done with the
3987 \series bold
3988 analyze_seq
3989 \series default
3990  biotool
3991 \begin_inset LatexCommand eqref
3992 reference "sub:How-to-analyze"
3993
3994 \end_inset
3995
3996 .
3997 \end_layout
3998
3999 \begin_layout LyX-Code
4000 ...
4001  | vmatch_seq --genome=<genome>
4002 \end_layout
4003
4004 \begin_layout Standard
4005 It is also possible to allow for mismatches using the
4006 \begin_inset ERT
4007 status collapsed
4008
4009 \begin_layout Standard
4010
4011 -
4012 \backslash
4013 /-
4014 \end_layout
4015
4016 \end_inset
4017
4018 hamming_dist switch.
4019  So to allow for 2 mismatches:
4020 \end_layout
4021
4022 \begin_layout LyX-Code
4023 ...
4024  | vmatch_seq --genome=<genome> --hamming_dist=2
4025 \end_layout
4026
4027 \begin_layout Standard
4028 Or to allow for 10% mismathing nucleotides:
4029 \end_layout
4030
4031 \begin_layout LyX-Code
4032 ...
4033  | vmatch_seq --genome=<genome> --hamming_dist=10p
4034 \end_layout
4035
4036 \begin_layout Standard
4037 To allow both indels and mismatches use the
4038 \begin_inset ERT
4039 status collapsed
4040
4041 \begin_layout Standard
4042
4043 -
4044 \backslash
4045 /-
4046 \end_layout
4047
4048 \end_inset
4049
4050 edit_dist switch.
4051  So to allow for one mismatch or one indel:
4052 \end_layout
4053
4054 \begin_layout LyX-Code
4055 ...
4056  | vmatch_seq --genome=<genome> --hamming_dist=1
4057 \end_layout
4058
4059 \begin_layout Standard
4060 Or to allow for 5% indels or mismatches:
4061 \end_layout
4062
4063 \begin_layout LyX-Code
4064 ...
4065  | vmatch_seq --genome=<genome> --hamming_dist=5p
4066 \end_layout
4067
4068 \begin_layout Standard
4069 Note that using
4070 \begin_inset ERT
4071 status collapsed
4072
4073 \begin_layout Standard
4074
4075 -
4076 \backslash
4077 /-
4078 \end_layout
4079
4080 \end_inset
4081
4082 hamming_dist or
4083 \begin_inset ERT
4084 status collapsed
4085
4086 \begin_layout Standard
4087
4088 -
4089 \backslash
4090 /-
4091 \end_layout
4092
4093 \end_inset
4094
4095 edit_dist greatly slows down vmatch considerably --- use with care.
4096 \end_layout
4097
4098 \begin_layout Standard
4099 The resulting SCORE key can be replaced to hold the number of genome matches
4100  of a given sequence (multi-mappers) is the
4101 \begin_inset ERT
4102 status collapsed
4103
4104 \begin_layout Standard
4105
4106 -
4107 \backslash
4108 /-
4109 \end_layout
4110
4111 \end_inset
4112
4113 count switch is given.
4114 \end_layout
4115
4116 \begin_layout Subsection
4117 How to find all matches between sequences?
4118 \begin_inset LatexCommand label
4119 name "sub:How-to-find-matches"
4120
4121 \end_inset
4122
4123
4124 \end_layout
4125
4126 \begin_layout Standard
4127 All matches between two sequences can be determined with the biotool
4128 \series bold
4129 match_seq
4130 \series default
4131 .
4132  The match finding engine underneath the hood of
4133 \series bold
4134 match_seq
4135 \series default
4136  is the super fast suffix tree program MUMmer
4137 \begin_inset Foot
4138 status collapsed
4139
4140 \begin_layout Standard
4141 \begin_inset LatexCommand url
4142 target "http://mummer.sourceforge.net/"
4143
4144 \end_inset
4145
4146
4147 \end_layout
4148
4149 \end_inset
4150
4151 , which will locate all forward and reverse matches between huge sequences
4152  in a matter of minutes (if the repeat count is not too high and if the
4153  word size used is appropriate).
4154  Matching two
4155 \emph on
4156 Helicobacter pylori
4157 \emph default
4158  genomes (1.7Mbp) takes around 10 seconds:
4159 \end_layout
4160
4161 \begin_layout LyX-Code
4162 ...
4163  | match_seq --word_size=20 --direction=both
4164 \end_layout
4165
4166 \begin_layout Standard
4167 The output from
4168 \series bold
4169 match_seq
4170 \series default
4171  can be used to generate a dot plot with
4172 \series bold
4173 plot_matches
4174 \series default
4175  (see\InsetSpace ~
4176
4177 \begin_inset LatexCommand ref
4178 reference "sub:How-to-generate-dotplot"
4179
4180 \end_inset
4181
4182 ).
4183 \end_layout
4184
4185 \begin_layout Subsection
4186 How to align sequences?
4187 \begin_inset LatexCommand label
4188 name "sub:How-to-align"
4189
4190 \end_inset
4191
4192
4193 \end_layout
4194
4195 \begin_layout Standard
4196 Sequences in the stream can be aligned with the
4197 \series bold
4198 align_seq
4199 \series default
4200  biotool that uses Muscle
4201 \begin_inset Foot
4202 status open
4203
4204 \begin_layout Standard
4205 \begin_inset LatexCommand url
4206 target "http://www.drive5.com/muscle/muscle.html"
4207
4208 \end_inset
4209
4210
4211 \end_layout
4212
4213 \end_inset
4214
4215  as aligment engine.
4216  Currently you cannot change any of the Muscle alignment parameters and
4217
4218 \series bold
4219 align_seq
4220 \series default
4221  will create an alignment based on the defaults (which are really good!):
4222 \end_layout
4223
4224 \begin_layout LyX-Code
4225 ...
4226  | align_seq
4227 \end_layout
4228
4229 \begin_layout Standard
4230 The aligned output can be written to file in FASTA format using
4231 \series bold
4232 write_fasta
4233 \series default
4234  (see\InsetSpace ~
4235
4236 \begin_inset LatexCommand ref
4237 reference "sub:How-to-write-fasta"
4238
4239 \end_inset
4240
4241 ) or in pretty text using
4242 \series bold
4243 write_align
4244 \series default
4245  (see\InsetSpace ~
4246
4247 \begin_inset LatexCommand ref
4248 reference "sub:How-to-write-alignment"
4249
4250 \end_inset
4251
4252 ).
4253 \end_layout
4254
4255 \begin_layout Subsection
4256 How to create a weight matrix?
4257 \end_layout
4258
4259 \begin_layout Standard
4260 If you want a weight matrix to show the sequence composition of a stack
4261  of sequences you can use the biotool create_weight_matrix:
4262 \end_layout
4263
4264 \begin_layout LyX-Code
4265 ...
4266  | create_weight_matrix
4267 \end_layout
4268
4269 \begin_layout Standard
4270 The result can be output in percent using the
4271 \begin_inset ERT
4272 status open
4273
4274 \begin_layout Standard
4275
4276 -
4277 \backslash
4278 /-
4279 \end_layout
4280
4281 \end_inset
4282
4283 percent switch:
4284 \end_layout
4285
4286 \begin_layout LyX-Code
4287 ...
4288  | create_weight_matrix --percent
4289 \end_layout
4290
4291 \begin_layout Standard
4292 The weight matrix can be written as tabular output with
4293 \series bold
4294 write_tab
4295 \series default
4296  (see\InsetSpace ~
4297
4298 \begin_inset LatexCommand ref
4299 reference "sub:How-to-write-tab"
4300
4301 \end_inset
4302
4303 ) after removeing the records containing SEQ with
4304 \series bold
4305 grab
4306 \series default
4307  (see\InsetSpace ~
4308
4309 \begin_inset LatexCommand ref
4310 reference "sub:How-to-grab"
4311
4312 \end_inset
4313
4314 ):
4315 \end_layout
4316
4317 \begin_layout LyX-Code
4318 ...
4319  | create_weight_matrix | grab --invert --keys=SEQ --keys_only
4320 \end_layout
4321
4322 \begin_layout LyX-Code
4323 | write_tab --no_stream
4324 \end_layout
4325
4326 \begin_layout Standard
4327 The V0 column will hold the residue, while the rest of the columns will
4328  hold the frequencies for each sequence position.
4329 \end_layout
4330
4331 \begin_layout Section
4332 Plotting
4333 \end_layout
4334
4335 \begin_layout Standard
4336 There exists several biotools for plotting.
4337  Some of these are based on GNUplot
4338 \begin_inset Foot
4339 status open
4340
4341 \begin_layout Standard
4342 \begin_inset LatexCommand url
4343 target "http://www.gnuplot.info/"
4344
4345 \end_inset
4346
4347
4348 \end_layout
4349
4350 \end_inset
4351
4352 , which is an extremely powerful platform to generate all sorts of plots
4353  and even though GNUplot has quite a steep learning curve, the biotools
4354  utilizing GNUplot are simple to use.
4355  GNUplot is able to output a lot of different formats (called terminals
4356  in GNUplot), but the biotools focusses on three formats only:
4357 \end_layout
4358
4359 \begin_layout Enumerate
4360 The 'dumb' terminal is default to the GNUplot based biotools and will output
4361  a plot in crude ASCII text (Fig.\InsetSpace ~
4362
4363 \begin_inset LatexCommand ref
4364 reference "fig:Dumb-terminal"
4365
4366 \end_inset
4367
4368 ).
4369  This is quite nice for a quick and dirty plot to get an overview of your
4370  data .
4371 \end_layout
4372
4373 \begin_layout Enumerate
4374 The 'post' or 'postscript' terminal output postscript code which is publication
4375  grade graphics that can be viewed with applications such as Ghostview,
4376  Photoshop, and Preview.
4377 \end_layout
4378
4379 \begin_layout Enumerate
4380 The 'svg' terminal output's scalable vector graphics (SVG) which is a vector
4381  based format.
4382  SVG is great because you can edit the resulting plot using Photoshop or
4383  Inkscape
4384 \begin_inset Foot
4385 status collapsed
4386
4387 \begin_layout Standard
4388 Inkscape is a really handy drawing program that is free and open source.
4389  Availble at
4390 \begin_inset LatexCommand htmlurl
4391 target "http://www.inkscape.org"
4392
4393 \end_inset
4394
4395
4396 \end_layout
4397
4398 \end_inset
4399
4400  if you want to add additional labels, captions, arrows, and so on and then
4401  save the result in different formats, such as postscript without loosing
4402  resolution.
4403 \end_layout
4404
4405 \begin_layout Standard
4406 The biotools for plotting that are not based on GNUplot only output SVG
4407  (that may change in the future).
4408 \end_layout
4409
4410 \begin_layout Standard
4411 \begin_inset Float figure
4412 wide false
4413 sideways false
4414 status open
4415
4416 \begin_layout Standard
4417 \noindent
4418 \align center
4419 \begin_inset Graphics
4420         filename lendist_ascii.png
4421         lyxscale 70
4422         width 12cm
4423
4424 \end_inset
4425
4426
4427 \end_layout
4428
4429 \begin_layout Standard
4430 \begin_inset Caption
4431
4432 \begin_layout Standard
4433 \begin_inset LatexCommand label
4434 name "fig:Dumb-terminal"
4435
4436 \end_inset
4437
4438 Dumb terminal
4439 \end_layout
4440
4441 \end_inset
4442
4443
4444 \end_layout
4445
4446 \begin_layout Quote
4447 The output of a length distribution plot in the default 'dumb terminal'
4448  to the terminal window.
4449
4450 \end_layout
4451
4452 \end_inset
4453
4454
4455 \end_layout
4456
4457 \begin_layout Subsection
4458 How to plot a histogram?
4459 \begin_inset LatexCommand label
4460 name "How-to-plot-histogram"
4461
4462 \end_inset
4463
4464
4465 \end_layout
4466
4467 \begin_layout Standard
4468 A generic histogram for a given value can be plotted with the biotool
4469 \series bold
4470 plot_histogram
4471 \series default
4472  (Fig.\InsetSpace ~
4473
4474 \begin_inset LatexCommand ref
4475 reference "fig:Histogram"
4476
4477 \end_inset
4478
4479 ):
4480 \end_layout
4481
4482 \begin_layout LyX-Code
4483 ...
4484  | plot_histogram --key=TISSUE --no_stream
4485 \end_layout
4486
4487 \begin_layout Standard
4488 (Figure missing)
4489 \end_layout
4490
4491 \begin_layout Standard
4492 \noindent
4493 \align left
4494 \begin_inset Float figure
4495 wide false
4496 sideways false
4497 status open
4498
4499 \begin_layout Standard
4500 \noindent
4501 \align center
4502 \begin_inset Graphics
4503         filename histogram.png
4504         lyxscale 70
4505         width 12cm
4506
4507 \end_inset
4508
4509
4510 \end_layout
4511
4512 \begin_layout Standard
4513 \begin_inset Caption
4514
4515 \begin_layout Standard
4516 \begin_inset LatexCommand label
4517 name "fig:Histogram"
4518
4519 \end_inset
4520
4521 Histogram
4522 \end_layout
4523
4524 \end_inset
4525
4526
4527 \end_layout
4528
4529 \end_inset
4530
4531
4532 \end_layout
4533
4534 \begin_layout Subsection
4535 How to plot a length distribution?
4536 \begin_inset LatexCommand label
4537 name "sub:How-to-plot-lendist"
4538
4539 \end_inset
4540
4541
4542 \end_layout
4543
4544 \begin_layout Standard
4545 Plotting of length distributions, weather sequence lengths, patterns lengths,
4546  hit lengths,
4547 \emph on
4548 etc.
4549
4550 \emph default
4551  is a really handy thing and can be done with the the biotool
4552 \series bold
4553 plot_lendist
4554 \series default
4555 .
4556  If you have a file with FASTA entries and want to plot the length distribution
4557  you do it like this:
4558 \end_layout
4559
4560 \begin_layout LyX-Code
4561 read_fasta --data_in=<file> | length_seq
4562 \end_layout
4563
4564 \begin_layout LyX-Code
4565 | plot_lendist --key=SEQ_LEN --no_stream
4566 \end_layout
4567
4568 \begin_layout Standard
4569 The result will be written to the default dumb terminal and will look like
4570  Fig.\InsetSpace ~
4571
4572 \begin_inset LatexCommand ref
4573 reference "fig:Dumb-terminal"
4574
4575 \end_inset
4576
4577 .
4578 \end_layout
4579
4580 \begin_layout Standard
4581 If you instead want the result in postscript format you can do:
4582 \end_layout
4583
4584 \begin_layout LyX-Code
4585 ...
4586  | plot_lendist --key=SEQ_LEN --terminal=post --result_out=file.ps
4587 \end_layout
4588
4589 \begin_layout Standard
4590 That will generate the plot and save it to file, but not interrupt the data
4591  stream which can then be used in further analysis.
4592  You can also save the plot implicetly using '>', however, it is then important
4593  to terminate the stream with the
4594 \begin_inset ERT
4595 status collapsed
4596
4597 \begin_layout Standard
4598
4599 -
4600 \backslash
4601 /-
4602 \end_layout
4603
4604 \end_inset
4605
4606 no_stream switch:
4607 \end_layout
4608
4609 \begin_layout LyX-Code
4610 ...
4611  | plot_lendist --key=SEQ_LEN --terminal=post --no_stream > file.ps
4612 \end_layout
4613
4614 \begin_layout Standard
4615 The resulting plot can be seen in Fig.\InsetSpace ~
4616
4617 \begin_inset LatexCommand ref
4618 reference "fig:Length-distribution"
4619
4620 \end_inset
4621
4622 .
4623 \end_layout
4624
4625 \begin_layout Standard
4626 \begin_inset Float figure
4627 wide false
4628 sideways false
4629 status open
4630
4631 \begin_layout Standard
4632
4633 \end_layout
4634
4635 \begin_layout Standard
4636 \noindent
4637 \align center
4638 \begin_inset Graphics
4639         filename lendist.ps
4640         lyxscale 50
4641         width 12cm
4642
4643 \end_inset
4644
4645
4646 \end_layout
4647
4648 \begin_layout Standard
4649 \begin_inset Caption
4650
4651 \begin_layout Standard
4652 \begin_inset LatexCommand label
4653 name "fig:Length-distribution"
4654
4655 \end_inset
4656
4657 Length distribution
4658 \end_layout
4659
4660 \end_inset
4661
4662
4663 \end_layout
4664
4665 \begin_layout Quote
4666 Length distribution of 630 piRNA like RNAs.
4667 \end_layout
4668
4669 \end_inset
4670
4671
4672 \end_layout
4673
4674 \begin_layout Subsection
4675 How to plot a chromosome distribution?
4676 \begin_inset LatexCommand label
4677 name "sub:How-to-plot-chrdist"
4678
4679 \end_inset
4680
4681
4682 \end_layout
4683
4684 \begin_layout Standard
4685 If you have the result of a sequence search against a multi chromosome genome,
4686  it is very practical to be able to plot the distribution of search hits
4687  on the different chromosomes.
4688  This can be done with
4689 \series bold
4690 plot_chrdist
4691 \series default
4692 :
4693 \end_layout
4694
4695 \begin_layout LyX-Code
4696 read_fasta --data_in=<file> | blat_genome | plot_chrdist --no_stream
4697 \end_layout
4698
4699 \begin_layout Standard
4700 The above example will result in a crude plot using the 'dumb' terminal,
4701  and if you want to mess around with the results from the BLAT search you
4702  probably want to save the result to file first (see\InsetSpace ~
4703
4704 \begin_inset LatexCommand ref
4705 reference "sub:How-to-write-PSL"
4706
4707 \end_inset
4708
4709 ).
4710  To plot the chromosome distribution from the saved search result you can
4711  do:
4712 \end_layout
4713
4714 \begin_layout LyX-Code
4715 read_bed --data_in=file.bed | plot_chrdist --terminal=post --result_out=plot.ps
4716 \end_layout
4717
4718 \begin_layout Standard
4719 That will result in the output show in Fig.\InsetSpace ~
4720
4721 \begin_inset LatexCommand ref
4722 reference "fig:Chromosome-distribution"
4723
4724 \end_inset
4725
4726 .
4727 \end_layout
4728
4729 \begin_layout Standard
4730 \begin_inset Float figure
4731 wide false
4732 sideways false
4733 status open
4734
4735 \begin_layout Standard
4736
4737 \end_layout
4738
4739 \begin_layout Standard
4740 \noindent
4741 \align center
4742 \begin_inset Graphics
4743         filename chrdist.ps
4744         lyxscale 50
4745         width 12cm
4746         rotateAngle 90
4747
4748 \end_inset
4749
4750
4751 \end_layout
4752
4753 \begin_layout Standard
4754 \begin_inset Caption
4755
4756 \begin_layout Standard
4757 \begin_inset LatexCommand label
4758 name "fig:Chromosome-distribution"
4759
4760 \end_inset
4761
4762 Chromosome distribution
4763 \end_layout
4764
4765 \end_inset
4766
4767
4768 \end_layout
4769
4770 \end_inset
4771
4772
4773 \end_layout
4774
4775 \begin_layout Subsection
4776 How to generate a dotplot?
4777 \begin_inset LatexCommand label
4778 name "sub:How-to-generate-dotplot"
4779
4780 \end_inset
4781
4782
4783 \end_layout
4784
4785 \begin_layout Standard
4786 A dotplot is a powerful way to get an overview of the size and location
4787  of sequence insertions, deletions, and duplications between two sequences.
4788  Generating a dotplot with biotools is a two step process where you initially
4789  find all matches between two sequences using the tool
4790 \series bold
4791 match_seq
4792 \series default
4793  (see\InsetSpace ~
4794
4795 \begin_inset LatexCommand ref
4796 reference "sub:How-to-find-matches"
4797
4798 \end_inset
4799
4800 ) and plot the resulting matches with
4801 \series bold
4802 plot_matches
4803 \series default
4804 .
4805  Matching and plotting two
4806 \emph on
4807 Helicobacter pylori
4808 \emph default
4809  genomes (1.7Mbp) takes around 10 seconds:
4810 \end_layout
4811
4812 \begin_layout LyX-Code
4813 ...
4814  | match_seq | plot_matches --terminal=post --result_out=plot.ps
4815 \end_layout
4816
4817 \begin_layout Standard
4818 The resulting dotplot is in Fig.\InsetSpace ~
4819
4820 \begin_inset LatexCommand ref
4821 reference "fig:Dotplot"
4822
4823 \end_inset
4824
4825 .
4826 \end_layout
4827
4828 \begin_layout Standard
4829 \begin_inset Float figure
4830 wide false
4831 sideways false
4832 status open
4833
4834 \begin_layout Standard
4835 \noindent
4836 \align center
4837 \begin_inset Graphics
4838         filename dotplot.ps
4839         lyxscale 50
4840         width 12cm
4841
4842 \end_inset
4843
4844
4845 \end_layout
4846
4847 \begin_layout Standard
4848 \begin_inset Caption
4849
4850 \begin_layout Standard
4851 \begin_inset LatexCommand label
4852 name "fig:Dotplot"
4853
4854 \end_inset
4855
4856 Dotplot
4857 \end_layout
4858
4859 \end_inset
4860
4861
4862 \end_layout
4863
4864 \begin_layout Quote
4865 Forward matches are displayed in green while reverse matches are displayed
4866  in red.
4867 \end_layout
4868
4869 \end_inset
4870
4871
4872 \end_layout
4873
4874 \begin_layout Subsection
4875 How to plot a sequence logo?
4876 \end_layout
4877
4878 \begin_layout Standard
4879 Sequence logos can be generate with
4880 \series bold
4881 plot_seqlogo
4882 \series default
4883 .
4884  The sequnce type is determined automagically and an entropy scale of 2
4885  bits and 4 bits is used for nucleotide and peptide sequences, respectively
4886 \begin_inset Foot
4887 status collapsed
4888
4889 \begin_layout Standard
4890 \begin_inset LatexCommand htmlurl
4891 target "http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html"
4892
4893 \end_inset
4894
4895
4896 \end_layout
4897
4898 \end_inset
4899
4900 .
4901 \end_layout
4902
4903 \begin_layout LyX-Code
4904 ...
4905  | plot_seqlogo --no_stream --result_out=seqlogo.svg
4906 \end_layout
4907
4908 \begin_layout Standard
4909 An example of a sequence logo can be seen in Fig.\InsetSpace ~
4910
4911 \begin_inset LatexCommand ref
4912 reference "fig:Sequence-logo"
4913
4914 \end_inset
4915
4916 .
4917 \end_layout
4918
4919 \begin_layout Standard
4920 \begin_inset Float figure
4921 wide false
4922 sideways false
4923 status open
4924
4925 \begin_layout Standard
4926 \noindent
4927 \align center
4928 \begin_inset Graphics
4929         filename seqlogo.png
4930         lyxscale 50
4931         width 12cm
4932
4933 \end_inset
4934
4935
4936 \end_layout
4937
4938 \begin_layout Standard
4939 \begin_inset Caption
4940
4941 \begin_layout Standard
4942 \begin_inset LatexCommand label
4943 name "fig:Sequence-logo"
4944
4945 \end_inset
4946
4947 Sequence logo
4948 \end_layout
4949
4950 \end_inset
4951
4952
4953 \end_layout
4954
4955 \end_inset
4956
4957
4958 \end_layout
4959
4960 \begin_layout Subsection
4961 How to plot a karyogram?
4962 \begin_inset LatexCommand label
4963 name "sub:How-to-plot-karyogram"
4964
4965 \end_inset
4966
4967
4968 \end_layout
4969
4970 \begin_layout Standard
4971 To plot search hits on genomes use
4972 \series bold
4973 plot_karyogram
4974 \series default
4975 , which will output a nice karyogram in SVG graphics:
4976 \end_layout
4977
4978 \begin_layout LyX-Code
4979 ...
4980  | plot_karyogram --result_out=karyogram.svg
4981 \end_layout
4982
4983 \begin_layout Standard
4984 The banding data is taken from the UCSC genome browser database and currently
4985  only Human and Mouse is supported.
4986  Fig.\InsetSpace ~
4987
4988 \begin_inset LatexCommand ref
4989 reference "fig:Karyogram"
4990
4991 \end_inset
4992
4993  shows the distribution of piRNA like RNAs matched to the Human genome.
4994 \end_layout
4995
4996 \begin_layout Standard
4997 \begin_inset Float figure
4998 wide false
4999 sideways false
5000 status open
5001
5002 \begin_layout Standard
5003 \noindent
5004 \align center
5005 \begin_inset Graphics
5006         filename karyogram.png
5007         lyxscale 35
5008         width 12cm
5009
5010 \end_inset
5011
5012
5013 \end_layout
5014
5015 \begin_layout Standard
5016 \begin_inset Caption
5017
5018 \begin_layout Standard
5019 \begin_inset LatexCommand label
5020 name "fig:Karyogram"
5021
5022 \end_inset
5023
5024 Karyogram
5025 \end_layout
5026
5027 \end_inset
5028
5029
5030 \end_layout
5031
5032 \begin_layout Quote
5033 Hits from a search of piRNA like RNAs in the Human genome is displayed as
5034  short horizontal bars.
5035 \end_layout
5036
5037 \end_inset
5038
5039
5040 \end_layout
5041
5042 \begin_layout Section
5043 Uploading Results
5044 \end_layout
5045
5046 \begin_layout Subsection
5047 How do I display my results in the UCSC Genome Browser?
5048 \end_layout
5049
5050 \begin_layout Standard
5051 Results from the list of biotools below can be uploaded directly to a local
5052  mirror of the UCSC Genome Browser using the biotool
5053 \series bold
5054 upload_to_ucsc
5055 \series default
5056 :
5057 \end_layout
5058
5059 \begin_layout Itemize
5060 patscan_seq
5061 \begin_inset LatexCommand eqref
5062 reference "sub:How-to-use-patscan"
5063
5064 \end_inset
5065
5066
5067 \end_layout
5068
5069 \begin_layout Itemize
5070 blat_seq
5071 \begin_inset LatexCommand eqref
5072 reference "sub:How-to-use-BLAT"
5073
5074 \end_inset
5075
5076
5077 \end_layout
5078
5079 \begin_layout Itemize
5080 blast_seq
5081 \begin_inset LatexCommand eqref
5082 reference "sub:How-to-use-BLAST"
5083
5084 \end_inset
5085
5086
5087 \end_layout
5088
5089 \begin_layout Itemize
5090 vmatch_seq
5091 \begin_inset LatexCommand eqref
5092 reference "sub:How-to-use-Vmatch"
5093
5094 \end_inset
5095
5096
5097 \end_layout
5098
5099 \begin_layout Standard
5100 The syntax for uploading data the most simple way requires two mandatory
5101  switches:
5102 \begin_inset ERT
5103 status collapsed
5104
5105 \begin_layout Standard
5106
5107 -
5108 \backslash
5109 /-
5110 \end_layout
5111
5112 \end_inset
5113
5114 database, which is the UCSC database name (such as hg18, mm9, etc.) and
5115 \begin_inset ERT
5116 status collapsed
5117
5118 \begin_layout Standard
5119
5120 -
5121 \backslash
5122 /-
5123 \end_layout
5124
5125 \end_inset
5126
5127 table which should be the users initials followed by an underscore and a
5128  short description of the data:
5129 \end_layout
5130
5131 \begin_layout LyX-Code
5132 ...
5133  | upload_to_ucsc --database=hg18 --table=mah_snoRNAs
5134 \end_layout
5135
5136 \begin_layout Standard
5137 The
5138 \series bold
5139 upload_to_ucsc
5140 \series default
5141  biotool modifies the users ~/ucsc/my_tracks.ra file automagically (a backup
5142  is created with the name ~/ucsc/my_tracks.ra~) with default values that
5143  can be overridden using the following switches:
5144 \end_layout
5145
5146 \begin_layout Itemize
5147 \begin_inset ERT
5148 status collapsed
5149
5150 \begin_layout Standard
5151
5152 -
5153 \backslash
5154 /-
5155 \end_layout
5156
5157 \end_inset
5158
5159 short_label - Short label for track - Default=database->table
5160 \end_layout
5161
5162 \begin_layout Itemize
5163 \begin_inset ERT
5164 status collapsed
5165
5166 \begin_layout Standard
5167
5168 -
5169 \backslash
5170 /-
5171 \end_layout
5172
5173 \end_inset
5174
5175 long_label - Long label for track - Default=database->table
5176 \end_layout
5177
5178 \begin_layout Itemize
5179 \begin_inset ERT
5180 status collapsed
5181
5182 \begin_layout Standard
5183
5184 -
5185 \backslash
5186 /-
5187 \end_layout
5188
5189 \end_inset
5190
5191 group - Track group name - Default=<user name as defined in env>
5192 \end_layout
5193
5194 \begin_layout Itemize
5195 \begin_inset ERT
5196 status collapsed
5197
5198 \begin_layout Standard
5199
5200 -
5201 \backslash
5202 /-
5203 \end_layout
5204
5205 \end_inset
5206
5207 priority - Track display priority - Default=1
5208 \end_layout
5209
5210 \begin_layout Itemize
5211 \begin_inset ERT
5212 status collapsed
5213
5214 \begin_layout Standard
5215
5216 -
5217 \backslash
5218 /-
5219 \end_layout
5220
5221 \end_inset
5222
5223 color - Track color - Default=147,73,42
5224 \end_layout
5225
5226 \begin_layout Itemize
5227 \begin_inset ERT
5228 status collapsed
5229
5230 \begin_layout Standard
5231
5232 -
5233 \backslash
5234 /-
5235 \end_layout
5236
5237 \end_inset
5238
5239 chunk_size - Chunks for loading - Default=10000000
5240 \end_layout
5241
5242 \begin_layout Itemize
5243 \begin_inset ERT
5244 status collapsed
5245
5246 \begin_layout Standard
5247
5248 -
5249 \backslash
5250 /-
5251 \end_layout
5252
5253 \end_inset
5254
5255 visibility - Track visibility - Default=pack
5256 \end_layout
5257
5258 \begin_layout Standard
5259 Also, data in BED or PSL format can be uploaded with
5260 \series bold
5261 upload_to_ucsc
5262 \series default
5263  as long as these reference to genomes and chromosomes existing in the UCSC
5264  Genome Browser:
5265 \end_layout
5266
5267 \begin_layout LyX-Code
5268 read_bed --data_in=<bed file> | upload_to_ucsc ...
5269 \end_layout
5270
5271 \begin_layout LyX-Code
5272
5273 \end_layout
5274
5275 \begin_layout LyX-Code
5276 read_psl --data_in=<psl file> | upload_to_ucsc ...
5277 \end_layout
5278
5279 \begin_layout Section
5280 Power Scripting
5281 \end_layout
5282
5283 \begin_layout Standard
5284 It is possible to do commandline scripting of biotool records using Perl.
5285  Because a biotool record essentially is a hash structure, you can pass
5286  records to
5287 \series bold
5288 bioscript
5289 \series default
5290  command, which is a wrapper around the Perl executable that allows direct
5291  manipulations of the records using the power of Perl.
5292 \end_layout
5293
5294 \begin_layout Standard
5295 In the below example we replace in all records the value to the CHR key
5296  with a forthrunning number:
5297 \end_layout
5298
5299 \begin_layout LyX-Code
5300 ...
5301  | bioscript 'while($r=get_record(
5302 \backslash
5303 *STDIN)){$r->{CHR}=$i++; put_record($r)}'
5304 \end_layout
5305
5306 \begin_layout Standard
5307 Something more useful would probably be to create custom FASTA headers.
5308  E.g.
5309  if we read in a BED file, lookup the genomic sequence, create a custom
5310  FASTA header with
5311 \series bold
5312 bioscript
5313 \series default
5314  and output FASTA entries:
5315 \end_layout
5316
5317 \begin_layout LyX-Code
5318 ...
5319  | bioscript 'while($r=get_record(
5320 \backslash
5321 *STDIN)){$r->{SEQ_NAME}= //
5322 \end_layout
5323
5324 \begin_layout LyX-Code
5325 join("_",$r->{CHR},$r->{CHR_BEG},$r->{CHR_END}); put_record($r)}'
5326 \end_layout
5327
5328 \begin_layout Standard
5329 And the output:
5330 \end_layout
5331
5332 \begin_layout LyX-Code
5333 >chr2L_21567527_21567550
5334 \end_layout
5335
5336 \begin_layout LyX-Code
5337 taccaaacggatgcctcagacatc
5338 \end_layout
5339
5340 \begin_layout LyX-Code
5341 >chr2L_693380_693403
5342 \end_layout
5343
5344 \begin_layout LyX-Code
5345 taccaaacggatgcctcagacatc
5346 \end_layout
5347
5348 \begin_layout LyX-Code
5349 >chr2L_13859534_13859557
5350 \end_layout
5351
5352 \begin_layout LyX-Code
5353 taccaaacggatgcctcagacatc
5354 \end_layout
5355
5356 \begin_layout LyX-Code
5357 >chr2L_9005090_9005113
5358 \end_layout
5359
5360 \begin_layout LyX-Code
5361 taccaaacggatgcctcagacatc
5362 \end_layout
5363
5364 \begin_layout LyX-Code
5365 >chr2L_2106825_2106848
5366 \end_layout
5367
5368 \begin_layout LyX-Code
5369 taccaaacggatgcctcagacatc
5370 \end_layout
5371
5372 \begin_layout LyX-Code
5373 >chr2L_14649031_14649054
5374 \end_layout
5375
5376 \begin_layout LyX-Code
5377 taccaaacggatgcctcagacatc
5378 \end_layout
5379
5380 \begin_layout Section
5381 Trouble shooting
5382 \end_layout
5383
5384 \begin_layout Standard
5385 Shoot the messenger!
5386 \end_layout
5387
5388 \begin_layout Section
5389 \start_of_appendix
5390 Keys
5391 \begin_inset LatexCommand label
5392 name "sec:Keys"
5393
5394 \end_inset
5395
5396
5397 \end_layout
5398
5399 \begin_layout Standard
5400 HIT
5401 \end_layout
5402
5403 \begin_layout Standard
5404 HIT_BEG
5405 \end_layout
5406
5407 \begin_layout Standard
5408 HIT_END
5409 \end_layout
5410
5411 \begin_layout Standard
5412 HIT_LEN
5413 \end_layout
5414
5415 \begin_layout Standard
5416 HIT_NAME
5417 \end_layout
5418
5419 \begin_layout Standard
5420 PATTERN
5421 \end_layout
5422
5423 \begin_layout Section
5424 Switches
5425 \begin_inset LatexCommand label
5426 name "sec:Switches"
5427
5428 \end_inset
5429
5430
5431 \end_layout
5432
5433 \begin_layout Standard
5434 \begin_inset ERT
5435 status collapsed
5436
5437 \begin_layout Standard
5438
5439 -
5440 \backslash
5441 /-
5442 \end_layout
5443
5444 \end_inset
5445
5446 stream_in
5447 \end_layout
5448
5449 \begin_layout Standard
5450 \begin_inset ERT
5451 status collapsed
5452
5453 \begin_layout Standard
5454
5455 -
5456 \backslash
5457 /-
5458 \end_layout
5459
5460 \end_inset
5461
5462 stream_out
5463 \end_layout
5464
5465 \begin_layout Standard
5466 \begin_inset ERT
5467 status collapsed
5468
5469 \begin_layout Standard
5470
5471 -
5472 \backslash
5473 /-
5474 \end_layout
5475
5476 \end_inset
5477
5478 no_stream
5479 \end_layout
5480
5481 \begin_layout Standard
5482 \begin_inset ERT
5483 status collapsed
5484
5485 \begin_layout Standard
5486
5487 -
5488 \backslash
5489 /-
5490 \end_layout
5491
5492 \end_inset
5493
5494 data_in
5495 \end_layout
5496
5497 \begin_layout Standard
5498 \begin_inset ERT
5499 status collapsed
5500
5501 \begin_layout Standard
5502
5503 -
5504 \backslash
5505 /-
5506 \end_layout
5507
5508 \end_inset
5509
5510 result_out
5511 \end_layout
5512
5513 \begin_layout Standard
5514 \begin_inset ERT
5515 status collapsed
5516
5517 \begin_layout Standard
5518
5519 -
5520 \backslash
5521 /-
5522 \end_layout
5523
5524 \end_inset
5525
5526 num
5527 \end_layout
5528
5529 \begin_layout Section
5530 scan_for_matches README
5531 \begin_inset LatexCommand label
5532 name "sec:scan_for_matches-README"
5533
5534 \end_inset
5535
5536
5537 \end_layout
5538
5539 \begin_layout LyX-Code
5540                           scan_for_matches:
5541 \end_layout
5542
5543 \begin_layout LyX-Code
5544     A Program to Scan Nucleotide or Protein Sequences for Matching Patterns
5545 \end_layout
5546
5547 \begin_layout LyX-Code
5548                         Ross Overbeek
5549 \end_layout
5550
5551 \begin_layout LyX-Code
5552                         MCS
5553 \end_layout
5554
5555 \begin_layout LyX-Code
5556                         Argonne National Laboratory
5557 \end_layout
5558
5559 \begin_layout LyX-Code
5560                         Argonne, IL 60439
5561 \end_layout
5562
5563 \begin_layout LyX-Code
5564                         USA
5565 \end_layout
5566
5567 \begin_layout LyX-Code
5568 Scan_for_matches is a utility that we have written to search for
5569 \end_layout
5570
5571 \begin_layout LyX-Code
5572 patterns in DNA and protein sequences.
5573   I wrote most of the code,
5574 \end_layout
5575
5576 \begin_layout LyX-Code
5577 although David Joerg and Morgan Price wrote sections of an
5578 \end_layout
5579
5580 \begin_layout LyX-Code
5581 earlier version.
5582   The whole notion of pattern matching has a rich
5583 \end_layout
5584
5585 \begin_layout LyX-Code
5586 history, and we borrowed liberally from many sources.
5587   However, it is
5588 \end_layout
5589
5590 \begin_layout LyX-Code
5591 worth noting that we were strongly influenced by the elegant tools
5592 \end_layout
5593
5594 \begin_layout LyX-Code
5595 developed and distributed by David Searls.
5596   My intent is to make the
5597 \end_layout
5598
5599 \begin_layout LyX-Code
5600 existing tool available to anyone in the research community that might
5601 \end_layout
5602
5603 \begin_layout LyX-Code
5604 find it useful.
5605   I will continue to try to fix bugs and make suggested
5606 \end_layout
5607
5608 \begin_layout LyX-Code
5609 enhancements, at least until I feel that a superior tool exists.
5610 \end_layout
5611
5612 \begin_layout LyX-Code
5613 Hence, I would appreciate it if all bug reports and suggestions are
5614 \end_layout
5615
5616 \begin_layout LyX-Code
5617 directed to me at Overbeek@mcs.anl.gov.
5618
5619 \end_layout
5620
5621 \begin_layout LyX-Code
5622 I will try to log all bug fixes and report them to users that send me
5623 \end_layout
5624
5625 \begin_layout LyX-Code
5626 their email addresses.
5627   I do not require that you give me your name
5628 \end_layout
5629
5630 \begin_layout LyX-Code
5631 and address.
5632   However, if you do give it to me, I will try to notify
5633 \end_layout
5634
5635 \begin_layout LyX-Code
5636 you of serious problems as they are discovered.
5637 \end_layout
5638
5639 \begin_layout LyX-Code
5640 Getting Started:
5641 \end_layout
5642
5643 \begin_layout LyX-Code
5644     The distribution should contain at least the following programs:
5645 \end_layout
5646
5647 \begin_layout LyX-Code
5648                 README                  -     This document
5649 \end_layout
5650
5651 \begin_layout LyX-Code
5652                 ggpunit.c               -     One of the two source files
5653 \end_layout
5654
5655 \begin_layout LyX-Code
5656                 scan_for_matches.c      -     The second source file
5657 \end_layout
5658
5659 \begin_layout LyX-Code
5660
5661 \end_layout
5662
5663 \begin_layout LyX-Code
5664                 run_tests               -     A perl script to test things
5665 \end_layout
5666
5667 \begin_layout LyX-Code
5668                 show_hits               -     A handy perl script
5669 \end_layout
5670
5671 \begin_layout LyX-Code
5672                 test_dna_input          -     Test sequences for DNA
5673 \end_layout
5674
5675 \begin_layout LyX-Code
5676                 test_dna_patterns       -     Test patterns for DNA scan
5677 \end_layout
5678
5679 \begin_layout LyX-Code
5680                 test_output             -     Desired output from test
5681 \end_layout
5682
5683 \begin_layout LyX-Code
5684                 test_prot_input         -     Test protein sequences
5685 \end_layout
5686
5687 \begin_layout LyX-Code
5688                 test_prot_patterns      -     Test patterns for proteins
5689 \end_layout
5690
5691 \begin_layout LyX-Code
5692                 testit                  -     a perl script used for test
5693 \end_layout
5694
5695 \begin_layout LyX-Code
5696     Only the first three files are required.
5697   The others are useful,
5698 \end_layout
5699
5700 \begin_layout LyX-Code
5701     but only if you have Perl installed on your system.
5702   If you do
5703 \end_layout
5704
5705 \begin_layout LyX-Code
5706     have Perl, I suggest that you type
5707 \end_layout
5708
5709 \begin_layout LyX-Code
5710
5711 \end_layout
5712
5713 \begin_layout LyX-Code
5714                 which perl
5715 \end_layout
5716
5717 \begin_layout LyX-Code
5718     to find out where it installed.
5719   On my system, I get the following
5720 \end_layout
5721
5722 \begin_layout LyX-Code
5723     response:
5724 \end_layout
5725
5726 \begin_layout LyX-Code
5727
5728 \end_layout
5729
5730 \begin_layout LyX-Code
5731                 clone% which perl
5732 \end_layout
5733
5734 \begin_layout LyX-Code
5735                 /usr/local/bin/perl
5736 \end_layout
5737
5738 \begin_layout LyX-Code
5739     indicating that Perl is installed in /usr/local/bin.
5740   Anyway, once
5741 \end_layout
5742
5743 \begin_layout LyX-Code
5744     you know where it is installed, edit the first line of files
5745 \end_layout
5746
5747 \begin_layout LyX-Code
5748         testit
5749 \end_layout
5750
5751 \begin_layout LyX-Code
5752         show_hits
5753 \end_layout
5754
5755 \begin_layout LyX-Code
5756     replacing /usr/local/bin/perl with the appropriate location.
5757   I
5758 \end_layout
5759
5760 \begin_layout LyX-Code
5761     will assume that you can do this, although it is not critical (it
5762 \end_layout
5763
5764 \begin_layout LyX-Code
5765     is needed only to test the installation and to use the "show_hits"
5766 \end_layout
5767
5768 \begin_layout LyX-Code
5769     utility).
5770   Perl is not required to actually install and run
5771 \end_layout
5772
5773 \begin_layout LyX-Code
5774     scan_for_matches.
5775
5776 \end_layout
5777
5778 \begin_layout LyX-Code
5779     If you do not have Perl, I suggest you get it and install it (it
5780 \end_layout
5781
5782 \begin_layout LyX-Code
5783     is a wonderful utility).
5784   Information about Perl and how to get it
5785 \end_layout
5786
5787 \begin_layout LyX-Code
5788     can be found in the book "Programming Perl" by Larry Wall and
5789 \end_layout
5790
5791 \begin_layout LyX-Code
5792     Randall L.
5793  Schwartz, published by O'Reilly & Associates, Inc.
5794 \end_layout
5795
5796 \begin_layout LyX-Code
5797     To get started, you will need to compile the program.
5798    I do this
5799 \end_layout
5800
5801 \begin_layout LyX-Code
5802     using
5803 \end_layout
5804
5805 \begin_layout LyX-Code
5806         gcc -O -o scan_for_matches  ggpunit.c scan_for_matches.c
5807 \end_layout
5808
5809 \begin_layout LyX-Code
5810     If you do not use GNU C, use
5811 \end_layout
5812
5813 \begin_layout LyX-Code
5814         cc -O -DCC -o scan_for_matches  ggpunit.c scan_for_matches.c
5815 \end_layout
5816
5817 \begin_layout LyX-Code
5818     which works on my Sun.
5819
5820 \end_layout
5821
5822 \begin_layout LyX-Code
5823     Once you have compiled scan_for_matches, you can verify that it
5824 \end_layout
5825
5826 \begin_layout LyX-Code
5827     works with
5828 \end_layout
5829
5830 \begin_layout LyX-Code
5831         clone% run_tests tmp
5832 \end_layout
5833
5834 \begin_layout LyX-Code
5835         clone% diff tmp test_output
5836 \end_layout
5837
5838 \begin_layout LyX-Code
5839     You may get a few strange lines of the sort
5840 \end_layout
5841
5842 \begin_layout LyX-Code
5843         clone% run_tests tmp
5844 \end_layout
5845
5846 \begin_layout LyX-Code
5847         rm: tmp: No such file or directory
5848 \end_layout
5849
5850 \begin_layout LyX-Code
5851         clone% diff tmp test_output
5852 \end_layout
5853
5854 \begin_layout LyX-Code
5855     These should cause no concern.
5856   However, if the "diff" shows that
5857 \end_layout
5858
5859 \begin_layout LyX-Code
5860     tmp and test_output are different, contact me (you have a
5861 \end_layout
5862
5863 \begin_layout LyX-Code
5864     problem).
5865
5866 \end_layout
5867
5868 \begin_layout LyX-Code
5869     You should now be able to use scan_for_matches by following the
5870 \end_layout
5871
5872 \begin_layout LyX-Code
5873     instructions given below (which is all the normal user should have
5874 \end_layout
5875
5876 \begin_layout LyX-Code
5877     to understand, once things are installed properly).
5878 \end_layout
5879
5880 \begin_layout LyX-Code
5881  ==============================================================
5882 \end_layout
5883
5884 \begin_layout LyX-Code
5885 How to run scan_for_matches:
5886 \end_layout
5887
5888 \begin_layout LyX-Code
5889     To run the program, you type need to create two files
5890 \end_layout
5891
5892 \begin_layout LyX-Code
5893     1.
5894   the first file contains the pattern you wish to scan for; I'll
5895 \end_layout
5896
5897 \begin_layout LyX-Code
5898         call this file pat_file in what follows (but any name is ok)
5899 \end_layout
5900
5901 \begin_layout LyX-Code
5902     2.
5903   the second file contains a set of sequences to scan.
5904   These
5905 \end_layout
5906
5907 \begin_layout LyX-Code
5908         should be in "fasta format".
5909   Just look at the contents of
5910 \end_layout
5911
5912 \begin_layout LyX-Code
5913         test_dna_input to see examples of this format.
5914   Basically,
5915 \end_layout
5916
5917 \begin_layout LyX-Code
5918         each sequence begins with a line of the form
5919 \end_layout
5920
5921 \begin_layout LyX-Code
5922            >sequence_id
5923 \end_layout
5924
5925 \begin_layout LyX-Code
5926         and is followed by one or more lines containing the sequence.
5927 \end_layout
5928
5929 \begin_layout LyX-Code
5930     Once these files have been created, you just use
5931 \end_layout
5932
5933 \begin_layout LyX-Code
5934         scan_for_matches pat_file < input_file
5935 \end_layout
5936
5937 \begin_layout LyX-Code
5938     to scan all of the input sequences for the given pattern.
5939   As an
5940 \end_layout
5941
5942 \begin_layout LyX-Code
5943     example, suppose that pat_file contains a single line of the form
5944 \end_layout
5945
5946 \begin_layout LyX-Code
5947                 p1=4...7 3...8 ~p1
5948 \end_layout
5949
5950 \begin_layout LyX-Code
5951     Then,
5952 \end_layout
5953
5954 \begin_layout LyX-Code
5955                 scan_for_matches pat_file < test_dna_input
5956 \end_layout
5957
5958 \begin_layout LyX-Code
5959     should produce two "hits".
5960   When I run this on my machine, I get
5961 \end_layout
5962
5963 \begin_layout LyX-Code
5964         clone% scan_for_matches pat_file < test_dna_input
5965 \end_layout
5966
5967 \begin_layout LyX-Code
5968         >tst1:[6,27]
5969 \end_layout
5970
5971 \begin_layout LyX-Code
5972         cguaacc ggttaacc gguuacg
5973 \end_layout
5974
5975 \begin_layout LyX-Code
5976         >tst2:[6,27]
5977 \end_layout
5978
5979 \begin_layout LyX-Code
5980         CGUAACC GGTTAACC GGUUACG
5981 \end_layout
5982
5983 \begin_layout LyX-Code
5984         clone%
5985 \end_layout
5986
5987 \begin_layout LyX-Code
5988 Simple Patterns Built by Matching Ranges and Reverse Complements
5989 \end_layout
5990
5991 \begin_layout LyX-Code
5992     Let me first explain this simple pattern:
5993 \end_layout
5994
5995 \begin_layout LyX-Code
5996
5997 \end_layout
5998
5999 \begin_layout LyX-Code
6000                 p1=4...7 3...8 ~p1
6001 \end_layout
6002
6003 \begin_layout LyX-Code
6004     The pattern consists of three "pattern units" separated by spaces.
6005 \end_layout
6006
6007 \begin_layout LyX-Code
6008     The first pattern unit is
6009 \end_layout
6010
6011 \begin_layout LyX-Code
6012                 p1=4...7
6013 \end_layout
6014
6015 \begin_layout LyX-Code
6016     which means "match 4 to 7 characters and call them p1".
6017   The
6018 \end_layout
6019
6020 \begin_layout LyX-Code
6021     second pattern unit is
6022 \end_layout
6023
6024 \begin_layout LyX-Code
6025                 3...8
6026 \end_layout
6027
6028 \begin_layout LyX-Code
6029     which means "then match 3 to 8 characters".
6030   The last pattern unit
6031 \end_layout
6032
6033 \begin_layout LyX-Code
6034     is
6035 \end_layout
6036
6037 \begin_layout LyX-Code
6038                 ~p1
6039 \end_layout
6040
6041 \begin_layout LyX-Code
6042     which means "match the reverse complement of p1".
6043   The first
6044 \end_layout
6045
6046 \begin_layout LyX-Code
6047     reported hit is shown as
6048 \end_layout
6049
6050 \begin_layout LyX-Code
6051         >tst1:[6,27]
6052 \end_layout
6053
6054 \begin_layout LyX-Code
6055         cguaacc ggttaacc gguuacg
6056 \end_layout
6057
6058 \begin_layout LyX-Code
6059     which states that characters 6 through 27 of sequence tst1 were
6060 \end_layout
6061
6062 \begin_layout LyX-Code
6063     matched.
6064   "cguaac" matched the first pattern unit, "ggttaacc" the
6065 \end_layout
6066
6067 \begin_layout LyX-Code
6068     second, and "gguuacg" the third.
6069   This is an example of a common
6070 \end_layout
6071
6072 \begin_layout LyX-Code
6073     type of pattern used to search for sections of DNA or RNA that
6074 \end_layout
6075
6076 \begin_layout LyX-Code
6077     would fold into a hairpin loop.
6078 \end_layout
6079
6080 \begin_layout LyX-Code
6081 Searching Both Strands
6082 \end_layout
6083
6084 \begin_layout LyX-Code
6085     Now for a short aside: scan_for_matches only searched the
6086 \end_layout
6087
6088 \begin_layout LyX-Code
6089     sequences in the input file; it did not search the opposite
6090 \end_layout
6091
6092 \begin_layout LyX-Code
6093     strand.
6094   With a pattern of the sort we just used, there is not
6095 \end_layout
6096
6097 \begin_layout LyX-Code
6098     need o search the opposite strand.
6099   However, it is normally the
6100 \end_layout
6101
6102 \begin_layout LyX-Code
6103     case that you will wish to search both the sequence and the
6104 \end_layout
6105
6106 \begin_layout LyX-Code
6107     opposite strand (i.e., the reverse complement of the sequence).
6108 \end_layout
6109
6110 \begin_layout LyX-Code
6111     To do that, you would just use the "-c" command line.
6112   For example,
6113 \end_layout
6114
6115 \begin_layout LyX-Code
6116         scan_for_matches -c pat_file < test_dna_input
6117 \end_layout
6118
6119 \begin_layout LyX-Code
6120     Hits on the opposite strand will show a beginning location greater
6121 \end_layout
6122
6123 \begin_layout LyX-Code
6124     than te end location of the match.
6125 \end_layout
6126
6127 \begin_layout LyX-Code
6128 Defining Pairing Rules and Allowing Mismatches, Insertions, and Deletions
6129 \end_layout
6130
6131 \begin_layout LyX-Code
6132     Let us stop now and ask "What additional features would one need to
6133 \end_layout
6134
6135 \begin_layout LyX-Code
6136     really find the kinds of loop structures that characterize tRNAs,
6137 \end_layout
6138
6139 \begin_layout LyX-Code
6140     rRNAs, and so forth?"  I can immediately think of two:
6141 \end_layout
6142
6143 \begin_layout LyX-Code
6144         a) you will need to be able to allow non-standard pairings
6145 \end_layout
6146
6147 \begin_layout LyX-Code
6148            (those other than G-C and A-U), and
6149 \end_layout
6150
6151 \begin_layout LyX-Code
6152         b) you will need to be able to tolerate some number of
6153 \end_layout
6154
6155 \begin_layout LyX-Code
6156            mismatches and bulges.
6157 \end_layout
6158
6159 \begin_layout LyX-Code
6160
6161 \end_layout
6162
6163 \begin_layout LyX-Code
6164     Let me first show you how to handle non-standard "rules for
6165 \end_layout
6166
6167 \begin_layout LyX-Code
6168     pairing in reverse complements".
6169   Consider the following pattern,
6170 \end_layout
6171
6172 \begin_layout LyX-Code
6173     which I show as two line (you may use as many lines as you like in
6174 \end_layout
6175
6176 \begin_layout LyX-Code
6177     forming a pattern, although you can only break a pattern at points
6178 \end_layout
6179
6180 \begin_layout LyX-Code
6181     where space would be legal):
6182 \end_layout
6183
6184 \begin_layout LyX-Code
6185             r1={au,ua,gc,cg,gu,ug,ga,ag}
6186 \end_layout
6187
6188 \begin_layout LyX-Code
6189             p1=2...3 0...4 p2=2...5 1...5 r1~p2 0...4 ~p1
6190 \end_layout
6191
6192 \begin_layout LyX-Code
6193     The first "pattern unit" does not actually match anything; rather,
6194 \end_layout
6195
6196 \begin_layout LyX-Code
6197     it defines a "pairing rule" in which standard pairings are
6198 \end_layout
6199
6200 \begin_layout LyX-Code
6201     allowed, as well as G-A and A-G (in case you wondered, Us and Ts
6202 \end_layout
6203
6204 \begin_layout LyX-Code
6205     and upper and lower case can be used interchangably; for example
6206 \end_layout
6207
6208 \begin_layout LyX-Code
6209     r1={AT,UA,gc,cg} could be used to define the "standard rule" for
6210 \end_layout
6211
6212 \begin_layout LyX-Code
6213     pairings).
6214   The second line consists of six pattern units which
6215 \end_layout
6216
6217 \begin_layout LyX-Code
6218     may be interpreted as follows:
6219 \end_layout
6220
6221 \begin_layout LyX-Code
6222             p1=2...3     match 2 or 3 characters (call it p1)
6223 \end_layout
6224
6225 \begin_layout LyX-Code
6226             0...4        match 0 to 4 characters
6227 \end_layout
6228
6229 \begin_layout LyX-Code
6230             p2=2...5     match 2 to 5 characters (call it p2)
6231 \end_layout
6232
6233 \begin_layout LyX-Code
6234             1...5        match 1 to 5 characters
6235 \end_layout
6236
6237 \begin_layout LyX-Code
6238             r1~p2        match the reverse complement of p2,
6239 \end_layout
6240
6241 \begin_layout LyX-Code
6242                             allowing G-A and A-G pairs
6243 \end_layout
6244
6245 \begin_layout LyX-Code
6246             0...4        match 0 to 4 characters
6247 \end_layout
6248
6249 \begin_layout LyX-Code
6250             ~p1          match the reverse complement of p1
6251 \end_layout
6252
6253 \begin_layout LyX-Code
6254                             allowing only G-C, C-G, A-T, and T-A pairs
6255 \end_layout
6256
6257 \begin_layout LyX-Code
6258     Thus, r1~p2 means "match the reverse complement of p2 using rule r1".
6259 \end_layout
6260
6261 \begin_layout LyX-Code
6262     Now let us consider the issue of tolerating mismatches and bulges.
6263 \end_layout
6264
6265 \begin_layout LyX-Code
6266     You may add a "qualifier" to the pattern unit that gives the
6267 \end_layout
6268
6269 \begin_layout LyX-Code
6270     tolerable number of "mismatches, deletions, and insertions".
6271 \end_layout
6272
6273 \begin_layout LyX-Code
6274     Thus,
6275 \end_layout
6276
6277 \begin_layout LyX-Code
6278                 p1=10...10 3...8 ~p1[1,2,1]
6279 \end_layout
6280
6281 \begin_layout LyX-Code
6282     means that the third pattern unit must match 10 characters,
6283 \end_layout
6284
6285 \begin_layout LyX-Code
6286     allowing one "mismatch" (a pairing other than G-C, C-G, A-T, or
6287 \end_layout
6288
6289 \begin_layout LyX-Code
6290     T-A), two deletions (a deletion is a character that occurs in p1,
6291 \end_layout
6292
6293 \begin_layout LyX-Code
6294     but has been "deleted" from the string matched by ~p1), and one
6295 \end_layout
6296
6297 \begin_layout LyX-Code
6298     insertion (an "insertion" is a character that occurs in the string
6299 \end_layout
6300
6301 \begin_layout LyX-Code
6302     matched by ~p1, but not for which no corresponding character
6303 \end_layout
6304
6305 \begin_layout LyX-Code
6306     occurs in p1).
6307   In this case, the pattern would match
6308 \end_layout
6309
6310 \begin_layout LyX-Code
6311               ACGTACGTAC GGGGGGGG GCGTTACCT
6312 \end_layout
6313
6314 \begin_layout LyX-Code
6315     which is, you must admit, a fairly weak loop.
6316   It is common to
6317 \end_layout
6318
6319 \begin_layout LyX-Code
6320     allow mismatches, but you will find yourself using insertions and
6321 \end_layout
6322
6323 \begin_layout LyX-Code
6324     deletions much more rarely.
6325   In any event, you should note that
6326 \end_layout
6327
6328 \begin_layout LyX-Code
6329     allowing mismatches, insertions, and deletions does force the
6330 \end_layout
6331
6332 \begin_layout LyX-Code
6333     program to try many additional possible pairings, so it does slow
6334 \end_layout
6335
6336 \begin_layout LyX-Code
6337     things down a bit.
6338 \end_layout
6339
6340 \begin_layout LyX-Code
6341 How Patterns Are Matched
6342 \end_layout
6343
6344 \begin_layout LyX-Code
6345     Now is as good a time as any to discuss the basic flow of control
6346 \end_layout
6347
6348 \begin_layout LyX-Code
6349     when matching patterns.
6350   Recall that a "pattern" is a sequence of
6351 \end_layout
6352
6353 \begin_layout LyX-Code
6354     "pattern units".
6355   Suppose that the pattern units were
6356 \end_layout
6357
6358 \begin_layout LyX-Code
6359         u1 u2 u3 u4 ...
6360  un
6361 \end_layout
6362
6363 \begin_layout LyX-Code
6364     The scan of a sequence S begins by setting the current position
6365 \end_layout
6366
6367 \begin_layout LyX-Code
6368     to 1.
6369   Then, an attempt is made to match u1 starting at the
6370 \end_layout
6371
6372 \begin_layout LyX-Code
6373     current position.
6374   Each attempt to match a pattern unit can
6375 \end_layout
6376
6377 \begin_layout LyX-Code
6378     succeed or fail.
6379   If it succeeds, then an attempt is made to match
6380 \end_layout
6381
6382 \begin_layout LyX-Code
6383     the next unit.
6384   If it fails, then an attempt is made to find an
6385 \end_layout
6386
6387 \begin_layout LyX-Code
6388     alternative match for the immediately preceding pattern unit.
6389   If
6390 \end_layout
6391
6392 \begin_layout LyX-Code
6393     this succeeds, then we proceed forward again to the next unit.
6394   If
6395 \end_layout
6396
6397 \begin_layout LyX-Code
6398     it fails we go back to the preceding unit.
6399   This process is called
6400 \end_layout
6401
6402 \begin_layout LyX-Code
6403     "backtracking".
6404   If there are no previous units, then the current
6405 \end_layout
6406
6407 \begin_layout LyX-Code
6408     position is incremented by one, and everything starts again.
6409   This
6410 \end_layout
6411
6412 \begin_layout LyX-Code
6413     proceeds until either the current position goes past the end of
6414 \end_layout
6415
6416 \begin_layout LyX-Code
6417     the sequence or all of the pattern units succeed.
6418   On success,
6419 \end_layout
6420
6421 \begin_layout LyX-Code
6422     scan_for_matches reports the "hit", the current position is set
6423 \end_layout
6424
6425 \begin_layout LyX-Code
6426     just past the hit, and an attempt is made to find another hit.
6427 \end_layout
6428
6429 \begin_layout LyX-Code
6430     If you wish to limit the scan to simply finding a maximum of, say,
6431 \end_layout
6432
6433 \begin_layout LyX-Code
6434     10 hits, you can use the -n option (-n 10 would set the limit to
6435 \end_layout
6436
6437 \begin_layout LyX-Code
6438     10 reported hits).
6439   For example,
6440 \end_layout
6441
6442 \begin_layout LyX-Code
6443         scan_for_matches -c -n 1 pat_file < test_dna_input
6444 \end_layout
6445
6446 \begin_layout LyX-Code
6447     would search for just the first hit (and would stop searching the
6448 \end_layout
6449
6450 \begin_layout LyX-Code
6451     current sequences or any that follow in the input file).
6452 \end_layout
6453
6454 \begin_layout LyX-Code
6455 Searching for repeats:
6456 \end_layout
6457
6458 \begin_layout LyX-Code
6459     In the last section, I discussed almost all of the details
6460 \end_layout
6461
6462 \begin_layout LyX-Code
6463     required to allow you to look for repeats.
6464   Consider the following
6465 \end_layout
6466
6467 \begin_layout LyX-Code
6468     set of patterns:
6469 \end_layout
6470
6471 \begin_layout LyX-Code
6472         p1=6...6 3...8 p1   (find exact 6 character repeat separated
6473 \end_layout
6474
6475 \begin_layout LyX-Code
6476                              by to 8 characters)
6477 \end_layout
6478
6479 \begin_layout LyX-Code
6480         p1=6...6 3..8 p1[1,0,0]   (allow one mismatch)
6481 \end_layout
6482
6483 \begin_layout LyX-Code
6484         p1=3...3 p1[1,0,0] p1[1,0,0] p1[1,0,0]
6485 \end_layout
6486
6487 \begin_layout LyX-Code
6488                             (match 12 characters that are the remains
6489 \end_layout
6490
6491 \begin_layout LyX-Code
6492                              of a 3-character sequence occurring 4 times)
6493 \end_layout
6494
6495 \begin_layout LyX-Code
6496
6497 \end_layout
6498
6499 \begin_layout LyX-Code
6500         p1=4...8 0...3 p2=6...8 p1 0...3 p2
6501 \end_layout
6502
6503 \begin_layout LyX-Code
6504                             (This would match things like
6505 \end_layout
6506
6507 \begin_layout LyX-Code
6508                                 ATCT G TCTTT ATCT TG TCTTT
6509 \end_layout
6510
6511 \begin_layout LyX-Code
6512                             )
6513 \end_layout
6514
6515 \begin_layout LyX-Code
6516 Searching for particular sequences:
6517 \end_layout
6518
6519 \begin_layout LyX-Code
6520     Occasionally, one wishes to match a specific, known sequence.
6521 \end_layout
6522
6523 \begin_layout LyX-Code
6524     In such a case, you can just give the sequence (along with an
6525 \end_layout
6526
6527 \begin_layout LyX-Code
6528     optional statement of the allowable mismatches, insertions, and
6529 \end_layout
6530
6531 \begin_layout LyX-Code
6532     deletions).
6533   Thus,
6534 \end_layout
6535
6536 \begin_layout LyX-Code
6537         p1=6...8 GAGA ~p1    (match a hairpin with GAGA as the loop)
6538 \end_layout
6539
6540 \begin_layout LyX-Code
6541         RRRRYYYY             (match 4 purines followed by 4 pyrimidines)
6542 \end_layout
6543
6544 \begin_layout LyX-Code
6545         TATAA[1,0,0]         (match TATAA, allowing 1 mismatch)
6546 \end_layout
6547
6548 \begin_layout LyX-Code
6549
6550 \end_layout
6551
6552 \begin_layout LyX-Code
6553 Matches against a "weight matrix":
6554 \end_layout
6555
6556 \begin_layout LyX-Code
6557     I will conclude my examples of the types of pattern units
6558 \end_layout
6559
6560 \begin_layout LyX-Code
6561     available for matching against nucleotide sequences by discussing a
6562 \end_layout
6563
6564 \begin_layout LyX-Code
6565     crude implemetation of matching using a "weight matrix".
6566   While I
6567 \end_layout
6568
6569 \begin_layout LyX-Code
6570     am less than overwhelmed with the syntax that I chose, I think that
6571 \end_layout
6572
6573 \begin_layout LyX-Code
6574     the reader should be aware that I was thinking of generating
6575 \end_layout
6576
6577 \begin_layout LyX-Code
6578     patterns containing such pattern units automatically from
6579 \end_layout
6580
6581 \begin_layout LyX-Code
6582     alignments (and did not really plan on typing such things in by
6583 \end_layout
6584
6585 \begin_layout LyX-Code
6586     hand very often).
6587   Anyway, suppose that you wanted to match a
6588 \end_layout
6589
6590 \begin_layout LyX-Code
6591     sequence of eight characters.
6592   The "consensus" of these eight
6593 \end_layout
6594
6595 \begin_layout LyX-Code
6596     characters is GRCACCGS, but the actual "frequencies of occurrence"
6597 \end_layout
6598
6599 \begin_layout LyX-Code
6600     are given in the matrix below.
6601   Thus, the first character is an A
6602 \end_layout
6603
6604 \begin_layout LyX-Code
6605     16% the time and a G 84% of the time.
6606   The second is an A 57% of
6607 \end_layout
6608
6609 \begin_layout LyX-Code
6610     the time, a C 10% of the time, a G 29% of the time, and a T 4% of
6611 \end_layout
6612
6613 \begin_layout LyX-Code
6614     the time.
6615
6616 \end_layout
6617
6618 \begin_layout LyX-Code
6619              C1     C2    C3    C4   C5    C6    C7    C8
6620 \end_layout
6621
6622 \begin_layout LyX-Code
6623
6624 \end_layout
6625
6626 \begin_layout LyX-Code
6627        A     16     57     0    95    0    18     0     0
6628 \end_layout
6629
6630 \begin_layout LyX-Code
6631        C      0     10    80     0  100    60     0    50
6632 \end_layout
6633
6634 \begin_layout LyX-Code
6635        G     84     29     0     0    0    20   100    50
6636 \end_layout
6637
6638 \begin_layout LyX-Code
6639        T      0      4    20     5    0     2     0     0
6640 \end_layout
6641
6642 \begin_layout LyX-Code
6643
6644 \end_layout
6645
6646 \begin_layout LyX-Code
6647     One could use the following pattern unit to search for inexact
6648 \end_layout
6649
6650 \begin_layout LyX-Code
6651     matches related to such a "weight matrix":
6652 \end_layout
6653
6654 \begin_layout LyX-Code
6655         {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5),
6656 \end_layout
6657
6658 \begin_layout LyX-Code
6659          (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450
6660 \end_layout
6661
6662 \begin_layout LyX-Code
6663     This pattern unit will attempt to match exactly eight characters.
6664 \end_layout
6665
6666 \begin_layout LyX-Code
6667     For each character in the sequence, the entry in the corresponding
6668 \end_layout
6669
6670 \begin_layout LyX-Code
6671     tuple is added to an accumulated sum.
6672   If the sum is greater than
6673 \end_layout
6674
6675 \begin_layout LyX-Code
6676     450, the match succeeds; else it fails.
6677 \end_layout
6678
6679 \begin_layout LyX-Code
6680     Recently, this feature was upgraded to allow ranges.
6681   Thus,
6682 \end_layout
6683
6684 \begin_layout LyX-Code
6685   600 >  {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5),
6686 \end_layout
6687
6688 \begin_layout LyX-Code
6689          (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450
6690 \end_layout
6691
6692 \begin_layout LyX-Code
6693     will work, as well.
6694 \end_layout
6695
6696 \begin_layout LyX-Code
6697 Allowing Alternatives:
6698 \end_layout
6699
6700 \begin_layout LyX-Code
6701     Very occasionally, you may wish to allow alternative pattern units
6702 \end_layout
6703
6704 \begin_layout LyX-Code
6705     (i.e., "match either A or B").
6706   You can do this using something
6707 \end_layout
6708
6709 \begin_layout LyX-Code
6710     like
6711 \end_layout
6712
6713 \begin_layout LyX-Code
6714                 ( GAGA | GCGCA)
6715 \end_layout
6716
6717 \begin_layout LyX-Code
6718     which says "match either GAGA or GCGCA".
6719   You may take
6720 \end_layout
6721
6722 \begin_layout LyX-Code
6723     alternatives of a list of pattern units, for example
6724 \end_layout
6725
6726 \begin_layout LyX-Code
6727         (p1=3...3 3...8 ~p1 | p1=5...5 4...4 ~p1 GGG)
6728 \end_layout
6729
6730 \begin_layout LyX-Code
6731     would match one of two sequences of pattern units.
6732   There is one
6733 \end_layout
6734
6735 \begin_layout LyX-Code
6736     clumsy aspect of the syntax: to match a list of alternatives, you
6737 \end_layout
6738
6739 \begin_layout LyX-Code
6740     need to fully the request.
6741   Thus,
6742 \end_layout
6743
6744 \begin_layout LyX-Code
6745         (GAGA | (GCGCA | TTCGA))
6746 \end_layout
6747
6748 \begin_layout LyX-Code
6749     would be needed to try the three alternatives.
6750 \end_layout
6751
6752 \begin_layout LyX-Code
6753 One Minor Extension
6754 \end_layout
6755
6756 \begin_layout LyX-Code
6757     Sometimes a pattern will contain a sequence of distinct ranges,
6758 \end_layout
6759
6760 \begin_layout LyX-Code
6761     and you might wish to limit the sum of the lengths of the matched
6762 \end_layout
6763
6764 \begin_layout LyX-Code
6765     subsequences.
6766    For example, suppose that you basically wanted to
6767 \end_layout
6768
6769 \begin_layout LyX-Code
6770     match something like
6771 \end_layout
6772
6773 \begin_layout LyX-Code
6774     ARRYYTT p1=0...5 GCA[1,0,0] p2=1...6 ~p1 4...8 ~p2 p3=4...10 CCT
6775 \end_layout
6776
6777 \begin_layout LyX-Code
6778     but that the sum of the lengths of p1, p2, and p3 must not exceed
6779 \end_layout
6780
6781 \begin_layout LyX-Code
6782     eight characters.
6783   To do this, you could add
6784 \end_layout
6785
6786 \begin_layout LyX-Code
6787         length(p1+p2+p3) < 9
6788 \end_layout
6789
6790 \begin_layout LyX-Code
6791     as the last pattern unit.
6792   It will just succeed or fail (but does
6793 \end_layout
6794
6795 \begin_layout LyX-Code
6796     not actually match any characters in the sequence).
6797 \end_layout
6798
6799 \begin_layout LyX-Code
6800
6801 \end_layout
6802
6803 \begin_layout LyX-Code
6804 Matching Protein Sequences
6805 \end_layout
6806
6807 \begin_layout LyX-Code
6808     Suppose that the input file contains protein sequences.
6809   In this
6810 \end_layout
6811
6812 \begin_layout LyX-Code
6813     case, you must invoke scan_for_matches with the "-p" option.
6814   You
6815 \end_layout
6816
6817 \begin_layout LyX-Code
6818     cannot use aspects of the language that relate directly to
6819 \end_layout
6820
6821 \begin_layout LyX-Code
6822     nucleotide sequences (e.g., the -c command line option or pattern
6823 \end_layout
6824
6825 \begin_layout LyX-Code
6826     constructs referring to the reverse complement of a previously
6827 \end_layout
6828
6829 \begin_layout LyX-Code
6830     matched unit).
6831
6832 \end_layout
6833
6834 \begin_layout LyX-Code
6835     You also have two additional constructs that allow you to match
6836 \end_layout
6837
6838 \begin_layout LyX-Code
6839     either "one of a set of amino acids" or "any amino acid other than
6840 \end_layout
6841
6842 \begin_layout LyX-Code
6843     those a given set".
6844   For example,
6845 \end_layout
6846
6847 \begin_layout LyX-Code
6848         p1=0...4 any(HQD) 1...3 notany(HK) p1
6849 \end_layout
6850
6851 \begin_layout LyX-Code
6852     would successfully match a string like
6853 \end_layout
6854
6855 \begin_layout LyX-Code
6856            YWV D AA C YWV
6857 \end_layout
6858
6859 \begin_layout LyX-Code
6860 Using the show_hits Utility
6861 \end_layout
6862
6863 \begin_layout LyX-Code
6864     When viewing a large set of complex matches, you might find it
6865 \end_layout
6866
6867 \begin_layout LyX-Code
6868     convenient to post-process the scan_for_matches output to get a
6869 \end_layout
6870
6871 \begin_layout LyX-Code
6872     more readable version.
6873   We provide a simple post-processor called
6874 \end_layout
6875
6876 \begin_layout LyX-Code
6877     "show_hits".
6878   To see its effect, just pipe the output of a
6879 \end_layout
6880
6881 \begin_layout LyX-Code
6882     scan_for_matches into show_hits:
6883 \end_layout
6884
6885 \begin_layout LyX-Code
6886      Normal Output:
6887 \end_layout
6888
6889 \begin_layout LyX-Code
6890         clone% scan_for_matches -c pat_file < tmp
6891 \end_layout
6892
6893 \begin_layout LyX-Code
6894         >tst1:[1,28]
6895 \end_layout
6896
6897 \begin_layout LyX-Code
6898         gtacguaacc  ggttaac cgguuacgtac
6899 \end_layout
6900
6901 \begin_layout LyX-Code
6902         >tst1:[28,1]
6903 \end_layout
6904
6905 \begin_layout LyX-Code
6906         gtacgtaacc  ggttaac cggttacgtac
6907 \end_layout
6908
6909 \begin_layout LyX-Code
6910         >tst2:[2,31]
6911 \end_layout
6912
6913 \begin_layout LyX-Code
6914         CGTACGUAAC C GGTTAACC GGUUACGTACG
6915 \end_layout
6916
6917 \begin_layout LyX-Code
6918         >tst2:[31,2]
6919 \end_layout
6920
6921 \begin_layout LyX-Code
6922         CGTACGTAAC C GGTTAACC GGTTACGTACG
6923 \end_layout
6924
6925 \begin_layout LyX-Code
6926         >tst3:[3,32]
6927 \end_layout
6928
6929 \begin_layout LyX-Code
6930         gtacguaacc g gttaactt cgguuacgtac
6931 \end_layout
6932
6933 \begin_layout LyX-Code
6934         >tst3:[32,3]
6935 \end_layout
6936
6937 \begin_layout LyX-Code
6938         gtacgtaacc g aagttaac cggttacgtac
6939 \end_layout
6940
6941 \begin_layout LyX-Code
6942      Piped Through show_hits:
6943 \end_layout
6944
6945 \begin_layout LyX-Code
6946
6947 \end_layout
6948
6949 \begin_layout LyX-Code
6950         clone% scan_for_matches -c pat_file < tmp | show_hits
6951 \end_layout
6952
6953 \begin_layout LyX-Code
6954         tst1:[1,28]:  gtacguaacc   ggttaac  cgguuacgtac
6955 \end_layout
6956
6957 \begin_layout LyX-Code
6958         tst1:[28,1]:  gtacgtaacc   ggttaac  cggttacgtac
6959 \end_layout
6960
6961 \begin_layout LyX-Code
6962         tst2:[2,31]:  CGTACGUAAC C GGTTAACC GGUUACGTACG
6963 \end_layout
6964
6965 \begin_layout LyX-Code
6966         tst2:[31,2]:  CGTACGTAAC C GGTTAACC GGTTACGTACG
6967 \end_layout
6968
6969 \begin_layout LyX-Code
6970         tst3:[3,32]:  gtacguaacc g gttaactt cgguuacgtac
6971 \end_layout
6972
6973 \begin_layout LyX-Code
6974         tst3:[32,3]:  gtacgtaacc g aagttaac cggttacgtac
6975 \end_layout
6976
6977 \begin_layout LyX-Code
6978         clone%
6979 \end_layout
6980
6981 \begin_layout LyX-Code
6982     Optionally, you can specify which of the "fields" in the matches
6983 \end_layout
6984
6985 \begin_layout LyX-Code
6986     you wish to sort on, and show_hits will sort them.
6987   The field
6988 \end_layout
6989
6990 \begin_layout LyX-Code
6991     numbers start with 0.
6992   So, you might get something like
6993 \end_layout
6994
6995 \begin_layout LyX-Code
6996         clone% scan_for_matches -c pat_file < tmp | show_hits 2 1
6997 \end_layout
6998
6999 \begin_layout LyX-Code
7000         tst2:[2,31]:  CGTACGUAAC C GGTTAACC GGUUACGTACG
7001 \end_layout
7002
7003 \begin_layout LyX-Code
7004         tst2:[31,2]:  CGTACGTAAC C GGTTAACC GGTTACGTACG
7005 \end_layout
7006
7007 \begin_layout LyX-Code
7008         tst3:[32,3]:  gtacgtaacc g aagttaac cggttacgtac
7009 \end_layout
7010
7011 \begin_layout LyX-Code
7012         tst1:[1,28]:  gtacguaacc   ggttaac  cgguuacgtac
7013 \end_layout
7014
7015 \begin_layout LyX-Code
7016         tst1:[28,1]:  gtacgtaacc   ggttaac  cggttacgtac
7017 \end_layout
7018
7019 \begin_layout LyX-Code
7020         tst3:[3,32]:  gtacguaacc g gttaactt cgguuacgtac
7021 \end_layout
7022
7023 \begin_layout LyX-Code
7024         clone%
7025 \end_layout
7026
7027 \begin_layout LyX-Code
7028     In this case, the hits have been sorted on fields 2 and 1 (that is,
7029 \end_layout
7030
7031 \begin_layout LyX-Code
7032     the third and second matched subfields).
7033 \end_layout
7034
7035 \begin_layout LyX-Code
7036     show_hits is just one possible little post-processor, and you
7037 \end_layout
7038
7039 \begin_layout LyX-Code
7040     might well wish to write a customized one for yourself.
7041 \end_layout
7042
7043 \begin_layout LyX-Code
7044 Reducing the Cost of a Search
7045 \end_layout
7046
7047 \begin_layout LyX-Code
7048     The scan_for_matches utility uses a fairly simple search, and may
7049 \end_layout
7050
7051 \begin_layout LyX-Code
7052     consume large amounts of CPU time for complex patterns.
7053   Someday,
7054 \end_layout
7055
7056 \begin_layout LyX-Code
7057     I may decide to optimize the code.
7058   However, until then, let me
7059 \end_layout
7060
7061 \begin_layout LyX-Code
7062     mention one useful technique.
7063
7064 \end_layout
7065
7066 \begin_layout LyX-Code
7067     When you have a complex pattern that includes a number of varying
7068 \end_layout
7069
7070 \begin_layout LyX-Code
7071     ranges, imprecise matches, and so forth, it is useful to
7072 \end_layout
7073
7074 \begin_layout LyX-Code
7075     "pipeline" matches.
7076   That is, form a simpler pattern that can be
7077 \end_layout
7078
7079 \begin_layout LyX-Code
7080     used to scan through a large database extracting sections that
7081 \end_layout
7082
7083 \begin_layout LyX-Code
7084     might be matched by the more complex pattern.
7085   Let me illustrate
7086 \end_layout
7087
7088 \begin_layout LyX-Code
7089     with a short example.
7090   Suppose that you really wished to match the
7091 \end_layout
7092
7093 \begin_layout LyX-Code
7094     pattern
7095 \end_layout
7096
7097 \begin_layout LyX-Code
7098     p1=3...5 0...8 ~p1[1,1,0] p2=6...7 3...6 AGC 3...5 RYGC ~p2[1,0,0]
7099 \end_layout
7100
7101 \begin_layout LyX-Code
7102     In this case, the pattern units AGC 3...5 RYGC can be used to rapidly
7103 \end_layout
7104
7105 \begin_layout LyX-Code
7106     constrain the overall search.
7107   You can preprocess the overall
7108 \end_layout
7109
7110 \begin_layout LyX-Code
7111     database using the pattern:
7112 \end_layout
7113
7114 \begin_layout LyX-Code
7115           31...31 AGC 3...5 RYGC 7...7
7116 \end_layout
7117
7118 \begin_layout LyX-Code
7119     Put the complex pattern in pat_file1 and the simpler pattern in
7120 \end_layout
7121
7122 \begin_layout LyX-Code
7123     pat_file2.
7124   Then use,
7125 \end_layout
7126
7127 \begin_layout LyX-Code
7128         scan_for_matches -c pat_file2 < nucleotide_database |
7129 \end_layout
7130
7131 \begin_layout LyX-Code
7132         scan_for_matches pat_file1
7133 \end_layout
7134
7135 \begin_layout LyX-Code
7136     The output will show things like
7137 \end_layout
7138
7139 \begin_layout LyX-Code
7140     >seqid:[232,280][2,47]
7141 \end_layout
7142
7143 \begin_layout LyX-Code
7144     matches pieces
7145 \end_layout
7146
7147 \begin_layout LyX-Code
7148     Then, the actual section of the sequence that was matched can be
7149 \end_layout
7150
7151 \begin_layout LyX-Code
7152     easily computed as [233,278] (remember, the positions start from
7153 \end_layout
7154
7155 \begin_layout LyX-Code
7156     1, not 0).
7157 \end_layout
7158
7159 \begin_layout LyX-Code
7160     Let me finally add, you should do a few short experiments to see
7161 \end_layout
7162
7163 \begin_layout LyX-Code
7164     whether or not such pipelining actually improves performance -- it
7165 \end_layout
7166
7167 \begin_layout LyX-Code
7168     is not always obvious where the time is going, and I have
7169 \end_layout
7170
7171 \begin_layout LyX-Code
7172     sometimes found that the added complexity of pipelining actually
7173 \end_layout
7174
7175 \begin_layout LyX-Code
7176     slowed things up.
7177   It gets its best improvements when there are
7178 \end_layout
7179
7180 \begin_layout LyX-Code
7181     exact matches of more than just a few characters that can be
7182 \end_layout
7183
7184 \begin_layout LyX-Code
7185     rapidly used to eliminate large sections of the database.
7186 \end_layout
7187
7188 \begin_layout LyX-Code
7189 =============
7190 \end_layout
7191
7192 \begin_layout LyX-Code
7193 Additions:
7194 \end_layout
7195
7196 \begin_layout LyX-Code
7197 Feb 9, 1995:   the pattern units ^ and $ now work as in normal regular
7198 \end_layout
7199
7200 \begin_layout LyX-Code
7201                expressions.
7202   That is
7203 \end_layout
7204
7205 \begin_layout LyX-Code
7206                         TTF $
7207 \end_layout
7208
7209 \begin_layout LyX-Code
7210                matches only TTF at the end of the string and
7211 \end_layout
7212
7213 \begin_layout LyX-Code
7214                         ^ TTF
7215 \end_layout
7216
7217 \begin_layout LyX-Code
7218                matches only an initial TTF
7219 \end_layout
7220
7221 \begin_layout LyX-Code
7222                The pattern unit
7223 \end_layout
7224
7225 \begin_layout LyX-Code
7226                         <p1
7227 \end_layout
7228
7229 \begin_layout LyX-Code
7230                matches the reverse of the string named p1.
7231   That is,
7232 \end_layout
7233
7234 \begin_layout LyX-Code
7235                if p1 matched GCAT, then <p1 would match TACG.
7236   Thus,
7237 \end_layout
7238
7239 \begin_layout LyX-Code
7240                    p1=6...6 <p1
7241 \end_layout
7242
7243 \begin_layout LyX-Code
7244                matches a real palindrome (not the biologically common
7245 \end_layout
7246
7247 \begin_layout LyX-Code
7248                meaning of "reverse complement")
7249 \end_layout
7250
7251 \begin_layout LyX-Code
7252
7253 \end_layout
7254
7255 \end_body
7256 \end_document