bp_doc/biopieces_cookbook.lyx~

   1 #LyX 1.5.1 created this file. For more info see http://www.lyx.org/
   2 \lyxformat 276
   3 \begin_document
   4 \begin_header
   5 \textclass scrartcl
   6 \begin_preamble
   7 \usepackage[colorlinks=true, urlcolor=blue, linkcolor=black]{hyperref}
   8 \end_preamble
   9 \language english
  10 \inputencoding auto
  11 \font_roman default
  12 \font_sans default
  13 \font_typewriter default
  14 \font_default_family default
  15 \font_sc false
  16 \font_osf false
  17 \font_sf_scale 100
  18 \font_tt_scale 100
  19 \graphics default
  20 \paperfontsize default
  21 \spacing single
  22 \papersize default
  23 \use_geometry false
  24 \use_amsmath 1
  25 \use_esint 1
  26 \cite_engine basic
  27 \use_bibtopic false
  28 \paperorientation portrait
  29 \secnumdepth 3
  30 \tocdepth 3
  31 \paragraph_separation skip
  32 \defskip medskip
  33 \quotes_language english
  34 \papercolumns 1
  35 \papersides 1
  36 \paperpagestyle default
  37 \tracking_changes false
  38 \output_changes false
  39 \author ""
  40 \author ""
  41 \end_header
  42
  43 \begin_body
  44
  45 \begin_layout Title
  46 Biopieces Cookbook
  47 \end_layout
  48
  49 \begin_layout Author
  50 Martin Asser Hansen
  51 \end_layout
  52
  53 \begin_layout Publishers
  54 John Mattick Group
  55 \newline
  56 Institute for Molecular Bioscience
  57 \newline
  58 University of Queensland
  59 \newline
  60 Aust
  61 ralia
  62 \newline
  63 E-mail: mail@maasha.dk
  64 \end_layout
  65
  66 \begin_layout Standard
  67 \begin_inset ERT
  68 status open
  69
  70 \begin_layout Standard
  71
  72
  73 \backslash
  74 thispagestyle{empty}
  75 \end_layout
  76
  77 \end_inset
  78
  79
  80 \end_layout
  81
  82 \begin_layout Standard
  83
  84 \newpage
  85
  86 \end_layout
  87
  88 \begin_layout Standard
  89 \begin_inset LatexCommand tableofcontents
  90
  91 \end_inset
  92
  93
  94 \end_layout
  95
  96 \begin_layout Standard
  97 \begin_inset FloatList figure
  98
  99 \end_inset
 100
 101
 102 \end_layout
 103
 104 \begin_layout Standard
 105
 106 \newpage
 107
 108 \end_layout
 109
 110 \begin_layout Section
 111 Introduction
 112 \end_layout
 113
 114 \begin_layout Standard
 115 Biopieces is a collection of simple tools that can be linked together (piped
 116  as we shall call it) in a very flexible manner to perform both simple and
 117  complex tasks.
 118  The fundamental idea is that biopieces work on a data stream that will
 119  only terminate at the end of an analysis and that this data stream can
 120  be passed through several different biopieces, each performing one specific
 121  task.
 122  The advantage of this approach is that a user can perform simple and complex
 123  tasks without having to write advanced code.
 124  Moreover, since the data format used to pass data between biopieces is
 125  text based, biopieces can be written by different developers in their favorite
 126  programming language --- and still the biopieces will be able to work together.
 127 \end_layout
 128
 129 \begin_layout Standard
 130 In the most simple form bioools can be piped together on the command line
 131  like this (using the pipe character '|'):
 132 \end_layout
 133
 134 \begin_layout LyX-Code
 135 read_data | calculate_something | write_result
 136 \end_layout
 137
 138 \begin_layout Standard
 139 However, a more comprehensive analysis could be composed:
 140 \end_layout
 141
 142 \begin_layout LyX-Code
 143 read_data | select_entries | convert_entries | search_database
 144 \end_layout
 145
 146 \begin_layout LyX-Code
 147 evaluate_results | plot_diagram | plot_another_diagram |
 148 \end_layout
 149
 150 \begin_layout LyX-Code
 151 load_to_database
 152 \end_layout
 153
 154 \begin_layout Standard
 155 The data stream that is piped through the biopieces consists of records
 156  of key/value pairs in the same way a hash does in order to keep as simple
 157  a structure as possible.
 158  An example record can be seen below:
 159 \end_layout
 160
 161 \begin_layout LyX-Code
 162
 163 \end_layout
 164
 165 \begin_layout LyX-Code
 166
 167 \size scriptsize
 168 REC_TYPE: PATSCAN
 169 \end_layout
 170
 171 \begin_layout LyX-Code
 172
 173 \size scriptsize
 174 MATCH: AGATCAAGTG
 175 \end_layout
 176
 177 \begin_layout LyX-Code
 178
 179 \size scriptsize
 180 S_BEG: 7
 181 \end_layout
 182
 183 \begin_layout LyX-Code
 184
 185 \size scriptsize
 186 S_END: 16
 187 \end_layout
 188
 189 \begin_layout LyX-Code
 190
 191 \size scriptsize
 192 ALIGN_LEN: 9
 193 \end_layout
 194
 195 \begin_layout LyX-Code
 196
 197 \size scriptsize
 198 S_ID: piR-t6
 199 \end_layout
 200
 201 \begin_layout LyX-Code
 202
 203 \size scriptsize
 204 STRAND: +
 205 \end_layout
 206
 207 \begin_layout LyX-Code
 208
 209 \size scriptsize
 210 PATTERN: AGATCAAGTG
 211 \end_layout
 212
 213 \begin_layout LyX-Code
 214
 215 \size scriptsize
 216 ---
 217 \end_layout
 218
 219 \begin_layout Standard
 220 The '
 221 \begin_inset ERT
 222 status collapsed
 223
 224 \begin_layout Standard
 225
 226 -
 227 \backslash
 228 /-
 229 \backslash
 230 /-
 231 \end_layout
 232
 233 \end_inset
 234
 235 ' denotes the delimiter of the records, and each key is a word followed
 236  by a ':' and a white-space and then the value.
 237  By convention the biopieces only uses upper case keys (a list of used keys
 238  can be seen in Appendix\InsetSpace ~
 239
 240 \begin_inset LatexCommand ref
 241 reference "sec:Keys"
 242
 243 \end_inset
 244
 245 ).
 246  Since the records basically are hash structures this mean that the order
 247  of the keys in the stream is unordered, and in the above example it is
 248  pure coincidence that HIT_BEG is displayed before HIT_END, however, when
 249  the order of the keys is importent, the biopieces will automagically see
 250  to that.
 251 \end_layout
 252
 253 \begin_layout Standard
 254 All of the biopieces are able to read and write a data stream to and from
 255  file as long as the records are in the biopieces format.
 256  This means that if you are undertaking a lengthy analysis where one of
 257  the steps is time consuming, you may save the stream after this step, and
 258  subsequently start one or more analysis from that last step
 259 \begin_inset Foot
 260 status collapsed
 261
 262 \begin_layout Standard
 263 It is a goal that the biopieces at some point will be able to dump the data
 264  stream to file in case one of the tools fail critically.
 265 \end_layout
 266
 267 \end_inset
 268
 269 .
 270  If you are running a lengthy analysis it is highly recommended that you
 271  create a small test sample of the data and run that through the pipeline
 272  --- and once you are satisfied with the result proceed with the full data
 273  set (see\InsetSpace ~
 274
 275 \begin_inset LatexCommand ref
 276 reference "sub:How-to-select-a-few-records"
 277
 278 \end_inset
 279
 280 ).
 281 \end_layout
 282
 283 \begin_layout Standard
 284 All of the biopieces can be supplied with long arguments prefixed with
 285 \begin_inset ERT
 286 status collapsed
 287
 288 \begin_layout Standard
 289
 290 -
 291 \backslash
 292 /-
 293 \end_layout
 294
 295 \end_inset
 296
 297  switches or single character arguments prefixed with - switches that can
 298  be grouped together (e.g.
 299  -xok).
 300  In this cookbook only the long switches are used to emphasize what these
 301  switches do.
 302 \end_layout
 303
 304 \begin_layout Section
 305 Setup
 306 \end_layout
 307
 308 \begin_layout Standard
 309 In order to get the biopieces to work, you need to add environment settings
 310  to include the code binaries, scripts, and modules that constitute the
 311  biopieces package.
 312  Assuming that you are using bash, add the following to your '~/.bashrc'
 313  file using your favorite editor.
 314  After the changes has been saved you need to either run 'source ~/.bashrc'
 315  or relogin.
 316 \end_layout
 317
 318 \begin_layout LyX-Code
 319
 320 \size scriptsize
 321 if [ -f "/home/m.hansen/maasha/conf/bashrc" ]; then
 322 \end_layout
 323
 324 \begin_layout LyX-Code
 325
 326 \size scriptsize
 327     source "/home/m.hansen/maasha/conf/bashrc"
 328 \end_layout
 329
 330 \begin_layout LyX-Code
 331
 332 \size scriptsize
 333 fi
 334 \end_layout
 335
 336 \begin_layout Section
 337 Getting Started
 338 \end_layout
 339
 340 \begin_layout Standard
 341 The biopiece
 342 \series bold
 343 list_biopieces
 344 \series default
 345  lists all the biopieces along with a description:
 346 \end_layout
 347
 348 \begin_layout LyX-Code
 349
 350 \size scriptsize
 351 list_biopieces
 352 \end_layout
 353
 354 \begin_layout LyX-Code
 355
 356 \size scriptsize
 357 align_seq             Align sequences in stream using Muscle.
 358 \end_layout
 359
 360 \begin_layout LyX-Code
 361
 362 \size scriptsize
 363 analyze_seq           Analysis the residue composition of each sequence
 364  in stream.
 365 \end_layout
 366
 367 \begin_layout LyX-Code
 368
 369 \size scriptsize
 370 analyze_vals          Determine type, count, min, max, sum and mean for
 371  values in stream.
 372 \end_layout
 373
 374 \begin_layout LyX-Code
 375
 376 \size scriptsize
 377 blast_seq             BLAST sequences in stream against a specified database.
 378 \end_layout
 379
 380 \begin_layout LyX-Code
 381
 382 \size scriptsize
 383 blat_seq              BLAT sequences in stream against a specified genome.
 384 \end_layout
 385
 386 \begin_layout LyX-Code
 387
 388 \size scriptsize
 389 complement_seq        Complement sequences in stream.
 390 \end_layout
 391
 392 \begin_layout LyX-Code
 393
 394 \size scriptsize
 395 count_records         Count the number of records in stream.
 396 \end_layout
 397
 398 \begin_layout LyX-Code
 399
 400 \size scriptsize
 401 count_seq             Count sequences in stream.
 402 \end_layout
 403
 404 \begin_layout LyX-Code
 405
 406 \size scriptsize
 407 count_vals            Count the number of times values of given keys exists
 408  in stream.
 409 \end_layout
 410
 411 \begin_layout LyX-Code
 412
 413 \size scriptsize
 414 create_blast_db       Create a BLAST database from sequences in stream for
 415  use with BLAST.
 416 \end_layout
 417
 418 \begin_layout LyX-Code
 419
 420 \size scriptsize
 421 ...
 422 \end_layout
 423
 424 \begin_layout Standard
 425 To list the biopieces for writing different formats, you can use unix's
 426  grep like this:
 427 \end_layout
 428
 429 \begin_layout LyX-Code
 430
 431 \size scriptsize
 432 list_biopieces | grep write
 433 \end_layout
 434
 435 \begin_layout LyX-Code
 436
 437 \size scriptsize
 438 write_align           Write aligned sequences in pretty alignment format.
 439 \end_layout
 440
 441 \begin_layout LyX-Code
 442
 443 \size scriptsize
 444 write_bed             Write records from stream as BED lines.
 445 \end_layout
 446
 447 \begin_layout LyX-Code
 448
 449 \size scriptsize
 450 write_blast           Write BLAST records from stream in BLAST tabular format
 451  (-m8 and 9).
 452 \end_layout
 453
 454 \begin_layout LyX-Code
 455
 456 \size scriptsize
 457 write_fasta           Write sequences in FASTA format.
 458 \end_layout
 459
 460 \begin_layout LyX-Code
 461
 462 \size scriptsize
 463 write_psl             Write records from stream in PSL format.
 464 \end_layout
 465
 466 \begin_layout LyX-Code
 467
 468 \size scriptsize
 469 write_tab             Write records from stream as tab separated table.
 470 \end_layout
 471
 472 \begin_layout Standard
 473 In order to find out how a specific biopiece works, you just type the program
 474  name without any arguments and press return and the usage of the biopiece
 475  will be displayed.
 476  E.g.
 477
 478 \series bold
 479 read_fasta
 480 \series default
 481  <return>:
 482 \end_layout
 483
 484 \begin_layout Standard
 485 \begin_inset Box Frameless
 486 position "t"
 487 hor_pos "c"
 488 has_inner_box 1
 489 inner_pos "t"
 490 use_parbox 0
 491 width "100col%"
 492 special "none"
 493 height "1in"
 494 height_special "totalheight"
 495 status open
 496
 497 \begin_layout LyX-Code
 498
 499 \size scriptsize
 500 Program name:   read_fasta
 501 \end_layout
 502
 503 \begin_layout LyX-Code
 504
 505 \size scriptsize
 506 Author:         Martin Asser Hansen - Copyright (C) - All rights reserved
 507 \end_layout
 508
 509 \begin_layout LyX-Code
 510
 511 \size scriptsize
 512 Contact:        mail@maasha.dk
 513 \end_layout
 514
 515 \begin_layout LyX-Code
 516
 517 \size scriptsize
 518 Date:           August 2007
 519 \end_layout
 520
 521 \begin_layout LyX-Code
 522
 523 \size scriptsize
 524 License:        GNU General Public License version 2 (http://www.gnu.org/copyleft/
 525 gpl.html)
 526 \end_layout
 527
 528 \begin_layout LyX-Code
 529
 530 \size scriptsize
 531 Description:    Read FASTA entries.
 532 \end_layout
 533
 534 \begin_layout LyX-Code
 535
 536 \size scriptsize
 537 Usage:          read_fasta [options] -i <FASTA file(s)>
 538 \end_layout
 539
 540 \begin_layout LyX-Code
 541
 542 \size scriptsize
 543 Options:
 544 \end_layout
 545
 546 \begin_layout LyX-Code
 547
 548 \size scriptsize
 549    [-i <file(s)> | --data_in=<file(s)>]  -  Comma separated list of files
 550  to read.
 551 \end_layout
 552
 553 \begin_layout LyX-Code
 554
 555 \size scriptsize
 556    [-n <int>     | --num=<int>]          -  Limit number of records to read.
 557 \end_layout
 558
 559 \begin_layout LyX-Code
 560
 561 \size scriptsize
 562    [-I <file>    | --stream_in=<file>]   -  Read input stream from file
 563   -  Default=STDIN
 564 \end_layout
 565
 566 \begin_layout LyX-Code
 567
 568 \size scriptsize
 569    [-O <file>    | --stream_out=<file>]  -  Write output stream to file
 570   -  Default=STDOUT
 571 \end_layout
 572
 573 \begin_layout LyX-Code
 574
 575 \end_layout
 576
 577 \begin_layout LyX-Code
 578
 579 \size scriptsize
 580 Examples:
 581 \end_layout
 582
 583 \begin_layout LyX-Code
 584
 585 \size scriptsize
 586    read_fasta -i test.fna             -  Read FASTA entries from file.
 587 \end_layout
 588
 589 \begin_layout LyX-Code
 590
 591 \size scriptsize
 592    read_fasta -i test1.fna,test2,fna  -  Read FASTA entries from files.
 593 \end_layout
 594
 595 \begin_layout LyX-Code
 596
 597 \size scriptsize
 598    read_fasta -i '*.fna'              -  Read FASTA entries from files.
 599 \end_layout
 600
 601 \begin_layout LyX-Code
 602
 603 \size scriptsize
 604    read_fasta -i test.fna -n 10       -  Read first 10 FASTA entries from
 605  file.
 606 \end_layout
 607
 608 \end_inset
 609
 610
 611 \end_layout
 612
 613 \begin_layout Section
 614 The Data Stream
 615 \end_layout
 616
 617 \begin_layout Subsection
 618 How to read the data stream from file?
 619 \begin_inset LatexCommand label
 620 name "sub:How-to-read-stream"
 621
 622 \end_inset
 623
 624
 625 \end_layout
 626
 627 \begin_layout Standard
 628 You want to read a data stream that you previously have saved to file in
 629  biopieces format.
 630  This can be done implicetly or explicitly.
 631  The implicit way uses the 'stdout' stream of the Unix terminal:
 632 \end_layout
 633
 634 \begin_layout LyX-Code
 635 cat | <biopiece>
 636 \end_layout
 637
 638 \begin_layout Standard
 639 cat is the Unix command that reads a file and output the result to 'stdout'
 640  --- which in this case is piped to any biopiece represented by the <biopiece>.
 641  It is also possible to read the data stream using '<' to direct the 'stdout'
 642  stream into the biopiece like this:
 643 \end_layout
 644
 645 \begin_layout LyX-Code
 646 <biopiece> < <file>
 647 \end_layout
 648
 649 \begin_layout Standard
 650 However, that will not work if you pipe more biopieces together.
 651  Then it is much safer to read the stream from a file explicitly like this:
 652 \end_layout
 653
 654 \begin_layout LyX-Code
 655 <biopiece> --stream_in=<file>
 656 \end_layout
 657
 658 \begin_layout Standard
 659 Here the filename <file> is explicetly given to the biopiece <biopiece>
 660  with the switch
 661 \begin_inset ERT
 662 status collapsed
 663
 664 \begin_layout Standard
 665
 666 -
 667 \backslash
 668 /-
 669 \end_layout
 670
 671 \end_inset
 672
 673 stream_in.
 674  This switch works with all biopieces.
 675  It is also possible to read in data from multiple sources by repeating
 676  the explicit read step:
 677 \end_layout
 678
 679 \begin_layout LyX-Code
 680 <biopiece> --stream_in=<file1> | <biopiece> --stream_in=<file2>
 681 \end_layout
 682
 683 \begin_layout Subsection
 684 How to write the data stream to file?
 685 \begin_inset LatexCommand label
 686 name "sub:How-to-write-stream"
 687
 688 \end_inset
 689
 690
 691 \end_layout
 692
 693 \begin_layout Standard
 694 In order to save the output stream from a biopiece to file, so you can read
 695  in the stream again at a later time, you can do one of two things:
 696 \end_layout
 697
 698 \begin_layout LyX-Code
 699 <biopiece> > <file>
 700 \end_layout
 701
 702 \begin_layout Standard
 703 All, the biopieces write the data stream to 'stdout' by default which can
 704  be written to a file by redirecting 'stdout' to file using '>' , however,
 705  if one of the biopieces for writing other formats is used then the both
 706  the biopieces records as well as the result output will go to 'stdout'
 707  in a mixture causing havock! To avoid this you must use the switch
 708 \begin_inset ERT
 709 status collapsed
 710
 711 \begin_layout Standard
 712
 713 -
 714 \backslash
 715 /-
 716 \end_layout
 717
 718 \end_inset
 719
 720 stream_out that explictly tells the biopiece to write the output stream
 721  to file:
 722 \end_layout
 723
 724 \begin_layout LyX-Code
 725 <biopiece> --stream_out=<file>
 726 \end_layout
 727
 728 \begin_layout Standard
 729 The
 730 \begin_inset ERT
 731 status collapsed
 732
 733 \begin_layout Standard
 734
 735 -
 736 \backslash
 737 /-
 738 \end_layout
 739
 740 \end_inset
 741
 742 stream_out switch works with all biopieces.
 743 \end_layout
 744
 745 \begin_layout Subsection
 746 How to terminate the data stream?
 747 \end_layout
 748
 749 \begin_layout Standard
 750 The data stream is never stops unless the user want to save the stream or
 751  by supplying the
 752 \begin_inset ERT
 753 status collapsed
 754
 755 \begin_layout Standard
 756
 757 -
 758 \backslash
 759 /-
 760 \end_layout
 761
 762 \end_inset
 763
 764 no_stream switch that will terminate the stream:
 765 \end_layout
 766
 767 \begin_layout LyX-Code
 768 <biopiece> --no_stream
 769 \end_layout
 770
 771 \begin_layout Standard
 772 The
 773 \begin_inset ERT
 774 status collapsed
 775
 776 \begin_layout Standard
 777
 778 -
 779 \backslash
 780 /-
 781 \end_layout
 782
 783 \end_inset
 784
 785 no_stream switch only works with those biopieces where it makes sense that
 786  the user might want to terminale the data stream,
 787 \emph on
 788 i.e
 789 \emph default
 790 .
 791  after an analysis step where the user wants to output the result, but not
 792  the data stream.
 793 \end_layout
 794
 795 \begin_layout Subsection
 796 How to write my results to file?
 797 \begin_inset LatexCommand label
 798 name "sub:How-to-write-result"
 799
 800 \end_inset
 801
 802
 803 \end_layout
 804
 805 \begin_layout Standard
 806 Saving the result of an analysis to file can be done implicitly or explicitly.
 807  The implicit way:
 808 \end_layout
 809
 810 \begin_layout LyX-Code
 811 <biopiece> --no_stream > <file>
 812 \end_layout
 813
 814 \begin_layout Standard
 815 If you use '>' to redirect 'stdout' to file then it is important to use
 816  the
 817 \begin_inset ERT
 818 status collapsed
 819
 820 \begin_layout Standard
 821
 822 -
 823 \backslash
 824 /-
 825 \end_layout
 826
 827 \end_inset
 828
 829 no_stream switch to avoid writing a mix of biopieces records and result
 830  to the same file causing havock.
 831  The safe way is to use the
 832 \begin_inset ERT
 833 status open
 834
 835 \begin_layout Standard
 836
 837 -
 838 \backslash
 839 /-
 840 \end_layout
 841
 842 \end_inset
 843
 844 result_out switch which explicetly tells the biopiece to write the result
 845  to a given file:
 846 \end_layout
 847
 848 \begin_layout LyX-Code
 849 <biopiece> --result_out=<file>
 850 \end_layout
 851
 852 \begin_layout Standard
 853 Using the above method will not terminate the stream, so it is possible
 854  to pipe that into another biopiece generating different results:
 855 \end_layout
 856
 857 \begin_layout LyX-Code
 858 <biopiece1> --result_out=<file1> | <biopiece2> --result_out=<file2>
 859 \end_layout
 860
 861 \begin_layout Standard
 862 And still the data stream will continue unless terminated with
 863 \begin_inset ERT
 864 status collapsed
 865
 866 \begin_layout Standard
 867
 868 -
 869 \backslash
 870 /-
 871 \end_layout
 872
 873 \end_inset
 874
 875 no_stream:
 876 \end_layout
 877
 878 \begin_layout LyX-Code
 879 <biopiece> --result_out=<file> --no_stream
 880 \end_layout
 881
 882 \begin_layout Standard
 883 Or written to file using implicitly or explicity
 884 \begin_inset LatexCommand eqref
 885 reference "sub:How-to-write-result"
 886
 887 \end_inset
 888
 889 .
 890  The explicit way:
 891 \end_layout
 892
 893 \begin_layout LyX-Code
 894 <biopiece> --result_out=<file1> --stream_out=<file2>
 895 \end_layout
 896
 897 \begin_layout Subsection
 898 How to read data from multiple sources?
 899 \end_layout
 900
 901 \begin_layout Standard
 902 To read multiple data sources, with the same type or different type of data
 903  do:
 904 \end_layout
 905
 906 \begin_layout LyX-Code
 907 <biopiece1> --data_in=<file1> | <biopiece2> --data_in=<file2>
 908 \end_layout
 909
 910 \begin_layout Standard
 911 where type is the data type a specific biopiece reads.
 912 \end_layout
 913
 914 \begin_layout Section
 915 Reading input
 916 \end_layout
 917
 918 \begin_layout Subsection
 919 How to read biopieces input?
 920 \end_layout
 921
 922 \begin_layout Standard
 923 See
 924 \begin_inset LatexCommand eqref
 925 reference "sub:How-to-read-stream"
 926
 927 \end_inset
 928
 929 .
 930 \end_layout
 931
 932 \begin_layout Subsection
 933 How to read in data?
 934 \end_layout
 935
 936 \begin_layout Standard
 937 Data in different formats can be read with the appropriate biopiece for
 938  that format.
 939  The biopieces are typicalled named 'read_<data type>' such as
 940 \series bold
 941 read_fasta
 942 \series default
 943 ,
 944 \series bold
 945 read_bed
 946 \series default
 947 ,
 948 \series bold
 949 read_tab
 950 \series default
 951 , etc., and all behave in a similar manner.
 952  Data can be read by supplying the
 953 \begin_inset ERT
 954 status collapsed
 955
 956 \begin_layout Standard
 957
 958 -
 959 \backslash
 960 /-
 961 \end_layout
 962
 963 \end_inset
 964
 965 data_in switch and a file name to the file containing the data:
 966 \end_layout
 967
 968 \begin_layout LyX-Code
 969 <biopiece> --data_in=<file>
 970 \end_layout
 971
 972 \begin_layout Standard
 973 It is also possible to read in a saved biopieces stream (see
 974 \begin_inset LatexCommand ref
 975 reference "sub:How-to-read-stream"
 976
 977 \end_inset
 978
 979 ) as well as reading data in one go:
 980 \end_layout
 981
 982 \begin_layout LyX-Code
 983 <biopiece> --stream_in=<file1> --data_in=<file2>
 984 \end_layout
 985
 986 \begin_layout Standard
 987 If you want to read data from several files you can do this:
 988 \end_layout
 989
 990 \begin_layout LyX-Code
 991 <biopiece> --data_in=<file1> | <biopiece> --data_in=<file2>
 992 \end_layout
 993
 994 \begin_layout Standard
 995 If you have several data files you can read in all explicitly with a comma
 996  separated list:
 997 \end_layout
 998
 999 \begin_layout LyX-Code
1000 <biopiece> --data_in=file1,file2,file3
1001 \end_layout
1002
1003 \begin_layout Standard
1004 And it is also possible to use file globbing
1005 \begin_inset Foot
1006 status open
1007
1008 \begin_layout Standard
1009 using the short option will only work if you quote the argument -i '*.fna'
1010 \end_layout
1011
1012 \end_inset
1013
1014 :
1015 \end_layout
1016
1017 \begin_layout LyX-Code
1018 <biopiece> --data_in=*.fna
1019 \end_layout
1020
1021 \begin_layout Standard
1022 Or in a combination:
1023 \end_layout
1024
1025 \begin_layout LyX-Code
1026 <biopiece> --data_in=file1,/dir/*.fna
1027 \end_layout
1028
1029 \begin_layout Standard
1030 Finally, it is possible to read in data in different formats using the appropria
1031 te biopiece for each format:
1032 \end_layout
1033
1034 \begin_layout LyX-Code
1035 <biopiece1> --data_in=<file1> | <biopiece2> --data_in=<file2> ...
1036 \end_layout
1037
1038 \begin_layout Subsection
1039 How to read FASTA input?
1040 \end_layout
1041
1042 \begin_layout Standard
1043 Sequences in FASTA format can be read explicitly using
1044 \series bold
1045 read_fasta
1046 \series default
1047 :
1048 \end_layout
1049
1050 \begin_layout LyX-Code
1051 read_fasta --data_in=<file>
1052 \end_layout
1053
1054 \begin_layout Subsection
1055 How to read alignment input?
1056 \end_layout
1057
1058 \begin_layout Standard
1059 If your alignment if FASTA formatted then you can
1060 \series bold
1061 read_align
1062 \series default
1063 .
1064  It is also possible to use
1065 \series bold
1066 read_fasta
1067 \series default
1068  since the data is FASTA formatted, however, with
1069 \series bold
1070 read_fasta
1071 \series default
1072  the key ALIGN will be omitted.
1073  The ALIGN key is used to determine which sequences belong to what alignment
1074  which is required for
1075 \series bold
1076 write_align
1077 \series default
1078 .
1079 \end_layout
1080
1081 \begin_layout LyX-Code
1082 read_align --data_in=<file>
1083 \end_layout
1084
1085 \begin_layout Subsection
1086 How to read tabular input?
1087 \begin_inset LatexCommand label
1088 name "sub:How-to-read-table"
1089
1090 \end_inset
1091
1092
1093 \end_layout
1094
1095 \begin_layout Standard
1096 Tabular input can be read with
1097 \series bold
1098 read_tab
1099 \series default
1100  which will read in all rows and chosen columns (separated by a given delimter)
1101  from a table in text format.
1102 \end_layout
1103
1104 \begin_layout Standard
1105 The table below:
1106 \end_layout
1107
1108 \begin_layout Standard
1109 \noindent
1110 \align center
1111 \begin_inset Tabular
1112 <lyxtabular version="3" rows="4" columns="3">
1113 <features>
1114 <column alignment="left" valignment="top" width="0">
1115 <column alignment="left" valignment="top" width="0">
1116 <column alignment="left" valignment="top" width="0">
1117 <row>
1118 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1119 \begin_inset Text
1120
1121 \begin_layout Standard
1122 Human
1123 \end_layout
1124
1125 \end_inset
1126 </cell>
1127 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1128 \begin_inset Text
1129
1130 \begin_layout Standard
1131 ATACGTCAG
1132 \end_layout
1133
1134 \end_inset
1135 </cell>
1136 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
1137 \begin_inset Text
1138
1139 \begin_layout Standard
1140 23524
1141 \end_layout
1142
1143 \end_inset
1144 </cell>
1145 </row>
1146 <row>
1147 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1148 \begin_inset Text
1149
1150 \begin_layout Standard
1151 Dog
1152 \end_layout
1153
1154 \end_inset
1155 </cell>
1156 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1157 \begin_inset Text
1158
1159 \begin_layout Standard
1160 AGCATGAC
1161 \end_layout
1162
1163 \end_inset
1164 </cell>
1165 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
1166 \begin_inset Text
1167
1168 \begin_layout Standard
1169 2442
1170 \end_layout
1171
1172 \end_inset
1173 </cell>
1174 </row>
1175 <row>
1176 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1177 \begin_inset Text
1178
1179 \begin_layout Standard
1180 Mouse
1181 \end_layout
1182
1183 \end_inset
1184 </cell>
1185 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1186 \begin_inset Text
1187
1188 \begin_layout Standard
1189 GACTG
1190 \end_layout
1191
1192 \end_inset
1193 </cell>
1194 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
1195 \begin_inset Text
1196
1197 \begin_layout Standard
1198 234
1199 \end_layout
1200
1201 \end_inset
1202 </cell>
1203 </row>
1204 <row>
1205 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1206 \begin_inset Text
1207
1208 \begin_layout Standard
1209 Cat
1210 \end_layout
1211
1212 \end_inset
1213 </cell>
1214 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
1215 \begin_inset Text
1216
1217 \begin_layout Standard
1218 AAATGCA
1219 \end_layout
1220
1221 \end_inset
1222 </cell>
1223 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
1224 \begin_inset Text
1225
1226 \begin_layout Standard
1227 2342
1228 \end_layout
1229
1230 \end_inset
1231 </cell>
1232 </row>
1233 </lyxtabular>
1234
1235 \end_inset
1236
1237
1238 \end_layout
1239
1240 \begin_layout Standard
1241 Can be read using the command:
1242 \end_layout
1243
1244 \begin_layout LyX-Code
1245 read_tab --data_in=<file>
1246 \end_layout
1247
1248 \begin_layout Standard
1249 Which will result in four records, one for each row, where the keys V0,
1250  V1, V2 are the default keys for the organism, sequence, and count, respectively.
1251  It is possible to select a subset of colums to read by using the
1252 \begin_inset ERT
1253 status collapsed
1254
1255 \begin_layout Standard
1256
1257 -
1258 \backslash
1259 /-
1260 \end_layout
1261
1262 \end_inset
1263
1264 cols switch which takes a comma separated list of columns numbers (first
1265  column is designated 0) as argument.
1266  So to read in only the sequence and the count so that the count comes before
1267  the sequence do:
1268 \end_layout
1269
1270 \begin_layout LyX-Code
1271 read_tab --data_in=<file> --cols=2,1
1272 \end_layout
1273
1274 \begin_layout Standard
1275 It is also possible to name the columns with the
1276 \begin_inset ERT
1277 status collapsed
1278
1279 \begin_layout Standard
1280
1281 -
1282 \backslash
1283 /-
1284 \end_layout
1285
1286 \end_inset
1287
1288 keys switch:
1289 \end_layout
1290
1291 \begin_layout LyX-Code
1292 read_tab --data_in=<file> --cols=2,1 --keys=COUNT,SEQ
1293 \end_layout
1294
1295 \begin_layout Subsection
1296 How to read BED input?
1297 \end_layout
1298
1299 \begin_layout Standard
1300 The BED (Browser Extensible Data
1301 \begin_inset Foot
1302 status open
1303
1304 \begin_layout Standard
1305 \begin_inset LatexCommand url
1306 target "http://genome.ucsc.edu/FAQ/FAQformat"
1307
1308 \end_inset
1309
1310
1311 \end_layout
1312
1313 \end_inset
1314
1315 ) format is a tabular format for data pertaining to one of the Eukaryotic
1316  genomes in the UCSC genome brower
1317 \begin_inset Foot
1318 status collapsed
1319
1320 \begin_layout Standard
1321 \begin_inset LatexCommand url
1322 target "http://genome.ucsc.edu/"
1323
1324 \end_inset
1325
1326
1327 \end_layout
1328
1329 \end_inset
1330
1331 .
1332  The BED format consists of up to 12 columns, where the first three are
1333  mandatory CHR, CHR_BEG, and CHR_END.
1334  The mandatory columns and any of the optional columns can all be read in
1335  easily with the
1336 \series bold
1337 read_bed
1338 \series default
1339  biopiece.
1340 \end_layout
1341
1342 \begin_layout LyX-Code
1343 read_bed --data_in=<file>
1344 \end_layout
1345
1346 \begin_layout Standard
1347 It is also possible to read the BED file with
1348 \series bold
1349 read_tab
1350 \series default
1351  (see\InsetSpace ~
1352
1353 \begin_inset LatexCommand ref
1354 reference "sub:How-to-read-table"
1355
1356 \end_inset
1357
1358 ), however, that will be more cumbersome because you need to specify the
1359  keys:
1360 \end_layout
1361
1362 \begin_layout LyX-Code
1363 read_tab --data_in=<file> --keys=CHR,CHR_BEG,CHR_END ...
1364 \end_layout
1365
1366 \begin_layout Subsection
1367 How to read PSL input?
1368 \end_layout
1369
1370 \begin_layout Standard
1371 The PSL format is the output from BLAT and contains 21 mandatory fields
1372  that can be read with
1373 \series bold
1374 read_psl
1375 \series default
1376 :
1377 \end_layout
1378
1379 \begin_layout LyX-Code
1380 read_psl --data_in=<file>
1381 \end_layout
1382
1383 \begin_layout Section
1384 Writing output
1385 \end_layout
1386
1387 \begin_layout Standard
1388 All result output can be written explicitly to file using the
1389 \begin_inset ERT
1390 status collapsed
1391
1392 \begin_layout Standard
1393
1394 -
1395 \backslash
1396 /-
1397 \end_layout
1398
1399 \end_inset
1400
1401 result_out switch which all result generating biopieces have.
1402  It is also possible to write the result to file implicetly by directing
1403  'stdout' to file using '>', however, that requires the
1404 \begin_inset ERT
1405 status collapsed
1406
1407 \begin_layout Standard
1408
1409 -
1410 \backslash
1411 /-
1412 \end_layout
1413
1414 \end_inset
1415
1416 no_stream swich to prevent a mixture of data stream and results in the file.
1417  The explicit (and safe) way:
1418 \end_layout
1419
1420 \begin_layout LyX-Code
1421 ...
1422  | <biopiece> --result_out=<file>
1423 \end_layout
1424
1425 \begin_layout Standard
1426 The implicit way:
1427 \end_layout
1428
1429 \begin_layout LyX-Code
1430 ...
1431  | <biopiece> --no_stream > <file>
1432 \end_layout
1433
1434 \begin_layout Subsection
1435 How to write biopieces output?
1436 \end_layout
1437
1438 \begin_layout Standard
1439 See
1440 \begin_inset LatexCommand eqref
1441 reference "sub:How-to-write-stream"
1442
1443 \end_inset
1444
1445 .
1446 \end_layout
1447
1448 \begin_layout Subsection
1449 How to write FASTA output?
1450 \begin_inset LatexCommand label
1451 name "sub:How-to-write-fasta"
1452
1453 \end_inset
1454
1455
1456 \end_layout
1457
1458 \begin_layout Standard
1459 FASTA output can be written with
1460 \series bold
1461 write_fasta
1462 \series default
1463 .
1464 \end_layout
1465
1466 \begin_layout LyX-Code
1467 ...
1468  | write_fasta --result_out=<file>
1469 \end_layout
1470
1471 \begin_layout Standard
1472 It is also possible to wrap the sequences to a given width using the
1473 \begin_inset ERT
1474 status collapsed
1475
1476 \begin_layout Standard
1477
1478 -
1479 \backslash
1480 /-
1481 \end_layout
1482
1483 \end_inset
1484
1485 wrap switch allthough wrapping of sequence is generally an evil thing:
1486 \end_layout
1487
1488 \begin_layout LyX-Code
1489 ...
1490  | write_fasta --no_stream --wrap=80
1491 \end_layout
1492
1493 \begin_layout Subsection
1494 How to write alignment output?
1495 \begin_inset LatexCommand label
1496 name "sub:How-to-write-alignment"
1497
1498 \end_inset
1499
1500
1501 \end_layout
1502
1503 \begin_layout Standard
1504 Pretty alignments with ruler
1505 \begin_inset Foot
1506 status collapsed
1507
1508 \begin_layout Standard
1509 '.' for every 10 residues, ':' for every 50, and '|' for every 100
1510 \end_layout
1511
1512 \end_inset
1513
1514  and consensus sequence
1515 \begin_inset Note Note
1516 status collapsed
1517
1518 \begin_layout Standard
1519 which reminds me to make that an option.
1520 \end_layout
1521
1522 \end_inset
1523
1524  can be created with
1525 \series bold
1526 write_align
1527 \series default
1528 , what also have the optional
1529 \begin_inset ERT
1530 status collapsed
1531
1532 \begin_layout Standard
1533
1534 -
1535 \backslash
1536 /-
1537 \end_layout
1538
1539 \end_inset
1540
1541 wrap switch to break the alignment into blocks of a given width:
1542 \end_layout
1543
1544 \begin_layout LyX-Code
1545 ...
1546  | write_align --result_out=<file> --wrap=80
1547 \end_layout
1548
1549 \begin_layout Standard
1550 If the number of sequnces in the alignment is 2 then a pairwise alignment
1551  will be output otherwise a multiple alignment.
1552  And if the sequence type, determined automagically, is protein, then residues
1553  and symbols (+,\InsetSpace ~
1554 :,\InsetSpace ~
1555 .) will be used to show consensus according to the Blosum62
1556  matrix.
1557 \end_layout
1558
1559 \begin_layout Subsection
1560 How to write tabular output?
1561 \begin_inset LatexCommand label
1562 name "sub:How-to-write-tab"
1563
1564 \end_inset
1565
1566
1567 \end_layout
1568
1569 \begin_layout Standard
1570 Outputting the data stream as a table can be done with
1571 \series bold
1572 write_tab
1573 \series default
1574 , which will write generate one row per record with the values as columns.
1575  If you supply the optional
1576 \begin_inset ERT
1577 status collapsed
1578
1579 \begin_layout Standard
1580
1581 -
1582 \backslash
1583 /-
1584 \end_layout
1585
1586 \end_inset
1587
1588 comment switch, when the first row in the table will be a 'comment' line
1589  prefixed with a '#':
1590 \end_layout
1591
1592 \begin_layout LyX-Code
1593 ...
1594  | write_tab --result_out=<file> --comment
1595 \end_layout
1596
1597 \begin_layout Standard
1598 You can also change the delimiter from the default (tab) to
1599 \emph on
1600 e.g.
1601
1602 \emph default
1603  ',':
1604 \end_layout
1605
1606 \begin_layout LyX-Code
1607 ...
1608  | write_tab --result_out=<file> --delimit=','
1609 \end_layout
1610
1611 \begin_layout Standard
1612 If you want the values output in a specific order you have to supply a comma
1613  separated list using the
1614 \begin_inset ERT
1615 status collapsed
1616
1617 \begin_layout Standard
1618
1619 -
1620 \backslash
1621 /-
1622 \end_layout
1623
1624 \end_inset
1625
1626 keys switch that will print only those keys in that order:
1627 \end_layout
1628
1629 \begin_layout LyX-Code
1630 ...
1631  | write_tab --result_out=<file> --keys=SEQ_NAME,COUNT
1632 \end_layout
1633
1634 \begin_layout Standard
1635 Alternatively, if you have some keys that you don't want in the tabular
1636  output, use the
1637 \begin_inset ERT
1638 status collapsed
1639
1640 \begin_layout Standard
1641
1642 -
1643 \backslash
1644 /-
1645 \end_layout
1646
1647 \end_inset
1648
1649 no_keys switch.
1650  So to print all keys except SEQ and SEQ_TYPE do:
1651 \end_layout
1652
1653 \begin_layout LyX-Code
1654 ...
1655  | write_tab --result_out=<file> --no_keys=SEQ,SEQ_TYPE
1656 \end_layout
1657
1658 \begin_layout Standard
1659 Finally, if you have a stream containing a mix of different records types,
1660
1661 \emph on
1662 e.g.
1663
1664 \emph default
1665  records with sequences and records with matches, then you can use
1666 \series bold
1667 write_tab
1668 \series default
1669  to output all the records in tabluar format, however, the
1670 \begin_inset ERT
1671 status collapsed
1672
1673 \begin_layout Standard
1674
1675 -
1676 \backslash
1677 /-
1678 \end_layout
1679
1680 \end_inset
1681
1682 comment,
1683 \begin_inset ERT
1684 status collapsed
1685
1686 \begin_layout Standard
1687
1688 -
1689 \backslash
1690 /-
1691 \end_layout
1692
1693 \end_inset
1694
1695 keys, and
1696 \begin_inset ERT
1697 status collapsed
1698
1699 \begin_layout Standard
1700
1701 -
1702 \backslash
1703 /-
1704 \end_layout
1705
1706 \end_inset
1707
1708 no_keys switches will only respond to records of the first type encountered.
1709  The reason is that outputting mixed records is probably not what you want
1710  anyway, and you should remove all the unwanted records from the stream
1711  before outputting the table:
1712 \series bold
1713 grab
1714 \series default
1715  is your friend (see\InsetSpace ~
1716
1717 \begin_inset LatexCommand ref
1718 reference "sub:How-to-grab"
1719
1720 \end_inset
1721
1722 ).
1723 \end_layout
1724
1725 \begin_layout Subsection
1726 How to write a BED output?
1727 \begin_inset LatexCommand label
1728 name "sub:How-to-write-BED"
1729
1730 \end_inset
1731
1732
1733 \end_layout
1734
1735 \begin_layout Standard
1736 Data in BED format can be output if the records contain the mandatory keys
1737  CHR, CHR_BEG, and CHR_END using
1738 \series bold
1739 write_bed
1740 \series default
1741 .
1742  If the optional keys are also present, they will be output as well:
1743 \end_layout
1744
1745 \begin_layout LyX-Code
1746 write_bed --result_out=<file>
1747 \end_layout
1748
1749 \begin_layout Subsection
1750 How to write PSL output?
1751 \begin_inset LatexCommand label
1752 name "sub:How-to-write-PSL"
1753
1754 \end_inset
1755
1756
1757 \end_layout
1758
1759 \begin_layout Standard
1760 Data in PSL format can be output using
1761 \series bold
1762 write_psl:
1763 \end_layout
1764
1765 \begin_layout LyX-Code
1766 write_psl --result_out=<file>
1767 \end_layout
1768
1769 \begin_layout Section
1770 Manipulating Records
1771 \end_layout
1772
1773 \begin_layout Subsection
1774 How to select a few records?
1775 \begin_inset LatexCommand label
1776 name "sub:How-to-select-a-few-records"
1777
1778 \end_inset
1779
1780
1781 \end_layout
1782
1783 \begin_layout Standard
1784 To quickly get an overview of your data you can limit the data stream to
1785  show a few records.
1786  This also very useful to test the pipeline with a few records if you are
1787  setting up a complex analysis using several biopieces.
1788  That way you can inspect that all goes well before analyzing and waiting
1789  for the full data set.
1790  All of the read_<type> biopieces have the
1791 \begin_inset ERT
1792 status collapsed
1793
1794 \begin_layout Standard
1795
1796 -
1797 \backslash
1798 /-
1799 \end_layout
1800
1801 \end_inset
1802
1803 num switch which will take a number as argument and only that number of
1804  records will be read.
1805  So to read in the first 10 FASTA entries from a file:
1806 \end_layout
1807
1808 \begin_layout LyX-Code
1809 read_fasta --data_in=test.fna --num=10
1810 \end_layout
1811
1812 \begin_layout Standard
1813 Another way of doing this is to use
1814 \series bold
1815 head_records
1816 \series default
1817  will limit the stream to show the first 10 records (default):
1818 \end_layout
1819
1820 \begin_layout LyX-Code
1821 ...
1822  | head_records
1823 \end_layout
1824
1825 \begin_layout Standard
1826 Using
1827 \series bold
1828 head_records
1829 \series default
1830  directly after one of the read_<type> biopieces will be a lot slower than
1831  using the
1832 \begin_inset ERT
1833 status collapsed
1834
1835 \begin_layout Standard
1836
1837 -
1838 \backslash
1839 /-
1840 \end_layout
1841
1842 \end_inset
1843
1844 num switch with the read_<type> biopieces, however,
1845 \series bold
1846 head_records
1847 \series default
1848  can also be used to limit the output from all the other biopieces.
1849  It is also possible to give
1850 \series bold
1851 head_records
1852 \series default
1853  a number of records to show using the
1854 \begin_inset ERT
1855 status collapsed
1856
1857 \begin_layout Standard
1858
1859 -
1860 \backslash
1861 /-
1862 \end_layout
1863
1864 \end_inset
1865
1866 num switch.
1867  So to display the first 100 records do:
1868 \end_layout
1869
1870 \begin_layout LyX-Code
1871 ...
1872  | head_records --num=100
1873 \end_layout
1874
1875 \begin_layout Subsection
1876 How to select random records?
1877 \begin_inset LatexCommand label
1878 name "sub:How-to-select-random-records"
1879
1880 \end_inset
1881
1882
1883 \end_layout
1884
1885 \begin_layout Standard
1886 If you want to inspect a number of random records from the stream this can
1887  be done with the
1888 \series bold
1889 random_records
1890 \series default
1891  biopiece.
1892  So if you have 1 mio records in the stream and you want to select 1000
1893  random records do:
1894 \end_layout
1895
1896 \begin_layout LyX-Code
1897 ...
1898  | random_records --num=1000
1899 \end_layout
1900
1901 \begin_layout Subsection
1902 How to count all records in the data stream?
1903 \end_layout
1904
1905 \begin_layout Standard
1906 To count all the records in the data stream use
1907 \series bold
1908 count_records
1909 \series default
1910 , which adds one record (which is not included in the count) to the data
1911  stream.
1912  So to count the number of sequences in a FASTA file you can do this:
1913 \end_layout
1914
1915 \begin_layout LyX-Code
1916 cat test.fna | read_fasta | count_records --no_stream
1917 \end_layout
1918
1919 \begin_layout Standard
1920 Which will write the last record containing the count to 'stdout':
1921 \end_layout
1922
1923 \begin_layout LyX-Code
1924
1925 \size scriptsize
1926 count_records: 630
1927 \end_layout
1928
1929 \begin_layout LyX-Code
1930
1931 \size scriptsize
1932 ---
1933 \end_layout
1934
1935 \begin_layout Standard
1936 It is also possible to write the count to file using the
1937 \begin_inset ERT
1938 status collapsed
1939
1940 \begin_layout Standard
1941
1942 -
1943 \backslash
1944 /-
1945 \end_layout
1946
1947 \end_inset
1948
1949 result_out switch.
1950 \end_layout
1951
1952 \begin_layout Subsection
1953 How to get the length of record values?
1954 \begin_inset LatexCommand label
1955 name "sub:How-to-get-value_length"
1956
1957 \end_inset
1958
1959
1960 \end_layout
1961
1962 \begin_layout Standard
1963 Use the
1964 \series bold
1965 length_vals
1966 \series default
1967  biopiece to get the length of each value for a comma separated list of
1968  keys:
1969 \end_layout
1970
1971 \begin_layout LyX-Code
1972 ...
1973  | length_vals --keys=HIT,PATTERN
1974 \end_layout
1975
1976 \begin_layout Subsection
1977 How to grab specific records?
1978 \begin_inset LatexCommand label
1979 name "sub:How-to-grab"
1980
1981 \end_inset
1982
1983
1984 \end_layout
1985
1986 \begin_layout Standard
1987 The biopiece
1988 \series bold
1989 grab
1990 \series default
1991  is related to the Unix grep and locates records based on matching keys
1992  and/or values using either a pattern, a Perl regex, or a numerical evaluation.
1993  To easily
1994 \series bold
1995 grab
1996 \series default
1997  all records in the stream that has any mentioning of the pattern 'human'
1998  just pipe the data stream through
1999 \series bold
2000 grab
2001 \series default
2002  like this:
2003 \end_layout
2004
2005 \begin_layout LyX-Code
2006 ...
2007  | grab --pattern=human
2008 \end_layout
2009
2010 \begin_layout Standard
2011 This will search for the pattern 'human' in all keys and all values.
2012  The
2013 \begin_inset ERT
2014 status collapsed
2015
2016 \begin_layout Standard
2017
2018 -
2019 \backslash
2020 /-
2021 \end_layout
2022
2023 \end_inset
2024
2025 pattern switch takes a comma separated list of patterns, so in order to
2026  match multiple patterns do:
2027 \end_layout
2028
2029 \begin_layout LyX-Code
2030 ...
2031  | grab --pattern=human,mouse
2032 \end_layout
2033
2034 \begin_layout Standard
2035 It is also possible to use the
2036 \begin_inset ERT
2037 status collapsed
2038
2039 \begin_layout Standard
2040
2041 -
2042 \backslash
2043 /-
2044 \end_layout
2045
2046 \end_inset
2047
2048 pattern_in switch instead of
2049 \begin_inset ERT
2050 status collapsed
2051
2052 \begin_layout Standard
2053
2054 -
2055 \backslash
2056 /-
2057 \end_layout
2058
2059 \end_inset
2060
2061 pattern.
2062
2063 \begin_inset ERT
2064 status collapsed
2065
2066 \begin_layout Standard
2067
2068 -
2069 \backslash
2070 /-
2071 \end_layout
2072
2073 \end_inset
2074
2075 pattern_in is used to read a file with one pattern per line:
2076 \end_layout
2077
2078 \begin_layout LyX-Code
2079 ...
2080  | grab --pattern_in=patterns.txt
2081 \end_layout
2082
2083 \begin_layout Standard
2084 If you want the opposite result --- to find all records that does not match
2085  the patterns, add the
2086 \begin_inset ERT
2087 status collapsed
2088
2089 \begin_layout Standard
2090
2091 -
2092 \backslash
2093 /-
2094 \end_layout
2095
2096 \end_inset
2097
2098 invert switch, which not only works with the
2099 \begin_inset ERT
2100 status collapsed
2101
2102 \begin_layout Standard
2103
2104 -
2105 \backslash
2106 /-
2107 \end_layout
2108
2109 \end_inset
2110
2111 pattern switch, but also with
2112 \begin_inset ERT
2113 status collapsed
2114
2115 \begin_layout Standard
2116
2117 -
2118 \backslash
2119 /-
2120 \end_layout
2121
2122 \end_inset
2123
2124 regex and
2125 \begin_inset ERT
2126 status collapsed
2127
2128 \begin_layout Standard
2129
2130 -
2131 \backslash
2132 /-
2133 \end_layout
2134
2135 \end_inset
2136
2137 eval:
2138 \end_layout
2139
2140 \begin_layout LyX-Code
2141 ...
2142  | grab --pattern=human --invert
2143 \end_layout
2144
2145 \begin_layout Standard
2146 If you want to search the record keys only,
2147 \emph on
2148 e.g.
2149
2150 \emph default
2151  to find all records containing the key SEQ you can add the
2152 \begin_inset ERT
2153 status collapsed
2154
2155 \begin_layout Standard
2156
2157 -
2158 \backslash
2159 /-
2160 \end_layout
2161
2162 \end_inset
2163
2164 keys_only switch.
2165  This will prevent matching of SEQ in any record value, and in fact SEQ
2166  is a not uncommon peptide sequence you could get an unwanted record.
2167  Also, this will give an increase in speed since only the keys are searched:
2168 \end_layout
2169
2170 \begin_layout LyX-Code
2171 ...
2172  | grab --pattern=SEQ --keys_only
2173 \end_layout
2174
2175 \begin_layout Standard
2176 However, if you are interested in finding the peptide sequence SEQ and not
2177  the SEQ key, just add the
2178 \begin_inset ERT
2179 status collapsed
2180
2181 \begin_layout Standard
2182
2183 -
2184 \backslash
2185 /-
2186 \end_layout
2187
2188 \end_inset
2189
2190 vals_only switch instead:
2191 \end_layout
2192
2193 \begin_layout LyX-Code
2194 ...
2195  | grab --pattern=SEQ --vals_only
2196 \end_layout
2197
2198 \begin_layout Standard
2199 Also, if you want to grab for certain key/value pairs you can supply a comma
2200  separated list of keys whos values will then be searched using the
2201 \begin_inset ERT
2202 status collapsed
2203
2204 \begin_layout Standard
2205
2206 -
2207 \backslash
2208 /-
2209 \end_layout
2210
2211 \end_inset
2212
2213 keys switch.
2214  This is handy if your records contain large genomic sequences and you dont
2215  want to search the entire sequence for
2216 \emph on
2217 e.g.
2218
2219 \emph default
2220  the organism name --- it is much faster to tell
2221 \series bold
2222 grab
2223 \series default
2224  which keys to search the value for:
2225 \end_layout
2226
2227 \begin_layout LyX-Code
2228 ...
2229  | grab --pattern=human --keys=SEQ_NAME
2230 \end_layout
2231
2232 \begin_layout LyX-Code
2233
2234 \end_layout
2235
2236 \begin_layout Standard
2237 It is also possible to invoke flexible matching using regex (regular expressions
2238 ) instead of simple pattern matching.
2239  In
2240 \series bold
2241 grab
2242 \series default
2243  the regex engine is Perl based and allows use of different type of wild
2244  cards, alternatives,
2245 \emph on
2246 etc
2247 \emph default
2248
2249 \begin_inset Foot
2250 status open
2251
2252 \begin_layout Standard
2253 \begin_inset LatexCommand url
2254 target "http://perldoc.perl.org/perlreref.html"
2255
2256 \end_inset
2257
2258
2259 \end_layout
2260
2261 \end_inset
2262
2263 .
2264  If you want to
2265 \series bold
2266 grab
2267 \series default
2268  records withs the sequence ATCG or GCTA you can do this:
2269 \end_layout
2270
2271 \begin_layout LyX-Code
2272 ...
2273  | grab --regex='ATCG|GCTA'
2274 \end_layout
2275
2276 \begin_layout Standard
2277 Or if you want to find sequences beginning with ATCG:
2278 \end_layout
2279
2280 \begin_layout LyX-Code
2281 ...
2282  | grab --regex='^ATCG'
2283 \end_layout
2284
2285 \begin_layout Standard
2286 You can also use
2287 \series bold
2288 grab
2289 \series default
2290  to locate records that fulfill a numerical property using the
2291 \begin_inset ERT
2292 status collapsed
2293
2294 \begin_layout Standard
2295
2296 -
2297 \backslash
2298 /-
2299 \end_layout
2300
2301 \end_inset
2302
2303 eval switch witch takes an expression in three parts.
2304  The first part is the key that holds the value we want to evaluate, the
2305  second part holds one the six operators:
2306 \end_layout
2307
2308 \begin_layout Enumerate
2309 Greater than: >
2310 \end_layout
2311
2312 \begin_layout Enumerate
2313 Greater than or equal to: >=
2314 \end_layout
2315
2316 \begin_layout Enumerate
2317 Less than: <
2318 \end_layout
2319
2320 \begin_layout Enumerate
2321 Less than or equal to: <=
2322 \end_layout
2323
2324 \begin_layout Enumerate
2325 Equal to: =
2326 \end_layout
2327
2328 \begin_layout Enumerate
2329 Not equal to: !=
2330 \end_layout
2331
2332 \begin_layout Enumerate
2333 String wise equal to: eq
2334 \end_layout
2335
2336 \begin_layout Enumerate
2337 String wise not equal to: ne
2338 \end_layout
2339
2340 \begin_layout Standard
2341 And finally comes the number used in the evaluation.
2342  So to
2343 \series bold
2344 grab
2345 \series default
2346  all records with a sequence length greater than 30:
2347 \end_layout
2348
2349 \begin_layout LyX-Code
2350 ...
2351  length_seq | grab --eval='SEQ_LEN > 30'
2352 \end_layout
2353
2354 \begin_layout Standard
2355 If you want to locate all records containing the pattern 'human' and where
2356  the sequence length is greater that 30, you do this by running the stream
2357  through
2358 \series bold
2359 grab
2360 \series default
2361  twice:
2362 \end_layout
2363
2364 \begin_layout LyX-Code
2365 ...
2366  | grab --pattern='human' | length_seq | grab --eval='SEQ_LEN > 30'
2367 \end_layout
2368
2369 \begin_layout Standard
2370 Finally, it is possible to do fast matching of expressions from a file using
2371  the
2372 \begin_inset ERT
2373 status collapsed
2374
2375 \begin_layout Standard
2376
2377 -
2378 \backslash
2379 /-
2380 \end_layout
2381
2382 \end_inset
2383
2384 exact switch.
2385  Each of these expressions has to be matched exactly over the entrie length,
2386  which if useful if you have a file with accession numbers, that you want
2387  to locate in the stream:
2388 \end_layout
2389
2390 \begin_layout LyX-Code
2391 ...
2392  | grab --exact acc_no.txt | ...
2393 \end_layout
2394
2395 \begin_layout Standard
2396 Using
2397 \begin_inset ERT
2398 status collapsed
2399
2400 \begin_layout Standard
2401
2402 -
2403 \backslash
2404 /-
2405 \end_layout
2406
2407 \end_inset
2408
2409 exact is much faster than using
2410 \begin_inset ERT
2411 status collapsed
2412
2413 \begin_layout Standard
2414
2415 -
2416 \backslash
2417 /-
2418 \end_layout
2419
2420 \end_inset
2421
2422 pattern_in, because with
2423 \begin_inset ERT
2424 status collapsed
2425
2426 \begin_layout Standard
2427
2428 -
2429 \backslash
2430 /-
2431 \end_layout
2432
2433 \end_inset
2434
2435 exact the expression has to be complete matches, where
2436 \begin_inset ERT
2437 status collapsed
2438
2439 \begin_layout Standard
2440
2441 -
2442 \backslash
2443 /-
2444 \end_layout
2445
2446 \end_inset
2447
2448 pattern_in looks for subpatterns.
2449 \end_layout
2450
2451 \begin_layout Standard
2452 NB! To get the best speed performance, use the most restrictive
2453 \series bold
2454 grab
2455 \series default
2456  first.
2457 \end_layout
2458
2459 \begin_layout Subsection
2460 How to remove keys from records?
2461 \end_layout
2462
2463 \begin_layout Standard
2464 To remove one or more specific keys from all records in the data stream
2465  use
2466 \series bold
2467 remove_keys
2468 \series default
2469  like this:
2470 \end_layout
2471
2472 \begin_layout LyX-Code
2473 ...
2474  | remove_keys --keys=SEQ,SEQ_NAME
2475 \end_layout
2476
2477 \begin_layout Standard
2478 In the above example SEQ and SEQ_NAME will be removed from all records if
2479  they exists in these.
2480  If all keys are removed from a record, then the record will be removed.
2481 \end_layout
2482
2483 \begin_layout Subsection
2484 How to rename keys in records?
2485 \end_layout
2486
2487 \begin_layout Standard
2488 Sometimes you want to rename a record key,
2489 \emph on
2490 e.g.
2491
2492 \emph default
2493  if you have read in a two column table with sequence name and sequence
2494  in each column (see
2495 \begin_inset LatexCommand ref
2496 reference "sub:How-to-read-table"
2497
2498 \end_inset
2499
2500 ) without specifying the key names, then the sequence name will be called
2501  V0 and the sequence V1 as default in the
2502 \series bold
2503 read_tab
2504 \series default
2505  biopiece.
2506  To rename the V0 and V1 keys we need to run the stream through
2507 \series bold
2508 rename_keys
2509 \series default
2510  twice (one for each key to rename):
2511 \end_layout
2512
2513 \begin_layout LyX-Code
2514 ...
2515  | rename_keys --keys=V0,SEQ_NAME | rename_keys --keys=V1,SEQ
2516 \end_layout
2517
2518 \begin_layout Standard
2519 The first instance of
2520 \series bold
2521 rename_keys
2522 \series default
2523  replaces all the V0 keys with SEQ_NAME, and the second instance of
2524 \series bold
2525 rename_keys
2526 \series default
2527  replaces all the V1 keys with SEQ.
2528
2529 \emph on
2530 Et viola
2531 \emph default
2532  the data can now be used in the biopieces that requires these keys.
2533 \end_layout
2534
2535 \begin_layout Section
2536 Manipulating Sequences
2537 \end_layout
2538
2539 \begin_layout Subsection
2540 How to get sequence lengths?
2541 \end_layout
2542
2543 \begin_layout Standard
2544 The length for sequences in records can be determined with
2545 \series bold
2546 length_seq
2547 \series default
2548 , which adds the key SEQ_LEN to each record with the sequence length as
2549  the value.
2550  It also generates an extra record that is emitted last with the key TOTAL_SEQ_L
2551 EN showing the total length of all the sequences.
2552 \end_layout
2553
2554 \begin_layout LyX-Code
2555 read_fasta --data_in=<file> | length_seq
2556 \end_layout
2557
2558 \begin_layout Standard
2559 It is also possible to determine the sequence length using the generic tool
2560
2561 \series bold
2562 length_vals
2563 \series default
2564
2565 \begin_inset LatexCommand eqref
2566 reference "sub:How-to-get-value_length"
2567
2568 \end_inset
2569
2570 , which determines the length of the values for a given list of keys:
2571 \end_layout
2572
2573 \begin_layout LyX-Code
2574 read_fasta --data_in=<file> | length_vals --keys=SEQ
2575 \end_layout
2576
2577 \begin_layout Standard
2578 To obtain the total length of all sequences use
2579 \series bold
2580 sum_vals
2581 \series default
2582  like this:
2583 \end_layout
2584
2585 \begin_layout LyX-Code
2586 read_fasta --data_in=<file> | length_vals --keys=SEQ
2587 \end_layout
2588
2589 \begin_layout LyX-Code
2590 | sum_vals --keys=SEQ_LEN
2591 \end_layout
2592
2593 \begin_layout Standard
2594 The biopiece
2595 \series bold
2596 analyze_seq
2597 \series default
2598  will also determine the length of each sequence (see\InsetSpace ~
2599
2600 \begin_inset LatexCommand ref
2601 reference "sub:How-to-analyze"
2602
2603 \end_inset
2604
2605 ).
2606 \end_layout
2607
2608 \begin_layout Subsection
2609 How to analyze sequence composition?
2610 \begin_inset LatexCommand label
2611 name "sub:How-to-analyze"
2612
2613 \end_inset
2614
2615
2616 \end_layout
2617
2618 \begin_layout Standard
2619 If you want to find out the sequence type, composition, length, as well
2620  as GC content, indel content and proportions of soft and hard masked sequence,
2621  then use
2622 \series bold
2623 analyze_seq
2624 \series default
2625 .
2626  This handy biopiece will determine all these things per sequence from which
2627  it is easy to get an overview using the
2628 \series bold
2629 write_tab
2630 \series default
2631  biopiece to output a table (see\InsetSpace ~
2632
2633 \begin_inset LatexCommand ref
2634 reference "sub:How-to-write-tab"
2635
2636 \end_inset
2637
2638 ).
2639  So in order to determine the sequence composition of a FASTA file with
2640  just one entry containing the sequence 'ATCG' we just read the data with
2641
2642 \series bold
2643 read_fasta
2644 \series default
2645  and run the output through
2646 \series bold
2647 analyze_seq
2648 \series default
2649  which will add the analysis to the record like this:
2650 \end_layout
2651
2652 \begin_layout LyX-Code
2653 read_fasta --data_in=test.fna | analyze_seq ...
2654 \end_layout
2655
2656 \begin_layout LyX-Code
2657
2658 \end_layout
2659
2660 \begin_layout LyX-Code
2661
2662 \size scriptsize
2663 RES:D: 0
2664 \end_layout
2665
2666 \begin_layout LyX-Code
2667
2668 \size scriptsize
2669 MIX_INDEX: 0.55
2670 \end_layout
2671
2672 \begin_layout LyX-Code
2673
2674 \size scriptsize
2675 RES:W: 0
2676 \end_layout
2677
2678 \begin_layout LyX-Code
2679
2680 \size scriptsize
2681 RES:G: 16
2682 \end_layout
2683
2684 \begin_layout LyX-Code
2685
2686 \size scriptsize
2687 SOFT_MASK%: 63.75
2688 \end_layout
2689
2690 \begin_layout LyX-Code
2691
2692 \size scriptsize
2693 RES:B: 0
2694 \end_layout
2695
2696 \begin_layout LyX-Code
2697
2698 \size scriptsize
2699 RES:V: 0
2700 \end_layout
2701
2702 \begin_layout LyX-Code
2703
2704 \size scriptsize
2705 HARD_MASK%: 0.00
2706 \end_layout
2707
2708 \begin_layout LyX-Code
2709
2710 \size scriptsize
2711 RES:H: 0
2712 \end_layout
2713
2714 \begin_layout LyX-Code
2715
2716 \size scriptsize
2717 RES:S: 0
2718 \end_layout
2719
2720 \begin_layout LyX-Code
2721
2722 \size scriptsize
2723 RES:N: 0
2724 \end_layout
2725
2726 \begin_layout LyX-Code
2727
2728 \size scriptsize
2729 RES:.: 0
2730 \end_layout
2731
2732 \begin_layout LyX-Code
2733
2734 \size scriptsize
2735 GC%: 35.00
2736 \end_layout
2737
2738 \begin_layout LyX-Code
2739
2740 \size scriptsize
2741 RES:A: 8
2742 \end_layout
2743
2744 \begin_layout LyX-Code
2745
2746 \size scriptsize
2747 RES:Y: 0
2748 \end_layout
2749
2750 \begin_layout LyX-Code
2751
2752 \size scriptsize
2753 RES:M: 0
2754 \end_layout
2755
2756 \begin_layout LyX-Code
2757
2758 \size scriptsize
2759 RES:T: 44
2760 \end_layout
2761
2762 \begin_layout LyX-Code
2763
2764 \size scriptsize
2765 SEQ_TYPE: DNA
2766 \end_layout
2767
2768 \begin_layout LyX-Code
2769
2770 \size scriptsize
2771 RES:K: 0
2772 \end_layout
2773
2774 \begin_layout LyX-Code
2775
2776 \size scriptsize
2777 RES:~: 0
2778 \end_layout
2779
2780 \begin_layout LyX-Code
2781
2782 \size scriptsize
2783 SEQ: TTTCAGTTTGGGACGGAGTAAGGCCTTCCtttttttttttttttttttttttttttttgagaccgagtcttgctc
2784 tgtcg
2785 \end_layout
2786
2787 \begin_layout LyX-Code
2788
2789 \size scriptsize
2790 SEQ_LEN:
2791 \end_layout
2792
2793 \begin_layout LyX-Code
2794
2795 \size scriptsize
2796 80 RES:R: 0
2797 \end_layout
2798
2799 \begin_layout LyX-Code
2800
2801 \size scriptsize
2802 RES:C: 12
2803 \end_layout
2804
2805 \begin_layout LyX-Code
2806
2807 \size scriptsize
2808 RES:-: 0
2809 \end_layout
2810
2811 \begin_layout LyX-Code
2812
2813 \size scriptsize
2814 RES:U: 0
2815 \end_layout
2816
2817 \begin_layout LyX-Code
2818
2819 \size scriptsize
2820 ---
2821 \end_layout
2822
2823 \begin_layout LyX-Code
2824
2825 \end_layout
2826
2827 \begin_layout Standard
2828 Now to make a table of how may As, Ts, Cs, and Gs you can add the following:
2829 \end_layout
2830
2831 \begin_layout LyX-Code
2832 ...
2833  | analyze_seq | write_tab --keys=RES:A,RES:T,RES:C,RES:G
2834 \end_layout
2835
2836 \begin_layout Standard
2837 Or if you want to see the proportions of hard and soft masked sequence:
2838 \end_layout
2839
2840 \begin_layout LyX-Code
2841 ...
2842  | analyse_seq | write_tab --keys=HARD_MASK%,SOFT_MASK%
2843 \end_layout
2844
2845 \begin_layout Standard
2846 If you have a stack of sequences in one file and you want to determine the
2847  mean GC content you can do it using the
2848 \series bold
2849 mean_vals
2850 \series default
2851  biopiece:
2852 \end_layout
2853
2854 \begin_layout LyX-Code
2855 read_fasta --data_in=test.fna | analyze_seq | mean_vals --keys=GC%
2856 \end_layout
2857
2858 \begin_layout Standard
2859 Or if you want the total count of Ns you can use
2860 \series bold
2861 sum_vals
2862 \series default
2863  like this:
2864 \end_layout
2865
2866 \begin_layout LyX-Code
2867 read_fasta --data_in=test.fna | analyze_seq | sum_vals --keys=RES:N
2868 \end_layout
2869
2870 \begin_layout Standard
2871 The MIX_INDEX key is calculated as the count of the most common residue
2872  over the sequence length, and can be used as a cut-off for removing sequence
2873  tags consisting of mostly one nucleotide:
2874 \end_layout
2875
2876 \begin_layout LyX-Code
2877 read_fasta --data_in=test.fna | analyze_seq | grab --eval='MIX_INDEX<0.85'
2878 \end_layout
2879
2880 \begin_layout Subsection
2881 How to extract subsequences?
2882 \begin_inset LatexCommand label
2883 name "sub:How-to-extract"
2884
2885 \end_inset
2886
2887
2888 \end_layout
2889
2890 \begin_layout Standard
2891 In order to extract a subsequence from a longer sequence use the biopiece
2892  extract_seq, which will replace the sequence in the record with the subsequence
2893  (this behaviour should probably be modified to be dependant of a
2894 \begin_inset ERT
2895 status collapsed
2896
2897 \begin_layout Standard
2898
2899 -
2900 \backslash
2901 /-
2902 \end_layout
2903
2904 \end_inset
2905
2906 replace or a
2907 \begin_inset ERT
2908 status collapsed
2909
2910 \begin_layout Standard
2911
2912 -
2913 \backslash
2914 /-
2915 \end_layout
2916
2917 \end_inset
2918
2919 no_replace switch
2920 \begin_inset Note Note
2921 status collapsed
2922
2923 \begin_layout Standard
2924 also in split_seq
2925 \end_layout
2926
2927 \end_inset
2928
2929 ).
2930  So to extract the first 20 residues from all sequences do (first residue
2931  is designated 1):
2932 \end_layout
2933
2934 \begin_layout LyX-Code
2935 ...
2936  | extract_seq --beg=1 --len=20
2937 \end_layout
2938
2939 \begin_layout Standard
2940 You can also specify a begin and end coordinate set:
2941 \end_layout
2942
2943 \begin_layout LyX-Code
2944 ...
2945  | extract_seq --beg=20 --end=40
2946 \end_layout
2947
2948 \begin_layout Standard
2949 If you want the subsequences from position 20 to the sequence end do:
2950 \end_layout
2951
2952 \begin_layout LyX-Code
2953 ...
2954  | extract_seq --beg=20
2955 \end_layout
2956
2957 \begin_layout Standard
2958 If you want to extract subsequences a given distance from the sequence end
2959  you can do this by reversing the sequence with the biopiece
2960 \series bold
2961 reverse_seq
2962 \series default
2963
2964 \begin_inset LatexCommand eqref
2965 reference "sub:How-to-reverse-seq"
2966
2967 \end_inset
2968
2969 , followed by
2970 \series bold
2971 extract_seq
2972 \series default
2973  to get the subsequence, and then
2974 \series bold
2975 reverse_seq
2976 \series default
2977  again to get the subsequence back in the original orientation:
2978 \end_layout
2979
2980 \begin_layout LyX-Code
2981 read_fasta --data_in=test.fna | reverse_seq
2982 \end_layout
2983
2984 \begin_layout LyX-Code
2985 | extract_seq --beg=10 --len=10 | reverse_seq
2986 \end_layout
2987
2988 \begin_layout Subsection
2989 How to get genomic sequence?
2990 \begin_inset LatexCommand label
2991 name "sub:How-to-get-genomic-sequence"
2992
2993 \end_inset
2994
2995
2996 \end_layout
2997
2998 \begin_layout Standard
2999 The biopiece
3000 \series bold
3001 get_genomic_seq
3002 \series default
3003  can extract subsequences for a given genome specified with the
3004 \begin_inset ERT
3005 status collapsed
3006
3007 \begin_layout Standard
3008
3009 -
3010 \backslash
3011 /-
3012 \end_layout
3013
3014 \end_inset
3015
3016 genome switch explicitly using the
3017 \begin_inset ERT
3018 status collapsed
3019
3020 \begin_layout Standard
3021
3022 -
3023 \backslash
3024 /-
3025 \end_layout
3026
3027 \end_inset
3028
3029 beg and
3030 \begin_inset ERT
3031 status collapsed
3032
3033 \begin_layout Standard
3034
3035 -
3036 \backslash
3037 /-
3038 \end_layout
3039
3040 \end_inset
3041
3042 end/
3043 \begin_inset ERT
3044 status collapsed
3045
3046 \begin_layout Standard
3047
3048 -
3049 \backslash
3050 /-
3051 \end_layout
3052
3053 \end_inset
3054
3055 len switches:
3056 \end_layout
3057
3058 \begin_layout LyX-Code
3059 get_genome_seq --genome=<genome> --beg=1 --len=100
3060 \end_layout
3061
3062 \begin_layout Standard
3063 Alternatively,
3064 \series bold
3065 get_genome_seq
3066 \series default
3067  can be used to append the corresponding sequence to BED, PSL, and BLAST
3068  records:
3069 \end_layout
3070
3071 \begin_layout LyX-Code
3072 read_bed --data_in=<BED file> | get_genome_seq --genome=<genome>
3073 \end_layout
3074
3075 \begin_layout Standard
3076 It is also possible to include flaking sequence using the
3077 \begin_inset ERT
3078 status collapsed
3079
3080 \begin_layout Standard
3081
3082 -
3083 \backslash
3084 /-
3085 \end_layout
3086
3087 \end_inset
3088
3089 flank switch.
3090  So to include 50 nucleotides upstream and 50 nucleotides downstream for
3091  each BED entry do:
3092 \end_layout
3093
3094 \begin_layout LyX-Code
3095 read_bed --data_in=<BED file> | get_genome_seq --genome=<genome> --flank=50
3096 \end_layout
3097
3098 \begin_layout Subsection
3099 How to upper-case sequences?
3100 \end_layout
3101
3102 \begin_layout Standard
3103 Sequences can be shifted from lower case to upper case using
3104 \series bold
3105 uppercase_seq
3106 \series default
3107 :
3108 \end_layout
3109
3110 \begin_layout LyX-Code
3111 ...
3112  | uppercase_seq
3113 \end_layout
3114
3115 \begin_layout Subsection
3116 How to reverse sequences?
3117 \begin_inset LatexCommand label
3118 name "sub:How-to-reverse-seq"
3119
3120 \end_inset
3121
3122
3123 \end_layout
3124
3125 \begin_layout Standard
3126 The order of residues in a sequence can be reversed using reverse_seq:
3127 \end_layout
3128
3129 \begin_layout LyX-Code
3130 ...
3131  | reverse_seq
3132 \end_layout
3133
3134 \begin_layout Standard
3135 Note that in order to reverse/complement a sequence you also need the
3136 \series bold
3137 complement_seq
3138 \series default
3139  biopiece (see\InsetSpace ~
3140
3141 \begin_inset LatexCommand ref
3142 reference "sub:How-to-complement"
3143
3144 \end_inset
3145
3146 ).
3147 \end_layout
3148
3149 \begin_layout Subsection
3150 How to complement sequences?
3151 \begin_inset LatexCommand label
3152 name "sub:How-to-complement"
3153
3154 \end_inset
3155
3156
3157 \end_layout
3158
3159 \begin_layout Standard
3160 DNA and RNA sequences can be complemented with
3161 \series bold
3162 complement_seq
3163 \series default
3164 , which automagically determines the sequence type:
3165 \end_layout
3166
3167 \begin_layout LyX-Code
3168 ...
3169  | complement_seq
3170 \end_layout
3171
3172 \begin_layout Standard
3173 Note that in order to reverse/complement a sequence you also need the
3174 \series bold
3175 reverse_seq
3176 \series default
3177  biopiece (see\InsetSpace ~
3178
3179 \begin_inset LatexCommand ref
3180 reference "sub:How-to-reverse-seq"
3181
3182 \end_inset
3183
3184 ).
3185 \end_layout
3186
3187 \begin_layout Subsection
3188 How to remove indels from sequnces?
3189 \end_layout
3190
3191 \begin_layout Standard
3192 Indels can be removed from sequences with the
3193 \series bold
3194 remove_indels
3195 \series default
3196  biopiece.
3197  This is useful if you have aligned some sequences (see\InsetSpace ~
3198
3199 \begin_inset LatexCommand ref
3200 reference "sub:How-to-align"
3201
3202 \end_inset
3203
3204 ) and extracted (see\InsetSpace ~
3205
3206 \begin_inset LatexCommand ref
3207 reference "sub:How-to-extract"
3208
3209 \end_inset
3210
3211 ) a block of subsequences from the alignment and you want to use these sequence
3212  in a search where you need to remove the indels first.
3213  '-', '~', and '.' are considered indels:
3214 \end_layout
3215
3216 \begin_layout LyX-Code
3217 ...
3218  | remove_indels
3219 \end_layout
3220
3221 \begin_layout Subsection
3222 How to shuffle sequences?
3223 \end_layout
3224
3225 \begin_layout Standard
3226 All residues in sequences in the stream can be shuffled to random positions
3227  using the
3228 \series bold
3229 shuffle_seq
3230 \series default
3231  biopiece:
3232 \end_layout
3233
3234 \begin_layout LyX-Code
3235 ...
3236  | shuffle_seq
3237 \end_layout
3238
3239 \begin_layout Subsection
3240 How to split sequences into overlapping subsequences?
3241 \end_layout
3242
3243 \begin_layout Standard
3244 Sequences can be slit into overlapping subsequences with the
3245 \series bold
3246 split_seq
3247 \series default
3248  biopiece.
3249 \end_layout
3250
3251 \begin_layout LyX-Code
3252 ...
3253  | split_seq --word_size=20 --uniq
3254 \end_layout
3255
3256 \begin_layout Subsection
3257 How to determine the oligo frequency?
3258 \end_layout
3259
3260 \begin_layout Standard
3261 In order to determine if any oligo usage is over represented in one or more
3262  sequences you can determine the frequency of oligos of a given size with
3263
3264 \series bold
3265 oligo_freq
3266 \series default
3267 :
3268 \end_layout
3269
3270 \begin_layout LyX-Code
3271 ...
3272  | oligo_freq --word_size=4
3273 \end_layout
3274
3275 \begin_layout Standard
3276 And if you have more than one sequence and want to accumulate the frequences
3277  you need the
3278 \begin_inset ERT
3279 status collapsed
3280
3281 \begin_layout Standard
3282
3283 -
3284 \backslash
3285 /-
3286 \end_layout
3287
3288 \end_inset
3289
3290 all switch:
3291 \end_layout
3292
3293 \begin_layout LyX-Code
3294 ...
3295  | oligo_freq --word_size=4 --all
3296 \end_layout
3297
3298 \begin_layout Standard
3299 To get a meaningful result you need to write the resulting frequencies as
3300  a table with
3301 \series bold
3302 write_tab
3303 \series default
3304  (see\InsetSpace ~
3305
3306 \begin_inset LatexCommand ref
3307 reference "sub:How-to-write-tab"
3308
3309 \end_inset
3310
3311 ), but first it is important to
3312 \series bold
3313 grab
3314 \series default
3315  (see\InsetSpace ~
3316
3317 \begin_inset LatexCommand ref
3318 reference "sub:How-to-grab"
3319
3320 \end_inset
3321
3322 ) the records with the frequencies to avoid full length sequences in the
3323  table:
3324 \end_layout
3325
3326 \begin_layout LyX-Code
3327 ...
3328  | oligo_freq --word_size=4 --all | grab --pattern=OLIGO --keys_only
3329 \end_layout
3330
3331 \begin_layout LyX-Code
3332 | write_tab --no_stream
3333 \end_layout
3334
3335 \begin_layout Standard
3336 And the resulting frequency table can be sorted with Unix sort (man sort).
3337 \end_layout
3338
3339 \begin_layout Subsection
3340 How to search for sequences in genomes?
3341 \end_layout
3342
3343 \begin_layout Standard
3344 See the following biopiece:
3345 \end_layout
3346
3347 \begin_layout Itemize
3348
3349 \series bold
3350 patscan_seq
3351 \series default
3352
3353 \begin_inset LatexCommand eqref
3354 reference "sub:How-to-use-patscan"
3355
3356 \end_inset
3357
3358
3359 \end_layout
3360
3361 \begin_layout Itemize
3362
3363 \series bold
3364 blat_seq
3365 \series default
3366
3367 \begin_inset LatexCommand eqref
3368 reference "sub:How-to-use-BLAT"
3369
3370 \end_inset
3371
3372
3373 \end_layout
3374
3375 \begin_layout Itemize
3376
3377 \series bold
3378 blast_seq
3379 \series default
3380
3381 \begin_inset LatexCommand eqref
3382 reference "sub:How-to-use-BLAST"
3383
3384 \end_inset
3385
3386
3387 \end_layout
3388
3389 \begin_layout Itemize
3390
3391 \series bold
3392 vmatch_seq
3393 \series default
3394
3395 \begin_inset LatexCommand eqref
3396 reference "sub:How-to-use-Vmatch"
3397
3398 \end_inset
3399
3400
3401 \end_layout
3402
3403 \begin_layout Subsection
3404 How to search sequences for a pattern?
3405 \begin_inset LatexCommand label
3406 name "sub:How-to-use-patscan"
3407
3408 \end_inset
3409
3410
3411 \end_layout
3412
3413 \begin_layout Standard
3414 It is possible to search sequences in the data stream for patterns using
3415  the
3416 \series bold
3417 patscan_seq
3418 \series default
3419  biopiece which utilizes the powerful scan_for_matches engine.
3420  Consult the documentation for scan_for_matches in order to learn how to
3421  define patterns (the documentation is included in Appendix\InsetSpace ~
3422
3423 \begin_inset LatexCommand ref
3424 reference "sec:scan_for_matches-README"
3425
3426 \end_inset
3427
3428 ).
3429 \end_layout
3430
3431 \begin_layout Standard
3432 To search all sequences for a simple pattern consisting of the sequence
3433  ATCGATCG allowing for 3 mismatches, 2 insertions and 1 deletion:
3434 \end_layout
3435
3436 \begin_layout LyX-Code
3437 read_fasta --data_in=<file> | patscan_seq --pattern='ATCGATCG[3,2,1]'
3438 \end_layout
3439
3440 \begin_layout Standard
3441 The
3442 \begin_inset ERT
3443 status collapsed
3444
3445 \begin_layout Standard
3446
3447 -
3448 \backslash
3449 /-
3450 \end_layout
3451
3452 \end_inset
3453
3454 pattern switch takes a comma seperated list of patterns, so if you want
3455  to search for more that one pattern do:
3456 \end_layout
3457
3458 \begin_layout LyX-Code
3459 ...
3460  | patscan_seq --pattern='ATCGATCG[3,2,1],GCTAGCTA[3,2,1]'
3461 \end_layout
3462
3463 \begin_layout Standard
3464 It is also possible to have a list of different patterns to search for in
3465  a file with one pattern per line.
3466  In order to get
3467 \series bold
3468 patscan_seq
3469 \series default
3470  to read these patterns use the
3471 \begin_inset ERT
3472 status collapsed
3473
3474 \begin_layout Standard
3475
3476 -
3477 \backslash
3478 /-
3479 \end_layout
3480
3481 \end_inset
3482
3483 pattern_in switch:
3484 \end_layout
3485
3486 \begin_layout LyX-Code
3487 ...
3488  | patscan_seq --pattern_in=<file>
3489 \end_layout
3490
3491 \begin_layout Standard
3492 To also scan the complementary strand in nucleotide sequences (
3493 \series bold
3494 patscan_seq
3495 \series default
3496  automagically determines the sequence type) you need to add the
3497 \begin_inset ERT
3498 status collapsed
3499
3500 \begin_layout Standard
3501
3502 -
3503 \backslash
3504 /-
3505 \end_layout
3506
3507 \end_inset
3508
3509 comp switch:
3510 \end_layout
3511
3512 \begin_layout LyX-Code
3513 ...
3514  | patscan_seq --pattern=<pattern> --comp
3515 \end_layout
3516
3517 \begin_layout Standard
3518 It is also possible to use
3519 \series bold
3520 patscan_seq
3521 \series default
3522  to output those records that does not contain a certain pattern by using
3523  the
3524 \begin_inset ERT
3525 status collapsed
3526
3527 \begin_layout Standard
3528
3529 -
3530 \backslash
3531 /-
3532 \end_layout
3533
3534 \end_inset
3535
3536 invert switch:
3537 \end_layout
3538
3539 \begin_layout LyX-Code
3540 ...
3541  | patscan_seq --pattern=<pattern> --invert
3542 \end_layout
3543
3544 \begin_layout Standard
3545 Finally,
3546 \series bold
3547 patscan_seq
3548 \series default
3549  can also scan for patterns in a given genome sequence, instead of sequences
3550  in the stream, using the
3551 \begin_inset ERT
3552 status open
3553
3554 \begin_layout Standard
3555
3556 -
3557 \backslash
3558 /-
3559 \end_layout
3560
3561 \end_inset
3562
3563 genome switch:
3564 \end_layout
3565
3566 \begin_layout LyX-Code
3567 patscan --pattern=<pattern> --genome=<genome>
3568 \end_layout
3569
3570 \begin_layout Subsection
3571 How to use BLAT for sequence search?
3572 \begin_inset LatexCommand label
3573 name "sub:How-to-use-BLAT"
3574
3575 \end_inset
3576
3577
3578 \end_layout
3579
3580 \begin_layout Standard
3581 Sequences in the data stream can be matched against supported genomes using
3582
3583 \series bold
3584 blat_seq
3585 \series default
3586  which is a biopiece using BLAT as the name might suggest.
3587  Currently only Mouse and Human genomes are available and it is not possible
3588  to use OOC files since there is still a need for a local repository for
3589  genome files.
3590  Otherwise it is just:
3591 \end_layout
3592
3593 \begin_layout LyX-Code
3594 read_fasta --data_in=<file> | blat_seq --genome=<genome>
3595 \end_layout
3596
3597 \begin_layout Standard
3598 The search results can then be written to file with
3599 \series bold
3600 write_psl
3601 \series default
3602  (see\InsetSpace ~
3603
3604 \begin_inset LatexCommand ref
3605 reference "sub:How-to-write-PSL"
3606
3607 \end_inset
3608
3609 ) or
3610 \series bold
3611 write_bed
3612 \series default
3613  (see\InsetSpace ~
3614
3615 \begin_inset LatexCommand ref
3616 reference "sub:How-to-write-BED"
3617
3618 \end_inset
3619
3620 ) allthough with
3621 \series bold
3622 write_bed
3623 \series default
3624  some information will be lost).
3625  It is also possible to plot chromosome distribution of the search results
3626  using
3627 \series bold
3628 plot_chrdist
3629 \series default
3630  (see\InsetSpace ~
3631
3632 \begin_inset LatexCommand ref
3633 reference "sub:How-to-plot-chrdist"
3634
3635 \end_inset
3636
3637 ) or the distribution of the match lengths using
3638 \series bold
3639 plot_lendist
3640 \series default
3641  (see\InsetSpace ~
3642
3643 \begin_inset LatexCommand ref
3644 reference "sub:How-to-plot-lendist"
3645
3646 \end_inset
3647
3648 ) or a karyogram with the hits using
3649 \series bold
3650 plot_karyogram
3651 \series default
3652  (see\InsetSpace ~
3653
3654 \begin_inset LatexCommand ref
3655 reference "sub:How-to-plot-karyogram"
3656
3657 \end_inset
3658
3659 ).
3660 \end_layout
3661
3662 \begin_layout Subsection
3663 How to use BLAST for sequence search?
3664 \begin_inset LatexCommand label
3665 name "sub:How-to-use-BLAST"
3666
3667 \end_inset
3668
3669
3670 \end_layout
3671
3672 \begin_layout Standard
3673 Two biopieces exist for blasting sequences:
3674 \series bold
3675 create_blast_db
3676 \series default
3677  is used to create the BLAST database required for BLAST which is queried
3678  using the biopiece
3679 \series bold
3680 blast_seq
3681 \series default
3682 .
3683  So in order to create a BLAST database from sequences in the data stream
3684  you simple run:
3685 \end_layout
3686
3687 \begin_layout LyX-Code
3688 ...
3689  | create_blast_db --database=my_database --no_stream
3690 \end_layout
3691
3692 \begin_layout Standard
3693 The type of sequence to use for the database is automagically determined
3694  by
3695 \series bold
3696 create_blast_db
3697 \series default
3698 , but don't have a mixture of peptide and nucleic acids sequences in the
3699  stream.
3700  The
3701 \begin_inset ERT
3702 status collapsed
3703
3704 \begin_layout Standard
3705
3706 -
3707 \backslash
3708 /-
3709 \end_layout
3710
3711 \end_inset
3712
3713 database switch takes a path as argument, but will default to 'blastdb_<time_sta
3714 mp> if not set.
3715 \end_layout
3716
3717 \begin_layout Standard
3718 The resulting database can now be queried with sequences in another data
3719  stream using
3720 \series bold
3721 blast_seq
3722 \series default
3723 :
3724 \end_layout
3725
3726 \begin_layout LyX-Code
3727 ...
3728  | blast_seq --database=my_database
3729 \end_layout
3730
3731 \begin_layout Standard
3732 Again, the sequence type is determined automagically and the appropriate
3733  BLAST program is guessed (see below table), however, the program name can
3734  be overruled with the
3735 \begin_inset ERT
3736 status collapsed
3737
3738 \begin_layout Standard
3739
3740 -
3741 \backslash
3742 /-
3743 \end_layout
3744
3745 \end_inset
3746
3747 program switch.
3748 \end_layout
3749
3750 \begin_layout Standard
3751 \noindent
3752 \align center
3753 \begin_inset Tabular
3754 <lyxtabular version="3" rows="5" columns="3">
3755 <features>
3756 <column alignment="center" valignment="top" width="0">
3757 <column alignment="center" valignment="top" width="0">
3758 <column alignment="center" valignment="top" width="0">
3759 <row bottomline="true">
3760 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3761 \begin_inset Text
3762
3763 \begin_layout Standard
3764 Subject sequence
3765 \end_layout
3766
3767 \end_inset
3768 </cell>
3769 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3770 \begin_inset Text
3771
3772 \begin_layout Standard
3773 Query sequence
3774 \end_layout
3775
3776 \end_inset
3777 </cell>
3778 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3779 \begin_inset Text
3780
3781 \begin_layout Standard
3782 Program guess
3783 \end_layout
3784
3785 \end_inset
3786 </cell>
3787 </row>
3788 <row>
3789 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3790 \begin_inset Text
3791
3792 \begin_layout Standard
3793 Nucleotide
3794 \end_layout
3795
3796 \end_inset
3797 </cell>
3798 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3799 \begin_inset Text
3800
3801 \begin_layout Standard
3802 Nucleotide
3803 \end_layout
3804
3805 \end_inset
3806 </cell>
3807 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3808 \begin_inset Text
3809
3810 \begin_layout Standard
3811 blastn
3812 \end_layout
3813
3814 \end_inset
3815 </cell>
3816 </row>
3817 <row>
3818 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3819 \begin_inset Text
3820
3821 \begin_layout Standard
3822 Protein
3823 \end_layout
3824
3825 \end_inset
3826 </cell>
3827 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3828 \begin_inset Text
3829
3830 \begin_layout Standard
3831 Protein
3832 \end_layout
3833
3834 \end_inset
3835 </cell>
3836 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3837 \begin_inset Text
3838
3839 \begin_layout Standard
3840 blastp
3841 \end_layout
3842
3843 \end_inset
3844 </cell>
3845 </row>
3846 <row>
3847 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3848 \begin_inset Text
3849
3850 \begin_layout Standard
3851 Protein
3852 \end_layout
3853
3854 \end_inset
3855 </cell>
3856 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3857 \begin_inset Text
3858
3859 \begin_layout Standard
3860 Nucleotide
3861 \end_layout
3862
3863 \end_inset
3864 </cell>
3865 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3866 \begin_inset Text
3867
3868 \begin_layout Standard
3869 blastx
3870 \end_layout
3871
3872 \end_inset
3873 </cell>
3874 </row>
3875 <row>
3876 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3877 \begin_inset Text
3878
3879 \begin_layout Standard
3880 Nucleotide
3881 \end_layout
3882
3883 \end_inset
3884 </cell>
3885 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
3886 \begin_inset Text
3887
3888 \begin_layout Standard
3889 Protein
3890 \end_layout
3891
3892 \end_inset
3893 </cell>
3894 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
3895 \begin_inset Text
3896
3897 \begin_layout Standard
3898 tblastn
3899 \end_layout
3900
3901 \end_inset
3902 </cell>
3903 </row>
3904 </lyxtabular>
3905
3906 \end_inset
3907
3908
3909 \end_layout
3910
3911 \begin_layout Standard
3912 Finally, it is also possible to use
3913 \series bold
3914 blast_seq
3915 \series default
3916  for blasting sequences agains a preformatted genome using the
3917 \begin_inset ERT
3918 status collapsed
3919
3920 \begin_layout Standard
3921
3922 -
3923 \backslash
3924 /-
3925 \end_layout
3926
3927 \end_inset
3928
3929 genome switch instead of the
3930 \begin_inset ERT
3931 status collapsed
3932
3933 \begin_layout Standard
3934
3935 -
3936 \backslash
3937 /-
3938 \end_layout
3939
3940 \end_inset
3941
3942 database switch:
3943 \end_layout
3944
3945 \begin_layout LyX-Code
3946 ...
3947  | blast_seq --genome=<genome>
3948 \end_layout
3949
3950 \begin_layout Subsection
3951 How to use Vmatch for sequence search?
3952 \begin_inset LatexCommand label
3953 name "sub:How-to-use-Vmatch"
3954
3955 \end_inset
3956
3957
3958 \end_layout
3959
3960 \begin_layout Standard
3961 The powerful suffix array software package Vmatch
3962 \begin_inset Foot
3963 status collapsed
3964
3965 \begin_layout Standard
3966 \begin_inset LatexCommand url
3967 target "http://www.vmatch.de/"
3968
3969 \end_inset
3970
3971
3972 \end_layout
3973
3974 \end_inset
3975
3976  can be used for exact mapping of sequences against indexed genomes using
3977  the biopiece
3978 \series bold
3979 vmatch_seq
3980 \series default
3981 , which will e.g.
3982  map 700000 ESTs to the human genome locating all 160 mio hits in less than
3983  an hour.
3984  Only nucleotide sequences and sequences longer than 11 nucleotides will
3985  be mapped.
3986  It is recommended that sequences consisting of mostly one nucleotide type
3987  are removed.
3988  This can be done with the
3989 \series bold
3990 analyze_seq
3991 \series default
3992  biopiece
3993 \begin_inset LatexCommand eqref
3994 reference "sub:How-to-analyze"
3995
3996 \end_inset
3997
3998 .
3999 \end_layout
4000
4001 \begin_layout LyX-Code
4002 ...
4003  | vmatch_seq --genome=<genome>
4004 \end_layout
4005
4006 \begin_layout Standard
4007 It is also possible to allow for mismatches using the
4008 \begin_inset ERT
4009 status collapsed
4010
4011 \begin_layout Standard
4012
4013 -
4014 \backslash
4015 /-
4016 \end_layout
4017
4018 \end_inset
4019
4020 hamming_dist switch.
4021  So to allow for 2 mismatches:
4022 \end_layout
4023
4024 \begin_layout LyX-Code
4025 ...
4026  | vmatch_seq --genome=<genome> --hamming_dist=2
4027 \end_layout
4028
4029 \begin_layout Standard
4030 Or to allow for 10% mismathing nucleotides:
4031 \end_layout
4032
4033 \begin_layout LyX-Code
4034 ...
4035  | vmatch_seq --genome=<genome> --hamming_dist=10p
4036 \end_layout
4037
4038 \begin_layout Standard
4039 To allow both indels and mismatches use the
4040 \begin_inset ERT
4041 status collapsed
4042
4043 \begin_layout Standard
4044
4045 -
4046 \backslash
4047 /-
4048 \end_layout
4049
4050 \end_inset
4051
4052 edit_dist switch.
4053  So to allow for one mismatch or one indel:
4054 \end_layout
4055
4056 \begin_layout LyX-Code
4057 ...
4058  | vmatch_seq --genome=<genome> --hamming_dist=1
4059 \end_layout
4060
4061 \begin_layout Standard
4062 Or to allow for 5% indels or mismatches:
4063 \end_layout
4064
4065 \begin_layout LyX-Code
4066 ...
4067  | vmatch_seq --genome=<genome> --hamming_dist=5p
4068 \end_layout
4069
4070 \begin_layout Standard
4071 Note that using
4072 \begin_inset ERT
4073 status collapsed
4074
4075 \begin_layout Standard
4076
4077 -
4078 \backslash
4079 /-
4080 \end_layout
4081
4082 \end_inset
4083
4084 hamming_dist or
4085 \begin_inset ERT
4086 status collapsed
4087
4088 \begin_layout Standard
4089
4090 -
4091 \backslash
4092 /-
4093 \end_layout
4094
4095 \end_inset
4096
4097 edit_dist greatly slows down vmatch considerably --- use with care.
4098 \end_layout
4099
4100 \begin_layout Standard
4101 The resulting SCORE key can be replaced to hold the number of genome matches
4102  of a given sequence (multi-mappers) is the
4103 \begin_inset ERT
4104 status collapsed
4105
4106 \begin_layout Standard
4107
4108 -
4109 \backslash
4110 /-
4111 \end_layout
4112
4113 \end_inset
4114
4115 count switch is given.
4116 \end_layout
4117
4118 \begin_layout Subsection
4119 How to find all matches between sequences?
4120 \begin_inset LatexCommand label
4121 name "sub:How-to-find-matches"
4122
4123 \end_inset
4124
4125
4126 \end_layout
4127
4128 \begin_layout Standard
4129 All matches between two sequences can be determined with the biopiece
4130 \series bold
4131 match_seq
4132 \series default
4133 .
4134  The match finding engine underneath the hood of
4135 \series bold
4136 match_seq
4137 \series default
4138  is the super fast suffix tree program MUMmer
4139 \begin_inset Foot
4140 status collapsed
4141
4142 \begin_layout Standard
4143 \begin_inset LatexCommand url
4144 target "http://mummer.sourceforge.net/"
4145
4146 \end_inset
4147
4148
4149 \end_layout
4150
4151 \end_inset
4152
4153 , which will locate all forward and reverse matches between huge sequences
4154  in a matter of minutes (if the repeat count is not too high and if the
4155  word size used is appropriate).
4156  Matching two
4157 \emph on
4158 Helicobacter pylori
4159 \emph default
4160  genomes (1.7Mbp) takes around 10 seconds:
4161 \end_layout
4162
4163 \begin_layout LyX-Code
4164 ...
4165  | match_seq --word_size=20 --direction=both
4166 \end_layout
4167
4168 \begin_layout Standard
4169 The output from
4170 \series bold
4171 match_seq
4172 \series default
4173  can be used to generate a dot plot with
4174 \series bold
4175 plot_matches
4176 \series default
4177  (see\InsetSpace ~
4178
4179 \begin_inset LatexCommand ref
4180 reference "sub:How-to-generate-dotplot"
4181
4182 \end_inset
4183
4184 ).
4185 \end_layout
4186
4187 \begin_layout Subsection
4188 How to align sequences?
4189 \begin_inset LatexCommand label
4190 name "sub:How-to-align"
4191
4192 \end_inset
4193
4194
4195 \end_layout
4196
4197 \begin_layout Standard
4198 Sequences in the stream can be aligned with the
4199 \series bold
4200 align_seq
4201 \series default
4202  biopiece that uses Muscle
4203 \begin_inset Foot
4204 status open
4205
4206 \begin_layout Standard
4207 \begin_inset LatexCommand url
4208 target "http://www.drive5.com/muscle/muscle.html"
4209
4210 \end_inset
4211
4212
4213 \end_layout
4214
4215 \end_inset
4216
4217  as aligment engine.
4218  Currently you cannot change any of the Muscle alignment parameters and
4219
4220 \series bold
4221 align_seq
4222 \series default
4223  will create an alignment based on the defaults (which are really good!):
4224 \end_layout
4225
4226 \begin_layout LyX-Code
4227 ...
4228  | align_seq
4229 \end_layout
4230
4231 \begin_layout Standard
4232 The aligned output can be written to file in FASTA format using
4233 \series bold
4234 write_fasta
4235 \series default
4236  (see\InsetSpace ~
4237
4238 \begin_inset LatexCommand ref
4239 reference "sub:How-to-write-fasta"
4240
4241 \end_inset
4242
4243 ) or in pretty text using
4244 \series bold
4245 write_align
4246 \series default
4247  (see\InsetSpace ~
4248
4249 \begin_inset LatexCommand ref
4250 reference "sub:How-to-write-alignment"
4251
4252 \end_inset
4253
4254 ).
4255 \end_layout
4256
4257 \begin_layout Subsection
4258 How to create a weight matrix?
4259 \end_layout
4260
4261 \begin_layout Standard
4262 If you want a weight matrix to show the sequence composition of a stack
4263  of sequences you can use the biopiece create_weight_matrix:
4264 \end_layout
4265
4266 \begin_layout LyX-Code
4267 ...
4268  | create_weight_matrix
4269 \end_layout
4270
4271 \begin_layout Standard
4272 The result can be output in percent using the
4273 \begin_inset ERT
4274 status open
4275
4276 \begin_layout Standard
4277
4278 -
4279 \backslash
4280 /-
4281 \end_layout
4282
4283 \end_inset
4284
4285 percent switch:
4286 \end_layout
4287
4288 \begin_layout LyX-Code
4289 ...
4290  | create_weight_matrix --percent
4291 \end_layout
4292
4293 \begin_layout Standard
4294 The weight matrix can be written as tabular output with
4295 \series bold
4296 write_tab
4297 \series default
4298  (see\InsetSpace ~
4299
4300 \begin_inset LatexCommand ref
4301 reference "sub:How-to-write-tab"
4302
4303 \end_inset
4304
4305 ) after removeing the records containing SEQ with
4306 \series bold
4307 grab
4308 \series default
4309  (see\InsetSpace ~
4310
4311 \begin_inset LatexCommand ref
4312 reference "sub:How-to-grab"
4313
4314 \end_inset
4315
4316 ):
4317 \end_layout
4318
4319 \begin_layout LyX-Code
4320 ...
4321  | create_weight_matrix | grab --invert --keys=SEQ --keys_only
4322 \end_layout
4323
4324 \begin_layout LyX-Code
4325 | write_tab --no_stream
4326 \end_layout
4327
4328 \begin_layout Standard
4329 The V0 column will hold the residue, while the rest of the columns will
4330  hold the frequencies for each sequence position.
4331 \end_layout
4332
4333 \begin_layout Section
4334 Plotting
4335 \end_layout
4336
4337 \begin_layout Standard
4338 There exists several biopieces for plotting.
4339  Some of these are based on GNUplot
4340 \begin_inset Foot
4341 status open
4342
4343 \begin_layout Standard
4344 \begin_inset LatexCommand url
4345 target "http://www.gnuplot.info/"
4346
4347 \end_inset
4348
4349
4350 \end_layout
4351
4352 \end_inset
4353
4354 , which is an extremely powerful platform to generate all sorts of plots
4355  and even though GNUplot has quite a steep learning curve, the biopieces
4356  utilizing GNUplot are simple to use.
4357  GNUplot is able to output a lot of different formats (called terminals
4358  in GNUplot), but the biopieces focusses on three formats only:
4359 \end_layout
4360
4361 \begin_layout Enumerate
4362 The 'dumb' terminal is default to the GNUplot based biopieces and will output
4363  a plot in crude ASCII text (Fig.\InsetSpace ~
4364
4365 \begin_inset LatexCommand ref
4366 reference "fig:Dumb-terminal"
4367
4368 \end_inset
4369
4370 ).
4371  This is quite nice for a quick and dirty plot to get an overview of your
4372  data .
4373 \end_layout
4374
4375 \begin_layout Enumerate
4376 The 'post' or 'postscript' terminal output postscript code which is publication
4377  grade graphics that can be viewed with applications such as Ghostview,
4378  Photoshop, and Preview.
4379 \end_layout
4380
4381 \begin_layout Enumerate
4382 The 'svg' terminal output's scalable vector graphics (SVG) which is a vector
4383  based format.
4384  SVG is great because you can edit the resulting plot using Photoshop or
4385  Inkscape
4386 \begin_inset Foot
4387 status collapsed
4388
4389 \begin_layout Standard
4390 Inkscape is a really handy drawing program that is free and open source.
4391  Availble at
4392 \begin_inset LatexCommand htmlurl
4393 target "http://www.inkscape.org"
4394
4395 \end_inset
4396
4397
4398 \end_layout
4399
4400 \end_inset
4401
4402  if you want to add additional labels, captions, arrows, and so on and then
4403  save the result in different formats, such as postscript without loosing
4404  resolution.
4405 \end_layout
4406
4407 \begin_layout Standard
4408 The biopieces for plotting that are not based on GNUplot only output SVG
4409  (that may change in the future).
4410 \end_layout
4411
4412 \begin_layout Standard
4413 \begin_inset Float figure
4414 wide false
4415 sideways false
4416 status open
4417
4418 \begin_layout Standard
4419 \noindent
4420 \align center
4421 \begin_inset Graphics
4422         filename lendist_ascii.png
4423         lyxscale 70
4424         width 12cm
4425
4426 \end_inset
4427
4428
4429 \end_layout
4430
4431 \begin_layout Standard
4432 \begin_inset Caption
4433
4434 \begin_layout Standard
4435 \begin_inset LatexCommand label
4436 name "fig:Dumb-terminal"
4437
4438 \end_inset
4439
4440 Dumb terminal
4441 \end_layout
4442
4443 \end_inset
4444
4445
4446 \end_layout
4447
4448 \begin_layout Quote
4449 The output of a length distribution plot in the default 'dumb terminal'
4450  to the terminal window.
4451
4452 \end_layout
4453
4454 \end_inset
4455
4456
4457 \end_layout
4458
4459 \begin_layout Subsection
4460 How to plot a histogram?
4461 \begin_inset LatexCommand label
4462 name "How-to-plot-histogram"
4463
4464 \end_inset
4465
4466
4467 \end_layout
4468
4469 \begin_layout Standard
4470 A generic histogram for a given value can be plotted with the biopiece
4471 \series bold
4472 plot_histogram
4473 \series default
4474  (Fig.\InsetSpace ~
4475
4476 \begin_inset LatexCommand ref
4477 reference "fig:Histogram"
4478
4479 \end_inset
4480
4481 ):
4482 \end_layout
4483
4484 \begin_layout LyX-Code
4485 ...
4486  | plot_histogram --key=TISSUE --no_stream
4487 \end_layout
4488
4489 \begin_layout Standard
4490 (Figure missing)
4491 \end_layout
4492
4493 \begin_layout Standard
4494 \noindent
4495 \align left
4496 \begin_inset Float figure
4497 wide false
4498 sideways false
4499 status open
4500
4501 \begin_layout Standard
4502 \noindent
4503 \align center
4504 \begin_inset Graphics
4505         filename histogram.png
4506         lyxscale 70
4507         width 12cm
4508
4509 \end_inset
4510
4511
4512 \end_layout
4513
4514 \begin_layout Standard
4515 \begin_inset Caption
4516
4517 \begin_layout Standard
4518 \begin_inset LatexCommand label
4519 name "fig:Histogram"
4520
4521 \end_inset
4522
4523 Histogram
4524 \end_layout
4525
4526 \end_inset
4527
4528
4529 \end_layout
4530
4531 \end_inset
4532
4533
4534 \end_layout
4535
4536 \begin_layout Subsection
4537 How to plot a length distribution?
4538 \begin_inset LatexCommand label
4539 name "sub:How-to-plot-lendist"
4540
4541 \end_inset
4542
4543
4544 \end_layout
4545
4546 \begin_layout Standard
4547 Plotting of length distributions, weather sequence lengths, patterns lengths,
4548  hit lengths,
4549 \emph on
4550 etc.
4551
4552 \emph default
4553  is a really handy thing and can be done with the the biopiece
4554 \series bold
4555 plot_lendist
4556 \series default
4557 .
4558  If you have a file with FASTA entries and want to plot the length distribution
4559  you do it like this:
4560 \end_layout
4561
4562 \begin_layout LyX-Code
4563 read_fasta --data_in=<file> | length_seq
4564 \end_layout
4565
4566 \begin_layout LyX-Code
4567 | plot_lendist --key=SEQ_LEN --no_stream
4568 \end_layout
4569
4570 \begin_layout Standard
4571 The result will be written to the default dumb terminal and will look like
4572  Fig.\InsetSpace ~
4573
4574 \begin_inset LatexCommand ref
4575 reference "fig:Dumb-terminal"
4576
4577 \end_inset
4578
4579 .
4580 \end_layout
4581
4582 \begin_layout Standard
4583 If you instead want the result in postscript format you can do:
4584 \end_layout
4585
4586 \begin_layout LyX-Code
4587 ...
4588  | plot_lendist --key=SEQ_LEN --terminal=post --result_out=file.ps
4589 \end_layout
4590
4591 \begin_layout Standard
4592 That will generate the plot and save it to file, but not interrupt the data
4593  stream which can then be used in further analysis.
4594  You can also save the plot implicetly using '>', however, it is then important
4595  to terminate the stream with the
4596 \begin_inset ERT
4597 status collapsed
4598
4599 \begin_layout Standard
4600
4601 -
4602 \backslash
4603 /-
4604 \end_layout
4605
4606 \end_inset
4607
4608 no_stream switch:
4609 \end_layout
4610
4611 \begin_layout LyX-Code
4612 ...
4613  | plot_lendist --key=SEQ_LEN --terminal=post --no_stream > file.ps
4614 \end_layout
4615
4616 \begin_layout Standard
4617 The resulting plot can be seen in Fig.\InsetSpace ~
4618
4619 \begin_inset LatexCommand ref
4620 reference "fig:Length-distribution"
4621
4622 \end_inset
4623
4624 .
4625 \end_layout
4626
4627 \begin_layout Standard
4628 \begin_inset Float figure
4629 wide false
4630 sideways false
4631 status open
4632
4633 \begin_layout Standard
4634
4635 \end_layout
4636
4637 \begin_layout Standard
4638 \noindent
4639 \align center
4640 \begin_inset Graphics
4641         filename lendist.ps
4642         lyxscale 50
4643         width 12cm
4644
4645 \end_inset
4646
4647
4648 \end_layout
4649
4650 \begin_layout Standard
4651 \begin_inset Caption
4652
4653 \begin_layout Standard
4654 \begin_inset LatexCommand label
4655 name "fig:Length-distribution"
4656
4657 \end_inset
4658
4659 Length distribution
4660 \end_layout
4661
4662 \end_inset
4663
4664
4665 \end_layout
4666
4667 \begin_layout Quote
4668 Length distribution of 630 piRNA like RNAs.
4669 \end_layout
4670
4671 \end_inset
4672
4673
4674 \end_layout
4675
4676 \begin_layout Subsection
4677 How to plot a chromosome distribution?
4678 \begin_inset LatexCommand label
4679 name "sub:How-to-plot-chrdist"
4680
4681 \end_inset
4682
4683
4684 \end_layout
4685
4686 \begin_layout Standard
4687 If you have the result of a sequence search against a multi chromosome genome,
4688  it is very practical to be able to plot the distribution of search hits
4689  on the different chromosomes.
4690  This can be done with
4691 \series bold
4692 plot_chrdist
4693 \series default
4694 :
4695 \end_layout
4696
4697 \begin_layout LyX-Code
4698 read_fasta --data_in=<file> | blat_genome | plot_chrdist --no_stream
4699 \end_layout
4700
4701 \begin_layout Standard
4702 The above example will result in a crude plot using the 'dumb' terminal,
4703  and if you want to mess around with the results from the BLAT search you
4704  probably want to save the result to file first (see\InsetSpace ~
4705
4706 \begin_inset LatexCommand ref
4707 reference "sub:How-to-write-PSL"
4708
4709 \end_inset
4710
4711 ).
4712  To plot the chromosome distribution from the saved search result you can
4713  do:
4714 \end_layout
4715
4716 \begin_layout LyX-Code
4717 read_bed --data_in=file.bed | plot_chrdist --terminal=post --result_out=plot.ps
4718 \end_layout
4719
4720 \begin_layout Standard
4721 That will result in the output show in Fig.\InsetSpace ~
4722
4723 \begin_inset LatexCommand ref
4724 reference "fig:Chromosome-distribution"
4725
4726 \end_inset
4727
4728 .
4729 \end_layout
4730
4731 \begin_layout Standard
4732 \begin_inset Float figure
4733 wide false
4734 sideways false
4735 status open
4736
4737 \begin_layout Standard
4738
4739 \end_layout
4740
4741 \begin_layout Standard
4742 \noindent
4743 \align center
4744 \begin_inset Graphics
4745         filename chrdist.ps
4746         lyxscale 50
4747         width 12cm
4748         rotateAngle 90
4749
4750 \end_inset
4751
4752
4753 \end_layout
4754
4755 \begin_layout Standard
4756 \begin_inset Caption
4757
4758 \begin_layout Standard
4759 \begin_inset LatexCommand label
4760 name "fig:Chromosome-distribution"
4761
4762 \end_inset
4763
4764 Chromosome distribution
4765 \end_layout
4766
4767 \end_inset
4768
4769
4770 \end_layout
4771
4772 \end_inset
4773
4774
4775 \end_layout
4776
4777 \begin_layout Subsection
4778 How to generate a dotplot?
4779 \begin_inset LatexCommand label
4780 name "sub:How-to-generate-dotplot"
4781
4782 \end_inset
4783
4784
4785 \end_layout
4786
4787 \begin_layout Standard
4788 A dotplot is a powerful way to get an overview of the size and location
4789  of sequence insertions, deletions, and duplications between two sequences.
4790  Generating a dotplot with biopieces is a two step process where you initially
4791  find all matches between two sequences using the tool
4792 \series bold
4793 match_seq
4794 \series default
4795  (see\InsetSpace ~
4796
4797 \begin_inset LatexCommand ref
4798 reference "sub:How-to-find-matches"
4799
4800 \end_inset
4801
4802 ) and plot the resulting matches with
4803 \series bold
4804 plot_matches
4805 \series default
4806 .
4807  Matching and plotting two
4808 \emph on
4809 Helicobacter pylori
4810 \emph default
4811  genomes (1.7Mbp) takes around 10 seconds:
4812 \end_layout
4813
4814 \begin_layout LyX-Code
4815 ...
4816  | match_seq | plot_matches --terminal=post --result_out=plot.ps
4817 \end_layout
4818
4819 \begin_layout Standard
4820 The resulting dotplot is in Fig.\InsetSpace ~
4821
4822 \begin_inset LatexCommand ref
4823 reference "fig:Dotplot"
4824
4825 \end_inset
4826
4827 .
4828 \end_layout
4829
4830 \begin_layout Standard
4831 \begin_inset Float figure
4832 wide false
4833 sideways false
4834 status open
4835
4836 \begin_layout Standard
4837 \noindent
4838 \align center
4839 \begin_inset Graphics
4840         filename dotplot.ps
4841         lyxscale 50
4842         width 12cm
4843
4844 \end_inset
4845
4846
4847 \end_layout
4848
4849 \begin_layout Standard
4850 \begin_inset Caption
4851
4852 \begin_layout Standard
4853 \begin_inset LatexCommand label
4854 name "fig:Dotplot"
4855
4856 \end_inset
4857
4858 Dotplot
4859 \end_layout
4860
4861 \end_inset
4862
4863
4864 \end_layout
4865
4866 \begin_layout Quote
4867 Forward matches are displayed in green while reverse matches are displayed
4868  in red.
4869 \end_layout
4870
4871 \end_inset
4872
4873
4874 \end_layout
4875
4876 \begin_layout Subsection
4877 How to plot a sequence logo?
4878 \end_layout
4879
4880 \begin_layout Standard
4881 Sequence logos can be generate with
4882 \series bold
4883 plot_seqlogo
4884 \series default
4885 .
4886  The sequnce type is determined automagically and an entropy scale of 2
4887  bits and 4 bits is used for nucleotide and peptide sequences, respectively
4888 \begin_inset Foot
4889 status collapsed
4890
4891 \begin_layout Standard
4892 \begin_inset LatexCommand htmlurl
4893 target "http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html"
4894
4895 \end_inset
4896
4897
4898 \end_layout
4899
4900 \end_inset
4901
4902 .
4903 \end_layout
4904
4905 \begin_layout LyX-Code
4906 ...
4907  | plot_seqlogo --no_stream --result_out=seqlogo.svg
4908 \end_layout
4909
4910 \begin_layout Standard
4911 An example of a sequence logo can be seen in Fig.\InsetSpace ~
4912
4913 \begin_inset LatexCommand ref
4914 reference "fig:Sequence-logo"
4915
4916 \end_inset
4917
4918 .
4919 \end_layout
4920
4921 \begin_layout Standard
4922 \begin_inset Float figure
4923 wide false
4924 sideways false
4925 status open
4926
4927 \begin_layout Standard
4928 \noindent
4929 \align center
4930 \begin_inset Graphics
4931         filename seqlogo.png
4932         lyxscale 50
4933         width 12cm
4934
4935 \end_inset
4936
4937
4938 \end_layout
4939
4940 \begin_layout Standard
4941 \begin_inset Caption
4942
4943 \begin_layout Standard
4944 \begin_inset LatexCommand label
4945 name "fig:Sequence-logo"
4946
4947 \end_inset
4948
4949 Sequence logo
4950 \end_layout
4951
4952 \end_inset
4953
4954
4955 \end_layout
4956
4957 \end_inset
4958
4959
4960 \end_layout
4961
4962 \begin_layout Subsection
4963 How to plot a karyogram?
4964 \begin_inset LatexCommand label
4965 name "sub:How-to-plot-karyogram"
4966
4967 \end_inset
4968
4969
4970 \end_layout
4971
4972 \begin_layout Standard
4973 To plot search hits on genomes use
4974 \series bold
4975 plot_karyogram
4976 \series default
4977 , which will output a nice karyogram in SVG graphics:
4978 \end_layout
4979
4980 \begin_layout LyX-Code
4981 ...
4982  | plot_karyogram --result_out=karyogram.svg
4983 \end_layout
4984
4985 \begin_layout Standard
4986 The banding data is taken from the UCSC genome browser database and currently
4987  only Human and Mouse is supported.
4988  Fig.\InsetSpace ~
4989
4990 \begin_inset LatexCommand ref
4991 reference "fig:Karyogram"
4992
4993 \end_inset
4994
4995  shows the distribution of piRNA like RNAs matched to the Human genome.
4996 \end_layout
4997
4998 \begin_layout Standard
4999 \begin_inset Float figure
5000 wide false
5001 sideways false
5002 status open
5003
5004 \begin_layout Standard
5005 \noindent
5006 \align center
5007 \begin_inset Graphics
5008         filename karyogram.png
5009         lyxscale 35
5010         width 12cm
5011
5012 \end_inset
5013
5014
5015 \end_layout
5016
5017 \begin_layout Standard
5018 \begin_inset Caption
5019
5020 \begin_layout Standard
5021 \begin_inset LatexCommand label
5022 name "fig:Karyogram"
5023
5024 \end_inset
5025
5026 Karyogram
5027 \end_layout
5028
5029 \end_inset
5030
5031
5032 \end_layout
5033
5034 \begin_layout Quote
5035 Hits from a search of piRNA like RNAs in the Human genome is displayed as
5036  short horizontal bars.
5037 \end_layout
5038
5039 \end_inset
5040
5041
5042 \end_layout
5043
5044 \begin_layout Section
5045 Uploading Results
5046 \end_layout
5047
5048 \begin_layout Subsection
5049 How do I display my results in the UCSC Genome Browser?
5050 \end_layout
5051
5052 \begin_layout Standard
5053 Results from the list of biopieces below can be uploaded directly to a local
5054  mirror of the UCSC Genome Browser using the biopiece
5055 \series bold
5056 upload_to_ucsc
5057 \series default
5058 :
5059 \end_layout
5060
5061 \begin_layout Itemize
5062 patscan_seq
5063 \begin_inset LatexCommand eqref
5064 reference "sub:How-to-use-patscan"
5065
5066 \end_inset
5067
5068
5069 \end_layout
5070
5071 \begin_layout Itemize
5072 blat_seq
5073 \begin_inset LatexCommand eqref
5074 reference "sub:How-to-use-BLAT"
5075
5076 \end_inset
5077
5078
5079 \end_layout
5080
5081 \begin_layout Itemize
5082 blast_seq
5083 \begin_inset LatexCommand eqref
5084 reference "sub:How-to-use-BLAST"
5085
5086 \end_inset
5087
5088
5089 \end_layout
5090
5091 \begin_layout Itemize
5092 vmatch_seq
5093 \begin_inset LatexCommand eqref
5094 reference "sub:How-to-use-Vmatch"
5095
5096 \end_inset
5097
5098
5099 \end_layout
5100
5101 \begin_layout Standard
5102 The syntax for uploading data the most simple way requires two mandatory
5103  switches:
5104 \begin_inset ERT
5105 status collapsed
5106
5107 \begin_layout Standard
5108
5109 -
5110 \backslash
5111 /-
5112 \end_layout
5113
5114 \end_inset
5115
5116 database, which is the UCSC database name (such as hg18, mm9, etc.) and
5117 \begin_inset ERT
5118 status collapsed
5119
5120 \begin_layout Standard
5121
5122 -
5123 \backslash
5124 /-
5125 \end_layout
5126
5127 \end_inset
5128
5129 table which should be the users initials followed by an underscore and a
5130  short description of the data:
5131 \end_layout
5132
5133 \begin_layout LyX-Code
5134 ...
5135  | upload_to_ucsc --database=hg18 --table=mah_snoRNAs
5136 \end_layout
5137
5138 \begin_layout Standard
5139 The
5140 \series bold
5141 upload_to_ucsc
5142 \series default
5143  biopiece modifies the users ~/ucsc/my_tracks.ra file automagically (a backup
5144  is created with the name ~/ucsc/my_tracks.ra~) with default values that
5145  can be overridden using the following switches:
5146 \end_layout
5147
5148 \begin_layout Itemize
5149 \begin_inset ERT
5150 status collapsed
5151
5152 \begin_layout Standard
5153
5154 -
5155 \backslash
5156 /-
5157 \end_layout
5158
5159 \end_inset
5160
5161 short_label - Short label for track - Default=database->table
5162 \end_layout
5163
5164 \begin_layout Itemize
5165 \begin_inset ERT
5166 status collapsed
5167
5168 \begin_layout Standard
5169
5170 -
5171 \backslash
5172 /-
5173 \end_layout
5174
5175 \end_inset
5176
5177 long_label - Long label for track - Default=database->table
5178 \end_layout
5179
5180 \begin_layout Itemize
5181 \begin_inset ERT
5182 status collapsed
5183
5184 \begin_layout Standard
5185
5186 -
5187 \backslash
5188 /-
5189 \end_layout
5190
5191 \end_inset
5192
5193 group - Track group name - Default=<user name as defined in env>
5194 \end_layout
5195
5196 \begin_layout Itemize
5197 \begin_inset ERT
5198 status collapsed
5199
5200 \begin_layout Standard
5201
5202 -
5203 \backslash
5204 /-
5205 \end_layout
5206
5207 \end_inset
5208
5209 priority - Track display priority - Default=1
5210 \end_layout
5211
5212 \begin_layout Itemize
5213 \begin_inset ERT
5214 status collapsed
5215
5216 \begin_layout Standard
5217
5218 -
5219 \backslash
5220 /-
5221 \end_layout
5222
5223 \end_inset
5224
5225 color - Track color - Default=147,73,42
5226 \end_layout
5227
5228 \begin_layout Itemize
5229 \begin_inset ERT
5230 status collapsed
5231
5232 \begin_layout Standard
5233
5234 -
5235 \backslash
5236 /-
5237 \end_layout
5238
5239 \end_inset
5240
5241 chunk_size - Chunks for loading - Default=10000000
5242 \end_layout
5243
5244 \begin_layout Itemize
5245 \begin_inset ERT
5246 status collapsed
5247
5248 \begin_layout Standard
5249
5250 -
5251 \backslash
5252 /-
5253 \end_layout
5254
5255 \end_inset
5256
5257 visibility - Track visibility - Default=pack
5258 \end_layout
5259
5260 \begin_layout Standard
5261 Also, data in BED or PSL format can be uploaded with
5262 \series bold
5263 upload_to_ucsc
5264 \series default
5265  as long as these reference to genomes and chromosomes existing in the UCSC
5266  Genome Browser:
5267 \end_layout
5268
5269 \begin_layout LyX-Code
5270 read_bed --data_in=<bed file> | upload_to_ucsc ...
5271 \end_layout
5272
5273 \begin_layout LyX-Code
5274
5275 \end_layout
5276
5277 \begin_layout LyX-Code
5278 read_psl --data_in=<psl file> | upload_to_ucsc ...
5279 \end_layout
5280
5281 \begin_layout Section
5282 Power Scripting
5283 \end_layout
5284
5285 \begin_layout Standard
5286 It is possible to do commandline scripting of biopiece records using Perl.
5287  Because a biopiece record essentially is a hash structure, you can pass
5288  records to
5289 \series bold
5290 bioscript
5291 \series default
5292  command, which is a wrapper around the Perl executable that allows direct
5293  manipulations of the records using the power of Perl.
5294 \end_layout
5295
5296 \begin_layout Standard
5297 In the below example we replace in all records the value to the CHR key
5298  with a forthrunning number:
5299 \end_layout
5300
5301 \begin_layout LyX-Code
5302 ...
5303  | bioscript 'while($r=get_record(
5304 \backslash
5305 *STDIN)){$r->{CHR}=$i++; put_record($r)}'
5306 \end_layout
5307
5308 \begin_layout Standard
5309 Something more useful would probably be to create custom FASTA headers.
5310  E.g.
5311  if we read in a BED file, lookup the genomic sequence, create a custom
5312  FASTA header with
5313 \series bold
5314 bioscript
5315 \series default
5316  and output FASTA entries:
5317 \end_layout
5318
5319 \begin_layout LyX-Code
5320 ...
5321  | bioscript 'while($r=get_record(
5322 \backslash
5323 *STDIN)){$r->{SEQ_NAME}= //
5324 \end_layout
5325
5326 \begin_layout LyX-Code
5327 join("_",$r->{CHR},$r->{CHR_BEG},$r->{CHR_END}); put_record($r)}'
5328 \end_layout
5329
5330 \begin_layout Standard
5331 And the output:
5332 \end_layout
5333
5334 \begin_layout LyX-Code
5335 >chr2L_21567527_21567550
5336 \end_layout
5337
5338 \begin_layout LyX-Code
5339 taccaaacggatgcctcagacatc
5340 \end_layout
5341
5342 \begin_layout LyX-Code
5343 >chr2L_693380_693403
5344 \end_layout
5345
5346 \begin_layout LyX-Code
5347 taccaaacggatgcctcagacatc
5348 \end_layout
5349
5350 \begin_layout LyX-Code
5351 >chr2L_13859534_13859557
5352 \end_layout
5353
5354 \begin_layout LyX-Code
5355 taccaaacggatgcctcagacatc
5356 \end_layout
5357
5358 \begin_layout LyX-Code
5359 >chr2L_9005090_9005113
5360 \end_layout
5361
5362 \begin_layout LyX-Code
5363 taccaaacggatgcctcagacatc
5364 \end_layout
5365
5366 \begin_layout LyX-Code
5367 >chr2L_2106825_2106848
5368 \end_layout
5369
5370 \begin_layout LyX-Code
5371 taccaaacggatgcctcagacatc
5372 \end_layout
5373
5374 \begin_layout LyX-Code
5375 >chr2L_14649031_14649054
5376 \end_layout
5377
5378 \begin_layout LyX-Code
5379 taccaaacggatgcctcagacatc
5380 \end_layout
5381
5382 \begin_layout Section
5383 Trouble shooting
5384 \end_layout
5385
5386 \begin_layout Standard
5387 Shoot the messenger!
5388 \end_layout
5389
5390 \begin_layout Section
5391 \start_of_appendix
5392 Keys
5393 \begin_inset LatexCommand label
5394 name "sec:Keys"
5395
5396 \end_inset
5397
5398
5399 \end_layout
5400
5401 \begin_layout Standard
5402 HIT
5403 \end_layout
5404
5405 \begin_layout Standard
5406 HIT_BEG
5407 \end_layout
5408
5409 \begin_layout Standard
5410 HIT_END
5411 \end_layout
5412
5413 \begin_layout Standard
5414 HIT_LEN
5415 \end_layout
5416
5417 \begin_layout Standard
5418 HIT_NAME
5419 \end_layout
5420
5421 \begin_layout Standard
5422 PATTERN
5423 \end_layout
5424
5425 \begin_layout Section
5426 Switches
5427 \begin_inset LatexCommand label
5428 name "sec:Switches"
5429
5430 \end_inset
5431
5432
5433 \end_layout
5434
5435 \begin_layout Standard
5436 \begin_inset ERT
5437 status collapsed
5438
5439 \begin_layout Standard
5440
5441 -
5442 \backslash
5443 /-
5444 \end_layout
5445
5446 \end_inset
5447
5448 stream_in
5449 \end_layout
5450
5451 \begin_layout Standard
5452 \begin_inset ERT
5453 status collapsed
5454
5455 \begin_layout Standard
5456
5457 -
5458 \backslash
5459 /-
5460 \end_layout
5461
5462 \end_inset
5463
5464 stream_out
5465 \end_layout
5466
5467 \begin_layout Standard
5468 \begin_inset ERT
5469 status collapsed
5470
5471 \begin_layout Standard
5472
5473 -
5474 \backslash
5475 /-
5476 \end_layout
5477
5478 \end_inset
5479
5480 no_stream
5481 \end_layout
5482
5483 \begin_layout Standard
5484 \begin_inset ERT
5485 status collapsed
5486
5487 \begin_layout Standard
5488
5489 -
5490 \backslash
5491 /-
5492 \end_layout
5493
5494 \end_inset
5495
5496 data_in
5497 \end_layout
5498
5499 \begin_layout Standard
5500 \begin_inset ERT
5501 status collapsed
5502
5503 \begin_layout Standard
5504
5505 -
5506 \backslash
5507 /-
5508 \end_layout
5509
5510 \end_inset
5511
5512 result_out
5513 \end_layout
5514
5515 \begin_layout Standard
5516 \begin_inset ERT
5517 status collapsed
5518
5519 \begin_layout Standard
5520
5521 -
5522 \backslash
5523 /-
5524 \end_layout
5525
5526 \end_inset
5527
5528 num
5529 \end_layout
5530
5531 \begin_layout Section
5532 scan_for_matches README
5533 \begin_inset LatexCommand label
5534 name "sec:scan_for_matches-README"
5535
5536 \end_inset
5537
5538
5539 \end_layout
5540
5541 \begin_layout LyX-Code
5542                           scan_for_matches:
5543 \end_layout
5544
5545 \begin_layout LyX-Code
5546     A Program to Scan Nucleotide or Protein Sequences for Matching Patterns
5547 \end_layout
5548
5549 \begin_layout LyX-Code
5550                         Ross Overbeek
5551 \end_layout
5552
5553 \begin_layout LyX-Code
5554                         MCS
5555 \end_layout
5556
5557 \begin_layout LyX-Code
5558                         Argonne National Laboratory
5559 \end_layout
5560
5561 \begin_layout LyX-Code
5562                         Argonne, IL 60439
5563 \end_layout
5564
5565 \begin_layout LyX-Code
5566                         USA
5567 \end_layout
5568
5569 \begin_layout LyX-Code
5570 Scan_for_matches is a utility that we have written to search for
5571 \end_layout
5572
5573 \begin_layout LyX-Code
5574 patterns in DNA and protein sequences.
5575   I wrote most of the code,
5576 \end_layout
5577
5578 \begin_layout LyX-Code
5579 although David Joerg and Morgan Price wrote sections of an
5580 \end_layout
5581
5582 \begin_layout LyX-Code
5583 earlier version.
5584   The whole notion of pattern matching has a rich
5585 \end_layout
5586
5587 \begin_layout LyX-Code
5588 history, and we borrowed liberally from many sources.
5589   However, it is
5590 \end_layout
5591
5592 \begin_layout LyX-Code
5593 worth noting that we were strongly influenced by the elegant tools
5594 \end_layout
5595
5596 \begin_layout LyX-Code
5597 developed and distributed by David Searls.
5598   My intent is to make the
5599 \end_layout
5600
5601 \begin_layout LyX-Code
5602 existing tool available to anyone in the research community that might
5603 \end_layout
5604
5605 \begin_layout LyX-Code
5606 find it useful.
5607   I will continue to try to fix bugs and make suggested
5608 \end_layout
5609
5610 \begin_layout LyX-Code
5611 enhancements, at least until I feel that a superior tool exists.
5612 \end_layout
5613
5614 \begin_layout LyX-Code
5615 Hence, I would appreciate it if all bug reports and suggestions are
5616 \end_layout
5617
5618 \begin_layout LyX-Code
5619 directed to me at Overbeek@mcs.anl.gov.
5620
5621 \end_layout
5622
5623 \begin_layout LyX-Code
5624 I will try to log all bug fixes and report them to users that send me
5625 \end_layout
5626
5627 \begin_layout LyX-Code
5628 their email addresses.
5629   I do not require that you give me your name
5630 \end_layout
5631
5632 \begin_layout LyX-Code
5633 and address.
5634   However, if you do give it to me, I will try to notify
5635 \end_layout
5636
5637 \begin_layout LyX-Code
5638 you of serious problems as they are discovered.
5639 \end_layout
5640
5641 \begin_layout LyX-Code
5642 Getting Started:
5643 \end_layout
5644
5645 \begin_layout LyX-Code
5646     The distribution should contain at least the following programs:
5647 \end_layout
5648
5649 \begin_layout LyX-Code
5650                 README                  -     This document
5651 \end_layout
5652
5653 \begin_layout LyX-Code
5654                 ggpunit.c               -     One of the two source files
5655 \end_layout
5656
5657 \begin_layout LyX-Code
5658                 scan_for_matches.c      -     The second source file
5659 \end_layout
5660
5661 \begin_layout LyX-Code
5662
5663 \end_layout
5664
5665 \begin_layout LyX-Code
5666                 run_tests               -     A perl script to test things
5667 \end_layout
5668
5669 \begin_layout LyX-Code
5670                 show_hits               -     A handy perl script
5671 \end_layout
5672
5673 \begin_layout LyX-Code
5674                 test_dna_input          -     Test sequences for DNA
5675 \end_layout
5676
5677 \begin_layout LyX-Code
5678                 test_dna_patterns       -     Test patterns for DNA scan
5679 \end_layout
5680
5681 \begin_layout LyX-Code
5682                 test_output             -     Desired output from test
5683 \end_layout
5684
5685 \begin_layout LyX-Code
5686                 test_prot_input         -     Test protein sequences
5687 \end_layout
5688
5689 \begin_layout LyX-Code
5690                 test_prot_patterns      -     Test patterns for proteins
5691 \end_layout
5692
5693 \begin_layout LyX-Code
5694                 testit                  -     a perl script used for test
5695 \end_layout
5696
5697 \begin_layout LyX-Code
5698     Only the first three files are required.
5699   The others are useful,
5700 \end_layout
5701
5702 \begin_layout LyX-Code
5703     but only if you have Perl installed on your system.
5704   If you do
5705 \end_layout
5706
5707 \begin_layout LyX-Code
5708     have Perl, I suggest that you type
5709 \end_layout
5710
5711 \begin_layout LyX-Code
5712
5713 \end_layout
5714
5715 \begin_layout LyX-Code
5716                 which perl
5717 \end_layout
5718
5719 \begin_layout LyX-Code
5720     to find out where it installed.
5721   On my system, I get the following
5722 \end_layout
5723
5724 \begin_layout LyX-Code
5725     response:
5726 \end_layout
5727
5728 \begin_layout LyX-Code
5729
5730 \end_layout
5731
5732 \begin_layout LyX-Code
5733                 clone% which perl
5734 \end_layout
5735
5736 \begin_layout LyX-Code
5737                 /usr/local/bin/perl
5738 \end_layout
5739
5740 \begin_layout LyX-Code
5741     indicating that Perl is installed in /usr/local/bin.
5742   Anyway, once
5743 \end_layout
5744
5745 \begin_layout LyX-Code
5746     you know where it is installed, edit the first line of files
5747 \end_layout
5748
5749 \begin_layout LyX-Code
5750         testit
5751 \end_layout
5752
5753 \begin_layout LyX-Code
5754         show_hits
5755 \end_layout
5756
5757 \begin_layout LyX-Code
5758     replacing /usr/local/bin/perl with the appropriate location.
5759   I
5760 \end_layout
5761
5762 \begin_layout LyX-Code
5763     will assume that you can do this, although it is not critical (it
5764 \end_layout
5765
5766 \begin_layout LyX-Code
5767     is needed only to test the installation and to use the "show_hits"
5768 \end_layout
5769
5770 \begin_layout LyX-Code
5771     utility).
5772   Perl is not required to actually install and run
5773 \end_layout
5774
5775 \begin_layout LyX-Code
5776     scan_for_matches.
5777
5778 \end_layout
5779
5780 \begin_layout LyX-Code
5781     If you do not have Perl, I suggest you get it and install it (it
5782 \end_layout
5783
5784 \begin_layout LyX-Code
5785     is a wonderful utility).
5786   Information about Perl and how to get it
5787 \end_layout
5788
5789 \begin_layout LyX-Code
5790     can be found in the book "Programming Perl" by Larry Wall and
5791 \end_layout
5792
5793 \begin_layout LyX-Code
5794     Randall L.
5795  Schwartz, published by O'Reilly & Associates, Inc.
5796 \end_layout
5797
5798 \begin_layout LyX-Code
5799     To get started, you will need to compile the program.
5800    I do this
5801 \end_layout
5802
5803 \begin_layout LyX-Code
5804     using
5805 \end_layout
5806
5807 \begin_layout LyX-Code
5808         gcc -O -o scan_for_matches  ggpunit.c scan_for_matches.c
5809 \end_layout
5810
5811 \begin_layout LyX-Code
5812     If you do not use GNU C, use
5813 \end_layout
5814
5815 \begin_layout LyX-Code
5816         cc -O -DCC -o scan_for_matches  ggpunit.c scan_for_matches.c
5817 \end_layout
5818
5819 \begin_layout LyX-Code
5820     which works on my Sun.
5821
5822 \end_layout
5823
5824 \begin_layout LyX-Code
5825     Once you have compiled scan_for_matches, you can verify that it
5826 \end_layout
5827
5828 \begin_layout LyX-Code
5829     works with
5830 \end_layout
5831
5832 \begin_layout LyX-Code
5833         clone% run_tests tmp
5834 \end_layout
5835
5836 \begin_layout LyX-Code
5837         clone% diff tmp test_output
5838 \end_layout
5839
5840 \begin_layout LyX-Code
5841     You may get a few strange lines of the sort
5842 \end_layout
5843
5844 \begin_layout LyX-Code
5845         clone% run_tests tmp
5846 \end_layout
5847
5848 \begin_layout LyX-Code
5849         rm: tmp: No such file or directory
5850 \end_layout
5851
5852 \begin_layout LyX-Code
5853         clone% diff tmp test_output
5854 \end_layout
5855
5856 \begin_layout LyX-Code
5857     These should cause no concern.
5858   However, if the "diff" shows that
5859 \end_layout
5860
5861 \begin_layout LyX-Code
5862     tmp and test_output are different, contact me (you have a
5863 \end_layout
5864
5865 \begin_layout LyX-Code
5866     problem).
5867
5868 \end_layout
5869
5870 \begin_layout LyX-Code
5871     You should now be able to use scan_for_matches by following the
5872 \end_layout
5873
5874 \begin_layout LyX-Code
5875     instructions given below (which is all the normal user should have
5876 \end_layout
5877
5878 \begin_layout LyX-Code
5879     to understand, once things are installed properly).
5880 \end_layout
5881
5882 \begin_layout LyX-Code
5883  ==============================================================
5884 \end_layout
5885
5886 \begin_layout LyX-Code
5887 How to run scan_for_matches:
5888 \end_layout
5889
5890 \begin_layout LyX-Code
5891     To run the program, you type need to create two files
5892 \end_layout
5893
5894 \begin_layout LyX-Code
5895     1.
5896   the first file contains the pattern you wish to scan for; I'll
5897 \end_layout
5898
5899 \begin_layout LyX-Code
5900         call this file pat_file in what follows (but any name is ok)
5901 \end_layout
5902
5903 \begin_layout LyX-Code
5904     2.
5905   the second file contains a set of sequences to scan.
5906   These
5907 \end_layout
5908
5909 \begin_layout LyX-Code
5910         should be in "fasta format".
5911   Just look at the contents of
5912 \end_layout
5913
5914 \begin_layout LyX-Code
5915         test_dna_input to see examples of this format.
5916   Basically,
5917 \end_layout
5918
5919 \begin_layout LyX-Code
5920         each sequence begins with a line of the form
5921 \end_layout
5922
5923 \begin_layout LyX-Code
5924            >sequence_id
5925 \end_layout
5926
5927 \begin_layout LyX-Code
5928         and is followed by one or more lines containing the sequence.
5929 \end_layout
5930
5931 \begin_layout LyX-Code
5932     Once these files have been created, you just use
5933 \end_layout
5934
5935 \begin_layout LyX-Code
5936         scan_for_matches pat_file < input_file
5937 \end_layout
5938
5939 \begin_layout LyX-Code
5940     to scan all of the input sequences for the given pattern.
5941   As an
5942 \end_layout
5943
5944 \begin_layout LyX-Code
5945     example, suppose that pat_file contains a single line of the form
5946 \end_layout
5947
5948 \begin_layout LyX-Code
5949                 p1=4...7 3...8 ~p1
5950 \end_layout
5951
5952 \begin_layout LyX-Code
5953     Then,
5954 \end_layout
5955
5956 \begin_layout LyX-Code
5957                 scan_for_matches pat_file < test_dna_input
5958 \end_layout
5959
5960 \begin_layout LyX-Code
5961     should produce two "hits".
5962   When I run this on my machine, I get
5963 \end_layout
5964
5965 \begin_layout LyX-Code
5966         clone% scan_for_matches pat_file < test_dna_input
5967 \end_layout
5968
5969 \begin_layout LyX-Code
5970         >tst1:[6,27]
5971 \end_layout
5972
5973 \begin_layout LyX-Code
5974         cguaacc ggttaacc gguuacg
5975 \end_layout
5976
5977 \begin_layout LyX-Code
5978         >tst2:[6,27]
5979 \end_layout
5980
5981 \begin_layout LyX-Code
5982         CGUAACC GGTTAACC GGUUACG
5983 \end_layout
5984
5985 \begin_layout LyX-Code
5986         clone%
5987 \end_layout
5988
5989 \begin_layout LyX-Code
5990 Simple Patterns Built by Matching Ranges and Reverse Complements
5991 \end_layout
5992
5993 \begin_layout LyX-Code
5994     Let me first explain this simple pattern:
5995 \end_layout
5996
5997 \begin_layout LyX-Code
5998
5999 \end_layout
6000
6001 \begin_layout LyX-Code
6002                 p1=4...7 3...8 ~p1
6003 \end_layout
6004
6005 \begin_layout LyX-Code
6006     The pattern consists of three "pattern units" separated by spaces.
6007 \end_layout
6008
6009 \begin_layout LyX-Code
6010     The first pattern unit is
6011 \end_layout
6012
6013 \begin_layout LyX-Code
6014                 p1=4...7
6015 \end_layout
6016
6017 \begin_layout LyX-Code
6018     which means "match 4 to 7 characters and call them p1".
6019   The
6020 \end_layout
6021
6022 \begin_layout LyX-Code
6023     second pattern unit is
6024 \end_layout
6025
6026 \begin_layout LyX-Code
6027                 3...8
6028 \end_layout
6029
6030 \begin_layout LyX-Code
6031     which means "then match 3 to 8 characters".
6032   The last pattern unit
6033 \end_layout
6034
6035 \begin_layout LyX-Code
6036     is
6037 \end_layout
6038
6039 \begin_layout LyX-Code
6040                 ~p1
6041 \end_layout
6042
6043 \begin_layout LyX-Code
6044     which means "match the reverse complement of p1".
6045   The first
6046 \end_layout
6047
6048 \begin_layout LyX-Code
6049     reported hit is shown as
6050 \end_layout
6051
6052 \begin_layout LyX-Code
6053         >tst1:[6,27]
6054 \end_layout
6055
6056 \begin_layout LyX-Code
6057         cguaacc ggttaacc gguuacg
6058 \end_layout
6059
6060 \begin_layout LyX-Code
6061     which states that characters 6 through 27 of sequence tst1 were
6062 \end_layout
6063
6064 \begin_layout LyX-Code
6065     matched.
6066   "cguaac" matched the first pattern unit, "ggttaacc" the
6067 \end_layout
6068
6069 \begin_layout LyX-Code
6070     second, and "gguuacg" the third.
6071   This is an example of a common
6072 \end_layout
6073
6074 \begin_layout LyX-Code
6075     type of pattern used to search for sections of DNA or RNA that
6076 \end_layout
6077
6078 \begin_layout LyX-Code
6079     would fold into a hairpin loop.
6080 \end_layout
6081
6082 \begin_layout LyX-Code
6083 Searching Both Strands
6084 \end_layout
6085
6086 \begin_layout LyX-Code
6087     Now for a short aside: scan_for_matches only searched the
6088 \end_layout
6089
6090 \begin_layout LyX-Code
6091     sequences in the input file; it did not search the opposite
6092 \end_layout
6093
6094 \begin_layout LyX-Code
6095     strand.
6096   With a pattern of the sort we just used, there is not
6097 \end_layout
6098
6099 \begin_layout LyX-Code
6100     need o search the opposite strand.
6101   However, it is normally the
6102 \end_layout
6103
6104 \begin_layout LyX-Code
6105     case that you will wish to search both the sequence and the
6106 \end_layout
6107
6108 \begin_layout LyX-Code
6109     opposite strand (i.e., the reverse complement of the sequence).
6110 \end_layout
6111
6112 \begin_layout LyX-Code
6113     To do that, you would just use the "-c" command line.
6114   For example,
6115 \end_layout
6116
6117 \begin_layout LyX-Code
6118         scan_for_matches -c pat_file < test_dna_input
6119 \end_layout
6120
6121 \begin_layout LyX-Code
6122     Hits on the opposite strand will show a beginning location greater
6123 \end_layout
6124
6125 \begin_layout LyX-Code
6126     than te end location of the match.
6127 \end_layout
6128
6129 \begin_layout LyX-Code
6130 Defining Pairing Rules and Allowing Mismatches, Insertions, and Deletions
6131 \end_layout
6132
6133 \begin_layout LyX-Code
6134     Let us stop now and ask "What additional features would one need to
6135 \end_layout
6136
6137 \begin_layout LyX-Code
6138     really find the kinds of loop structures that characterize tRNAs,
6139 \end_layout
6140
6141 \begin_layout LyX-Code
6142     rRNAs, and so forth?"  I can immediately think of two:
6143 \end_layout
6144
6145 \begin_layout LyX-Code
6146         a) you will need to be able to allow non-standard pairings
6147 \end_layout
6148
6149 \begin_layout LyX-Code
6150            (those other than G-C and A-U), and
6151 \end_layout
6152
6153 \begin_layout LyX-Code
6154         b) you will need to be able to tolerate some number of
6155 \end_layout
6156
6157 \begin_layout LyX-Code
6158            mismatches and bulges.
6159 \end_layout
6160
6161 \begin_layout LyX-Code
6162
6163 \end_layout
6164
6165 \begin_layout LyX-Code
6166     Let me first show you how to handle non-standard "rules for
6167 \end_layout
6168
6169 \begin_layout LyX-Code
6170     pairing in reverse complements".
6171   Consider the following pattern,
6172 \end_layout
6173
6174 \begin_layout LyX-Code
6175     which I show as two line (you may use as many lines as you like in
6176 \end_layout
6177
6178 \begin_layout LyX-Code
6179     forming a pattern, although you can only break a pattern at points
6180 \end_layout
6181
6182 \begin_layout LyX-Code
6183     where space would be legal):
6184 \end_layout
6185
6186 \begin_layout LyX-Code
6187             r1={au,ua,gc,cg,gu,ug,ga,ag}
6188 \end_layout
6189
6190 \begin_layout LyX-Code
6191             p1=2...3 0...4 p2=2...5 1...5 r1~p2 0...4 ~p1
6192 \end_layout
6193
6194 \begin_layout LyX-Code
6195     The first "pattern unit" does not actually match anything; rather,
6196 \end_layout
6197
6198 \begin_layout LyX-Code
6199     it defines a "pairing rule" in which standard pairings are
6200 \end_layout
6201
6202 \begin_layout LyX-Code
6203     allowed, as well as G-A and A-G (in case you wondered, Us and Ts
6204 \end_layout
6205
6206 \begin_layout LyX-Code
6207     and upper and lower case can be used interchangably; for example
6208 \end_layout
6209
6210 \begin_layout LyX-Code
6211     r1={AT,UA,gc,cg} could be used to define the "standard rule" for
6212 \end_layout
6213
6214 \begin_layout LyX-Code
6215     pairings).
6216   The second line consists of six pattern units which
6217 \end_layout
6218
6219 \begin_layout LyX-Code
6220     may be interpreted as follows:
6221 \end_layout
6222
6223 \begin_layout LyX-Code
6224             p1=2...3     match 2 or 3 characters (call it p1)
6225 \end_layout
6226
6227 \begin_layout LyX-Code
6228             0...4        match 0 to 4 characters
6229 \end_layout
6230
6231 \begin_layout LyX-Code
6232             p2=2...5     match 2 to 5 characters (call it p2)
6233 \end_layout
6234
6235 \begin_layout LyX-Code
6236             1...5        match 1 to 5 characters
6237 \end_layout
6238
6239 \begin_layout LyX-Code
6240             r1~p2        match the reverse complement of p2,
6241 \end_layout
6242
6243 \begin_layout LyX-Code
6244                             allowing G-A and A-G pairs
6245 \end_layout
6246
6247 \begin_layout LyX-Code
6248             0...4        match 0 to 4 characters
6249 \end_layout
6250
6251 \begin_layout LyX-Code
6252             ~p1          match the reverse complement of p1
6253 \end_layout
6254
6255 \begin_layout LyX-Code
6256                             allowing only G-C, C-G, A-T, and T-A pairs
6257 \end_layout
6258
6259 \begin_layout LyX-Code
6260     Thus, r1~p2 means "match the reverse complement of p2 using rule r1".
6261 \end_layout
6262
6263 \begin_layout LyX-Code
6264     Now let us consider the issue of tolerating mismatches and bulges.
6265 \end_layout
6266
6267 \begin_layout LyX-Code
6268     You may add a "qualifier" to the pattern unit that gives the
6269 \end_layout
6270
6271 \begin_layout LyX-Code
6272     tolerable number of "mismatches, deletions, and insertions".
6273 \end_layout
6274
6275 \begin_layout LyX-Code
6276     Thus,
6277 \end_layout
6278
6279 \begin_layout LyX-Code
6280                 p1=10...10 3...8 ~p1[1,2,1]
6281 \end_layout
6282
6283 \begin_layout LyX-Code
6284     means that the third pattern unit must match 10 characters,
6285 \end_layout
6286
6287 \begin_layout LyX-Code
6288     allowing one "mismatch" (a pairing other than G-C, C-G, A-T, or
6289 \end_layout
6290
6291 \begin_layout LyX-Code
6292     T-A), two deletions (a deletion is a character that occurs in p1,
6293 \end_layout
6294
6295 \begin_layout LyX-Code
6296     but has been "deleted" from the string matched by ~p1), and one
6297 \end_layout
6298
6299 \begin_layout LyX-Code
6300     insertion (an "insertion" is a character that occurs in the string
6301 \end_layout
6302
6303 \begin_layout LyX-Code
6304     matched by ~p1, but not for which no corresponding character
6305 \end_layout
6306
6307 \begin_layout LyX-Code
6308     occurs in p1).
6309   In this case, the pattern would match
6310 \end_layout
6311
6312 \begin_layout LyX-Code
6313               ACGTACGTAC GGGGGGGG GCGTTACCT
6314 \end_layout
6315
6316 \begin_layout LyX-Code
6317     which is, you must admit, a fairly weak loop.
6318   It is common to
6319 \end_layout
6320
6321 \begin_layout LyX-Code
6322     allow mismatches, but you will find yourself using insertions and
6323 \end_layout
6324
6325 \begin_layout LyX-Code
6326     deletions much more rarely.
6327   In any event, you should note that
6328 \end_layout
6329
6330 \begin_layout LyX-Code
6331     allowing mismatches, insertions, and deletions does force the
6332 \end_layout
6333
6334 \begin_layout LyX-Code
6335     program to try many additional possible pairings, so it does slow
6336 \end_layout
6337
6338 \begin_layout LyX-Code
6339     things down a bit.
6340 \end_layout
6341
6342 \begin_layout LyX-Code
6343 How Patterns Are Matched
6344 \end_layout
6345
6346 \begin_layout LyX-Code
6347     Now is as good a time as any to discuss the basic flow of control
6348 \end_layout
6349
6350 \begin_layout LyX-Code
6351     when matching patterns.
6352   Recall that a "pattern" is a sequence of
6353 \end_layout
6354
6355 \begin_layout LyX-Code
6356     "pattern units".
6357   Suppose that the pattern units were
6358 \end_layout
6359
6360 \begin_layout LyX-Code
6361         u1 u2 u3 u4 ...
6362  un
6363 \end_layout
6364
6365 \begin_layout LyX-Code
6366     The scan of a sequence S begins by setting the current position
6367 \end_layout
6368
6369 \begin_layout LyX-Code
6370     to 1.
6371   Then, an attempt is made to match u1 starting at the
6372 \end_layout
6373
6374 \begin_layout LyX-Code
6375     current position.
6376   Each attempt to match a pattern unit can
6377 \end_layout
6378
6379 \begin_layout LyX-Code
6380     succeed or fail.
6381   If it succeeds, then an attempt is made to match
6382 \end_layout
6383
6384 \begin_layout LyX-Code
6385     the next unit.
6386   If it fails, then an attempt is made to find an
6387 \end_layout
6388
6389 \begin_layout LyX-Code
6390     alternative match for the immediately preceding pattern unit.
6391   If
6392 \end_layout
6393
6394 \begin_layout LyX-Code
6395     this succeeds, then we proceed forward again to the next unit.
6396   If
6397 \end_layout
6398
6399 \begin_layout LyX-Code
6400     it fails we go back to the preceding unit.
6401   This process is called
6402 \end_layout
6403
6404 \begin_layout LyX-Code
6405     "backtracking".
6406   If there are no previous units, then the current
6407 \end_layout
6408
6409 \begin_layout LyX-Code
6410     position is incremented by one, and everything starts again.
6411   This
6412 \end_layout
6413
6414 \begin_layout LyX-Code
6415     proceeds until either the current position goes past the end of
6416 \end_layout
6417
6418 \begin_layout LyX-Code
6419     the sequence or all of the pattern units succeed.
6420   On success,
6421 \end_layout
6422
6423 \begin_layout LyX-Code
6424     scan_for_matches reports the "hit", the current position is set
6425 \end_layout
6426
6427 \begin_layout LyX-Code
6428     just past the hit, and an attempt is made to find another hit.
6429 \end_layout
6430
6431 \begin_layout LyX-Code
6432     If you wish to limit the scan to simply finding a maximum of, say,
6433 \end_layout
6434
6435 \begin_layout LyX-Code
6436     10 hits, you can use the -n option (-n 10 would set the limit to
6437 \end_layout
6438
6439 \begin_layout LyX-Code
6440     10 reported hits).
6441   For example,
6442 \end_layout
6443
6444 \begin_layout LyX-Code
6445         scan_for_matches -c -n 1 pat_file < test_dna_input
6446 \end_layout
6447
6448 \begin_layout LyX-Code
6449     would search for just the first hit (and would stop searching the
6450 \end_layout
6451
6452 \begin_layout LyX-Code
6453     current sequences or any that follow in the input file).
6454 \end_layout
6455
6456 \begin_layout LyX-Code
6457 Searching for repeats:
6458 \end_layout
6459
6460 \begin_layout LyX-Code
6461     In the last section, I discussed almost all of the details
6462 \end_layout
6463
6464 \begin_layout LyX-Code
6465     required to allow you to look for repeats.
6466   Consider the following
6467 \end_layout
6468
6469 \begin_layout LyX-Code
6470     set of patterns:
6471 \end_layout
6472
6473 \begin_layout LyX-Code
6474         p1=6...6 3...8 p1   (find exact 6 character repeat separated
6475 \end_layout
6476
6477 \begin_layout LyX-Code
6478                              by to 8 characters)
6479 \end_layout
6480
6481 \begin_layout LyX-Code
6482         p1=6...6 3..8 p1[1,0,0]   (allow one mismatch)
6483 \end_layout
6484
6485 \begin_layout LyX-Code
6486         p1=3...3 p1[1,0,0] p1[1,0,0] p1[1,0,0]
6487 \end_layout
6488
6489 \begin_layout LyX-Code
6490                             (match 12 characters that are the remains
6491 \end_layout
6492
6493 \begin_layout LyX-Code
6494                              of a 3-character sequence occurring 4 times)
6495 \end_layout
6496
6497 \begin_layout LyX-Code
6498
6499 \end_layout
6500
6501 \begin_layout LyX-Code
6502         p1=4...8 0...3 p2=6...8 p1 0...3 p2
6503 \end_layout
6504
6505 \begin_layout LyX-Code
6506                             (This would match things like
6507 \end_layout
6508
6509 \begin_layout LyX-Code
6510                                 ATCT G TCTTT ATCT TG TCTTT
6511 \end_layout
6512
6513 \begin_layout LyX-Code
6514                             )
6515 \end_layout
6516
6517 \begin_layout LyX-Code
6518 Searching for particular sequences:
6519 \end_layout
6520
6521 \begin_layout LyX-Code
6522     Occasionally, one wishes to match a specific, known sequence.
6523 \end_layout
6524
6525 \begin_layout LyX-Code
6526     In such a case, you can just give the sequence (along with an
6527 \end_layout
6528
6529 \begin_layout LyX-Code
6530     optional statement of the allowable mismatches, insertions, and
6531 \end_layout
6532
6533 \begin_layout LyX-Code
6534     deletions).
6535   Thus,
6536 \end_layout
6537
6538 \begin_layout LyX-Code
6539         p1=6...8 GAGA ~p1    (match a hairpin with GAGA as the loop)
6540 \end_layout
6541
6542 \begin_layout LyX-Code
6543         RRRRYYYY             (match 4 purines followed by 4 pyrimidines)
6544 \end_layout
6545
6546 \begin_layout LyX-Code
6547         TATAA[1,0,0]         (match TATAA, allowing 1 mismatch)
6548 \end_layout
6549
6550 \begin_layout LyX-Code
6551
6552 \end_layout
6553
6554 \begin_layout LyX-Code
6555 Matches against a "weight matrix":
6556 \end_layout
6557
6558 \begin_layout LyX-Code
6559     I will conclude my examples of the types of pattern units
6560 \end_layout
6561
6562 \begin_layout LyX-Code
6563     available for matching against nucleotide sequences by discussing a
6564 \end_layout
6565
6566 \begin_layout LyX-Code
6567     crude implemetation of matching using a "weight matrix".
6568   While I
6569 \end_layout
6570
6571 \begin_layout LyX-Code
6572     am less than overwhelmed with the syntax that I chose, I think that
6573 \end_layout
6574
6575 \begin_layout LyX-Code
6576     the reader should be aware that I was thinking of generating
6577 \end_layout
6578
6579 \begin_layout LyX-Code
6580     patterns containing such pattern units automatically from
6581 \end_layout
6582
6583 \begin_layout LyX-Code
6584     alignments (and did not really plan on typing such things in by
6585 \end_layout
6586
6587 \begin_layout LyX-Code
6588     hand very often).
6589   Anyway, suppose that you wanted to match a
6590 \end_layout
6591
6592 \begin_layout LyX-Code
6593     sequence of eight characters.
6594   The "consensus" of these eight
6595 \end_layout
6596
6597 \begin_layout LyX-Code
6598     characters is GRCACCGS, but the actual "frequencies of occurrence"
6599 \end_layout
6600
6601 \begin_layout LyX-Code
6602     are given in the matrix below.
6603   Thus, the first character is an A
6604 \end_layout
6605
6606 \begin_layout LyX-Code
6607     16% the time and a G 84% of the time.
6608   The second is an A 57% of
6609 \end_layout
6610
6611 \begin_layout LyX-Code
6612     the time, a C 10% of the time, a G 29% of the time, and a T 4% of
6613 \end_layout
6614
6615 \begin_layout LyX-Code
6616     the time.
6617
6618 \end_layout
6619
6620 \begin_layout LyX-Code
6621              C1     C2    C3    C4   C5    C6    C7    C8
6622 \end_layout
6623
6624 \begin_layout LyX-Code
6625
6626 \end_layout
6627
6628 \begin_layout LyX-Code
6629        A     16     57     0    95    0    18     0     0
6630 \end_layout
6631
6632 \begin_layout LyX-Code
6633        C      0     10    80     0  100    60     0    50
6634 \end_layout
6635
6636 \begin_layout LyX-Code
6637        G     84     29     0     0    0    20   100    50
6638 \end_layout
6639
6640 \begin_layout LyX-Code
6641        T      0      4    20     5    0     2     0     0
6642 \end_layout
6643
6644 \begin_layout LyX-Code
6645
6646 \end_layout
6647
6648 \begin_layout LyX-Code
6649     One could use the following pattern unit to search for inexact
6650 \end_layout
6651
6652 \begin_layout LyX-Code
6653     matches related to such a "weight matrix":
6654 \end_layout
6655
6656 \begin_layout LyX-Code
6657         {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5),
6658 \end_layout
6659
6660 \begin_layout LyX-Code
6661          (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450
6662 \end_layout
6663
6664 \begin_layout LyX-Code
6665     This pattern unit will attempt to match exactly eight characters.
6666 \end_layout
6667
6668 \begin_layout LyX-Code
6669     For each character in the sequence, the entry in the corresponding
6670 \end_layout
6671
6672 \begin_layout LyX-Code
6673     tuple is added to an accumulated sum.
6674   If the sum is greater than
6675 \end_layout
6676
6677 \begin_layout LyX-Code
6678     450, the match succeeds; else it fails.
6679 \end_layout
6680
6681 \begin_layout LyX-Code
6682     Recently, this feature was upgraded to allow ranges.
6683   Thus,
6684 \end_layout
6685
6686 \begin_layout LyX-Code
6687   600 >  {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5),
6688 \end_layout
6689
6690 \begin_layout LyX-Code
6691          (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450
6692 \end_layout
6693
6694 \begin_layout LyX-Code
6695     will work, as well.
6696 \end_layout
6697
6698 \begin_layout LyX-Code
6699 Allowing Alternatives:
6700 \end_layout
6701
6702 \begin_layout LyX-Code
6703     Very occasionally, you may wish to allow alternative pattern units
6704 \end_layout
6705
6706 \begin_layout LyX-Code
6707     (i.e., "match either A or B").
6708   You can do this using something
6709 \end_layout
6710
6711 \begin_layout LyX-Code
6712     like
6713 \end_layout
6714
6715 \begin_layout LyX-Code
6716                 ( GAGA | GCGCA)
6717 \end_layout
6718
6719 \begin_layout LyX-Code
6720     which says "match either GAGA or GCGCA".
6721   You may take
6722 \end_layout
6723
6724 \begin_layout LyX-Code
6725     alternatives of a list of pattern units, for example
6726 \end_layout
6727
6728 \begin_layout LyX-Code
6729         (p1=3...3 3...8 ~p1 | p1=5...5 4...4 ~p1 GGG)
6730 \end_layout
6731
6732 \begin_layout LyX-Code
6733     would match one of two sequences of pattern units.
6734   There is one
6735 \end_layout
6736
6737 \begin_layout LyX-Code
6738     clumsy aspect of the syntax: to match a list of alternatives, you
6739 \end_layout
6740
6741 \begin_layout LyX-Code
6742     need to fully the request.
6743   Thus,
6744 \end_layout
6745
6746 \begin_layout LyX-Code
6747         (GAGA | (GCGCA | TTCGA))
6748 \end_layout
6749
6750 \begin_layout LyX-Code
6751     would be needed to try the three alternatives.
6752 \end_layout
6753
6754 \begin_layout LyX-Code
6755 One Minor Extension
6756 \end_layout
6757
6758 \begin_layout LyX-Code
6759     Sometimes a pattern will contain a sequence of distinct ranges,
6760 \end_layout
6761
6762 \begin_layout LyX-Code
6763     and you might wish to limit the sum of the lengths of the matched
6764 \end_layout
6765
6766 \begin_layout LyX-Code
6767     subsequences.
6768    For example, suppose that you basically wanted to
6769 \end_layout
6770
6771 \begin_layout LyX-Code
6772     match something like
6773 \end_layout
6774
6775 \begin_layout LyX-Code
6776     ARRYYTT p1=0...5 GCA[1,0,0] p2=1...6 ~p1 4...8 ~p2 p3=4...10 CCT
6777 \end_layout
6778
6779 \begin_layout LyX-Code
6780     but that the sum of the lengths of p1, p2, and p3 must not exceed
6781 \end_layout
6782
6783 \begin_layout LyX-Code
6784     eight characters.
6785   To do this, you could add
6786 \end_layout
6787
6788 \begin_layout LyX-Code
6789         length(p1+p2+p3) < 9
6790 \end_layout
6791
6792 \begin_layout LyX-Code
6793     as the last pattern unit.
6794   It will just succeed or fail (but does
6795 \end_layout
6796
6797 \begin_layout LyX-Code
6798     not actually match any characters in the sequence).
6799 \end_layout
6800
6801 \begin_layout LyX-Code
6802
6803 \end_layout
6804
6805 \begin_layout LyX-Code
6806 Matching Protein Sequences
6807 \end_layout
6808
6809 \begin_layout LyX-Code
6810     Suppose that the input file contains protein sequences.
6811   In this
6812 \end_layout
6813
6814 \begin_layout LyX-Code
6815     case, you must invoke scan_for_matches with the "-p" option.
6816   You
6817 \end_layout
6818
6819 \begin_layout LyX-Code
6820     cannot use aspects of the language that relate directly to
6821 \end_layout
6822
6823 \begin_layout LyX-Code
6824     nucleotide sequences (e.g., the -c command line option or pattern
6825 \end_layout
6826
6827 \begin_layout LyX-Code
6828     constructs referring to the reverse complement of a previously
6829 \end_layout
6830
6831 \begin_layout LyX-Code
6832     matched unit).
6833
6834 \end_layout
6835
6836 \begin_layout LyX-Code
6837     You also have two additional constructs that allow you to match
6838 \end_layout
6839
6840 \begin_layout LyX-Code
6841     either "one of a set of amino acids" or "any amino acid other than
6842 \end_layout
6843
6844 \begin_layout LyX-Code
6845     those a given set".
6846   For example,
6847 \end_layout
6848
6849 \begin_layout LyX-Code
6850         p1=0...4 any(HQD) 1...3 notany(HK) p1
6851 \end_layout
6852
6853 \begin_layout LyX-Code
6854     would successfully match a string like
6855 \end_layout
6856
6857 \begin_layout LyX-Code
6858            YWV D AA C YWV
6859 \end_layout
6860
6861 \begin_layout LyX-Code
6862 Using the show_hits Utility
6863 \end_layout
6864
6865 \begin_layout LyX-Code
6866     When viewing a large set of complex matches, you might find it
6867 \end_layout
6868
6869 \begin_layout LyX-Code
6870     convenient to post-process the scan_for_matches output to get a
6871 \end_layout
6872
6873 \begin_layout LyX-Code
6874     more readable version.
6875   We provide a simple post-processor called
6876 \end_layout
6877
6878 \begin_layout LyX-Code
6879     "show_hits".
6880   To see its effect, just pipe the output of a
6881 \end_layout
6882
6883 \begin_layout LyX-Code
6884     scan_for_matches into show_hits:
6885 \end_layout
6886
6887 \begin_layout LyX-Code
6888      Normal Output:
6889 \end_layout
6890
6891 \begin_layout LyX-Code
6892         clone% scan_for_matches -c pat_file < tmp
6893 \end_layout
6894
6895 \begin_layout LyX-Code
6896         >tst1:[1,28]
6897 \end_layout
6898
6899 \begin_layout LyX-Code
6900         gtacguaacc  ggttaac cgguuacgtac
6901 \end_layout
6902
6903 \begin_layout LyX-Code
6904         >tst1:[28,1]
6905 \end_layout
6906
6907 \begin_layout LyX-Code
6908         gtacgtaacc  ggttaac cggttacgtac
6909 \end_layout
6910
6911 \begin_layout LyX-Code
6912         >tst2:[2,31]
6913 \end_layout
6914
6915 \begin_layout LyX-Code
6916         CGTACGUAAC C GGTTAACC GGUUACGTACG
6917 \end_layout
6918
6919 \begin_layout LyX-Code
6920         >tst2:[31,2]
6921 \end_layout
6922
6923 \begin_layout LyX-Code
6924         CGTACGTAAC C GGTTAACC GGTTACGTACG
6925 \end_layout
6926
6927 \begin_layout LyX-Code
6928         >tst3:[3,32]
6929 \end_layout
6930
6931 \begin_layout LyX-Code
6932         gtacguaacc g gttaactt cgguuacgtac
6933 \end_layout
6934
6935 \begin_layout LyX-Code
6936         >tst3:[32,3]
6937 \end_layout
6938
6939 \begin_layout LyX-Code
6940         gtacgtaacc g aagttaac cggttacgtac
6941 \end_layout
6942
6943 \begin_layout LyX-Code
6944      Piped Through show_hits:
6945 \end_layout
6946
6947 \begin_layout LyX-Code
6948
6949 \end_layout
6950
6951 \begin_layout LyX-Code
6952         clone% scan_for_matches -c pat_file < tmp | show_hits
6953 \end_layout
6954
6955 \begin_layout LyX-Code
6956         tst1:[1,28]:  gtacguaacc   ggttaac  cgguuacgtac
6957 \end_layout
6958
6959 \begin_layout LyX-Code
6960         tst1:[28,1]:  gtacgtaacc   ggttaac  cggttacgtac
6961 \end_layout
6962
6963 \begin_layout LyX-Code
6964         tst2:[2,31]:  CGTACGUAAC C GGTTAACC GGUUACGTACG
6965 \end_layout
6966
6967 \begin_layout LyX-Code
6968         tst2:[31,2]:  CGTACGTAAC C GGTTAACC GGTTACGTACG
6969 \end_layout
6970
6971 \begin_layout LyX-Code
6972         tst3:[3,32]:  gtacguaacc g gttaactt cgguuacgtac
6973 \end_layout
6974
6975 \begin_layout LyX-Code
6976         tst3:[32,3]:  gtacgtaacc g aagttaac cggttacgtac
6977 \end_layout
6978
6979 \begin_layout LyX-Code
6980         clone%
6981 \end_layout
6982
6983 \begin_layout LyX-Code
6984     Optionally, you can specify which of the "fields" in the matches
6985 \end_layout
6986
6987 \begin_layout LyX-Code
6988     you wish to sort on, and show_hits will sort them.
6989   The field
6990 \end_layout
6991
6992 \begin_layout LyX-Code
6993     numbers start with 0.
6994   So, you might get something like
6995 \end_layout
6996
6997 \begin_layout LyX-Code
6998         clone% scan_for_matches -c pat_file < tmp | show_hits 2 1
6999 \end_layout
7000
7001 \begin_layout LyX-Code
7002         tst2:[2,31]:  CGTACGUAAC C GGTTAACC GGUUACGTACG
7003 \end_layout
7004
7005 \begin_layout LyX-Code
7006         tst2:[31,2]:  CGTACGTAAC C GGTTAACC GGTTACGTACG
7007 \end_layout
7008
7009 \begin_layout LyX-Code
7010         tst3:[32,3]:  gtacgtaacc g aagttaac cggttacgtac
7011 \end_layout
7012
7013 \begin_layout LyX-Code
7014         tst1:[1,28]:  gtacguaacc   ggttaac  cgguuacgtac
7015 \end_layout
7016
7017 \begin_layout LyX-Code
7018         tst1:[28,1]:  gtacgtaacc   ggttaac  cggttacgtac
7019 \end_layout
7020
7021 \begin_layout LyX-Code
7022         tst3:[3,32]:  gtacguaacc g gttaactt cgguuacgtac
7023 \end_layout
7024
7025 \begin_layout LyX-Code
7026         clone%
7027 \end_layout
7028
7029 \begin_layout LyX-Code
7030     In this case, the hits have been sorted on fields 2 and 1 (that is,
7031 \end_layout
7032
7033 \begin_layout LyX-Code
7034     the third and second matched subfields).
7035 \end_layout
7036
7037 \begin_layout LyX-Code
7038     show_hits is just one possible little post-processor, and you
7039 \end_layout
7040
7041 \begin_layout LyX-Code
7042     might well wish to write a customized one for yourself.
7043 \end_layout
7044
7045 \begin_layout LyX-Code
7046 Reducing the Cost of a Search
7047 \end_layout
7048
7049 \begin_layout LyX-Code
7050     The scan_for_matches utility uses a fairly simple search, and may
7051 \end_layout
7052
7053 \begin_layout LyX-Code
7054     consume large amounts of CPU time for complex patterns.
7055   Someday,
7056 \end_layout
7057
7058 \begin_layout LyX-Code
7059     I may decide to optimize the code.
7060   However, until then, let me
7061 \end_layout
7062
7063 \begin_layout LyX-Code
7064     mention one useful technique.
7065
7066 \end_layout
7067
7068 \begin_layout LyX-Code
7069     When you have a complex pattern that includes a number of varying
7070 \end_layout
7071
7072 \begin_layout LyX-Code
7073     ranges, imprecise matches, and so forth, it is useful to
7074 \end_layout
7075
7076 \begin_layout LyX-Code
7077     "pipeline" matches.
7078   That is, form a simpler pattern that can be
7079 \end_layout
7080
7081 \begin_layout LyX-Code
7082     used to scan through a large database extracting sections that
7083 \end_layout
7084
7085 \begin_layout LyX-Code
7086     might be matched by the more complex pattern.
7087   Let me illustrate
7088 \end_layout
7089
7090 \begin_layout LyX-Code
7091     with a short example.
7092   Suppose that you really wished to match the
7093 \end_layout
7094
7095 \begin_layout LyX-Code
7096     pattern
7097 \end_layout
7098
7099 \begin_layout LyX-Code
7100     p1=3...5 0...8 ~p1[1,1,0] p2=6...7 3...6 AGC 3...5 RYGC ~p2[1,0,0]
7101 \end_layout
7102
7103 \begin_layout LyX-Code
7104     In this case, the pattern units AGC 3...5 RYGC can be used to rapidly
7105 \end_layout
7106
7107 \begin_layout LyX-Code
7108     constrain the overall search.
7109   You can preprocess the overall
7110 \end_layout
7111
7112 \begin_layout LyX-Code
7113     database using the pattern:
7114 \end_layout
7115
7116 \begin_layout LyX-Code
7117           31...31 AGC 3...5 RYGC 7...7
7118 \end_layout
7119
7120 \begin_layout LyX-Code
7121     Put the complex pattern in pat_file1 and the simpler pattern in
7122 \end_layout
7123
7124 \begin_layout LyX-Code
7125     pat_file2.
7126   Then use,
7127 \end_layout
7128
7129 \begin_layout LyX-Code
7130         scan_for_matches -c pat_file2 < nucleotide_database |
7131 \end_layout
7132
7133 \begin_layout LyX-Code
7134         scan_for_matches pat_file1
7135 \end_layout
7136
7137 \begin_layout LyX-Code
7138     The output will show things like
7139 \end_layout
7140
7141 \begin_layout LyX-Code
7142     >seqid:[232,280][2,47]
7143 \end_layout
7144
7145 \begin_layout LyX-Code
7146     matches pieces
7147 \end_layout
7148
7149 \begin_layout LyX-Code
7150     Then, the actual section of the sequence that was matched can be
7151 \end_layout
7152
7153 \begin_layout LyX-Code
7154     easily computed as [233,278] (remember, the positions start from
7155 \end_layout
7156
7157 \begin_layout LyX-Code
7158     1, not 0).
7159 \end_layout
7160
7161 \begin_layout LyX-Code
7162     Let me finally add, you should do a few short experiments to see
7163 \end_layout
7164
7165 \begin_layout LyX-Code
7166     whether or not such pipelining actually improves performance -- it
7167 \end_layout
7168
7169 \begin_layout LyX-Code
7170     is not always obvious where the time is going, and I have
7171 \end_layout
7172
7173 \begin_layout LyX-Code
7174     sometimes found that the added complexity of pipelining actually
7175 \end_layout
7176
7177 \begin_layout LyX-Code
7178     slowed things up.
7179   It gets its best improvements when there are
7180 \end_layout
7181
7182 \begin_layout LyX-Code
7183     exact matches of more than just a few characters that can be
7184 \end_layout
7185
7186 \begin_layout LyX-Code
7187     rapidly used to eliminate large sections of the database.
7188 \end_layout
7189
7190 \begin_layout LyX-Code
7191 =============
7192 \end_layout
7193
7194 \begin_layout LyX-Code
7195 Additions:
7196 \end_layout
7197
7198 \begin_layout LyX-Code
7199 Feb 9, 1995:   the pattern units ^ and $ now work as in normal regular
7200 \end_layout
7201
7202 \begin_layout LyX-Code
7203                expressions.
7204   That is
7205 \end_layout
7206
7207 \begin_layout LyX-Code
7208                         TTF $
7209 \end_layout
7210
7211 \begin_layout LyX-Code
7212                matches only TTF at the end of the string and
7213 \end_layout
7214
7215 \begin_layout LyX-Code
7216                         ^ TTF
7217 \end_layout
7218
7219 \begin_layout LyX-Code
7220                matches only an initial TTF
7221 \end_layout
7222
7223 \begin_layout LyX-Code
7224                The pattern unit
7225 \end_layout
7226
7227 \begin_layout LyX-Code
7228                         <p1
7229 \end_layout
7230
7231 \begin_layout LyX-Code
7232                matches the reverse of the string named p1.
7233   That is,
7234 \end_layout
7235
7236 \begin_layout LyX-Code
7237                if p1 matched GCAT, then <p1 would match TACG.
7238   Thus,
7239 \end_layout
7240
7241 \begin_layout LyX-Code
7242                    p1=6...6 <p1
7243 \end_layout
7244
7245 \begin_layout LyX-Code
7246                matches a real palindrome (not the biologically common
7247 \end_layout
7248
7249 \begin_layout LyX-Code
7250                meaning of "reverse complement")
7251 \end_layout
7252
7253 \begin_layout LyX-Code
7254
7255 \end_layout
7256
7257 \end_body
7258 \end_document