From: martinahansen Date: Fri, 27 Jun 2008 05:29:57 +0000 (+0000) Subject: new wiki usage test X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=f9d549b6b97555f06d42e3589a1e4aaf75716aa9;p=biopieces.git new wiki usage test git-svn-id: http://biopieces.googlecode.com/svn/trunk@54 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_doc/00README b/bp_doc/00README new file mode 100644 index 0000000..cdbed61 --- /dev/null +++ b/bp_doc/00README @@ -0,0 +1,2 @@ +# Adding stuff to the wiki, like images and documents +svn co https://biopieces.googlecode.com/svn/wiki wiki --username martinahansen diff --git a/bp_doc/biopieces_cookbook.lyx~ b/bp_doc/biopieces_cookbook.lyx~ new file mode 100644 index 0000000..1090825 --- /dev/null +++ b/bp_doc/biopieces_cookbook.lyx~ @@ -0,0 +1,7258 @@ +#LyX 1.5.1 created this file. For more info see http://www.lyx.org/ +\lyxformat 276 +\begin_document +\begin_header +\textclass scrartcl +\begin_preamble +\usepackage[colorlinks=true, urlcolor=blue, linkcolor=black]{hyperref} +\end_preamble +\language english +\inputencoding auto +\font_roman default +\font_sans default +\font_typewriter default +\font_default_family default +\font_sc false +\font_osf false +\font_sf_scale 100 +\font_tt_scale 100 +\graphics default +\paperfontsize default +\spacing single +\papersize default +\use_geometry false +\use_amsmath 1 +\use_esint 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation skip +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes false +\output_changes false +\author "" +\author "" +\end_header + +\begin_body + +\begin_layout Title +Biopieces Cookbook +\end_layout + +\begin_layout Author +Martin Asser Hansen +\end_layout + +\begin_layout Publishers +John Mattick Group +\newline +Institute for Molecular Bioscience +\newline +University of Queensland +\newline +Aust +ralia +\newline +E-mail: mail@maasha.dk +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Standard + + +\backslash +thispagestyle{empty} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard + +\newpage + +\end_layout + +\begin_layout Standard +\begin_inset LatexCommand tableofcontents + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset FloatList figure + +\end_inset + + +\end_layout + +\begin_layout Standard + +\newpage + +\end_layout + +\begin_layout Section +Introduction +\end_layout + +\begin_layout Standard +Biopieces is a collection of simple tools that can be linked together (piped + as we shall call it) in a very flexible manner to perform both simple and + complex tasks. + The fundamental idea is that biopieces work on a data stream that will + only terminate at the end of an analysis and that this data stream can + be passed through several different biopieces, each performing one specific + task. + The advantage of this approach is that a user can perform simple and complex + tasks without having to write advanced code. + Moreover, since the data format used to pass data between biopieces is + text based, biopieces can be written by different developers in their favorite + programming language --- and still the biopieces will be able to work together. +\end_layout + +\begin_layout Standard +In the most simple form bioools can be piped together on the command line + like this (using the pipe character '|'): +\end_layout + +\begin_layout LyX-Code +read_data | calculate_something | write_result +\end_layout + +\begin_layout Standard +However, a more comprehensive analysis could be composed: +\end_layout + +\begin_layout LyX-Code +read_data | select_entries | convert_entries | search_database +\end_layout + +\begin_layout LyX-Code +evaluate_results | plot_diagram | plot_another_diagram | +\end_layout + +\begin_layout LyX-Code +load_to_database +\end_layout + +\begin_layout Standard +The data stream that is piped through the biopieces consists of records + of key/value pairs in the same way a hash does in order to keep as simple + a structure as possible. + An example record can be seen below: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +REC_TYPE: PATSCAN +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +MATCH: AGATCAAGTG +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +S_BEG: 7 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +S_END: 16 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +ALIGN_LEN: 9 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +S_ID: piR-t6 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +STRAND: + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +PATTERN: AGATCAAGTG +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout Standard +The ' +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\backslash +/- +\end_layout + +\end_inset + +' denotes the delimiter of the records, and each key is a word followed + by a ':' and a white-space and then the value. + By convention the biopieces only uses upper case keys (a list of used keys + can be seen in Appendix\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sec:Keys" + +\end_inset + +). + Since the records basically are hash structures this mean that the order + of the keys in the stream is unordered, and in the above example it is + pure coincidence that HIT_BEG is displayed before HIT_END, however, when + the order of the keys is importent, the biopieces will automagically see + to that. +\end_layout + +\begin_layout Standard +All of the biopieces are able to read and write a data stream to and from + file as long as the records are in the biopieces format. + This means that if you are undertaking a lengthy analysis where one of + the steps is time consuming, you may save the stream after this step, and + subsequently start one or more analysis from that last step +\begin_inset Foot +status collapsed + +\begin_layout Standard +It is a goal that the biopieces at some point will be able to dump the data + stream to file in case one of the tools fail critically. +\end_layout + +\end_inset + +. + If you are running a lengthy analysis it is highly recommended that you + create a small test sample of the data and run that through the pipeline + --- and once you are satisfied with the result proceed with the full data + set (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-select-a-few-records" + +\end_inset + +). +\end_layout + +\begin_layout Standard +All of the biopieces can be supplied with long arguments prefixed with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + + switches or single character arguments prefixed with - switches that can + be grouped together (e.g. + -xok). + In this cookbook only the long switches are used to emphasize what these + switches do. +\end_layout + +\begin_layout Section +Setup +\end_layout + +\begin_layout Standard +In order to get the biopieces to work, you need to add environment settings + to include the code binaries, scripts, and modules that constitute the + biopieces package. + Assuming that you are using bash, add the following to your '~/.bashrc' + file using your favorite editor. + After the changes has been saved you need to either run 'source ~/.bashrc' + or relogin. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +if [ -f "/home/m.hansen/maasha/conf/bashrc" ]; then +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + source "/home/m.hansen/maasha/conf/bashrc" +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +fi +\end_layout + +\begin_layout Section +Getting Started +\end_layout + +\begin_layout Standard +The biopiece +\series bold +list_biopieces +\series default + lists all the biopieces along with a description: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +list_biopieces +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +align_seq Align sequences in stream using Muscle. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +analyze_seq Analysis the residue composition of each sequence + in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +analyze_vals Determine type, count, min, max, sum and mean for + values in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +blast_seq BLAST sequences in stream against a specified database. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +blat_seq BLAT sequences in stream against a specified genome. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +complement_seq Complement sequences in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_records Count the number of records in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_seq Count sequences in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_vals Count the number of times values of given keys exists + in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +create_blast_db Create a BLAST database from sequences in stream for + use with BLAST. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +... +\end_layout + +\begin_layout Standard +To list the biopieces for writing different formats, you can use unix's + grep like this: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +list_biopieces | grep write +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_align Write aligned sequences in pretty alignment format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_bed Write records from stream as BED lines. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_blast Write BLAST records from stream in BLAST tabular format + (-m8 and 9). +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_fasta Write sequences in FASTA format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_psl Write records from stream in PSL format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_tab Write records from stream as tab separated table. +\end_layout + +\begin_layout Standard +In order to find out how a specific biopiece works, you just type the program + name without any arguments and press return and the usage of the biopiece + will be displayed. + E.g. + +\series bold +read_fasta +\series default + : +\end_layout + +\begin_layout Standard +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +width "100col%" +special "none" +height "1in" +height_special "totalheight" +status open + +\begin_layout LyX-Code + +\size scriptsize +Program name: read_fasta +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Author: Martin Asser Hansen - Copyright (C) - All rights reserved +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Contact: mail@maasha.dk +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Date: August 2007 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/ +gpl.html) +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Description: Read FASTA entries. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Usage: read_fasta [options] -i +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Options: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-i | --data_in=] - Comma separated list of files + to read. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-n | --num=] - Limit number of records to read. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-I | --stream_in=] - Read input stream from file + - Default=STDIN +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-O | --stream_out=] - Write output stream to file + - Default=STDOUT +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Examples: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test.fna - Read FASTA entries from file. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test1.fna,test2,fna - Read FASTA entries from files. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i '*.fna' - Read FASTA entries from files. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test.fna -n 10 - Read first 10 FASTA entries from + file. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +The Data Stream +\end_layout + +\begin_layout Subsection +How to read the data stream from file? +\begin_inset LatexCommand label +name "sub:How-to-read-stream" + +\end_inset + + +\end_layout + +\begin_layout Standard +You want to read a data stream that you previously have saved to file in + biopieces format. + This can be done implicetly or explicitly. + The implicit way uses the 'stdout' stream of the Unix terminal: +\end_layout + +\begin_layout LyX-Code +cat | +\end_layout + +\begin_layout Standard +cat is the Unix command that reads a file and output the result to 'stdout' + --- which in this case is piped to any biopiece represented by the . + It is also possible to read the data stream using '<' to direct the 'stdout' + stream into the biopiece like this: +\end_layout + +\begin_layout LyX-Code + < +\end_layout + +\begin_layout Standard +However, that will not work if you pipe more biopieces together. + Then it is much safer to read the stream from a file explicitly like this: +\end_layout + +\begin_layout LyX-Code + --stream_in= +\end_layout + +\begin_layout Standard +Here the filename is explicetly given to the biopiece + with the switch +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_in. + This switch works with all biopieces. + It is also possible to read in data from multiple sources by repeating + the explicit read step: +\end_layout + +\begin_layout LyX-Code + --stream_in= | --stream_in= +\end_layout + +\begin_layout Subsection +How to write the data stream to file? +\begin_inset LatexCommand label +name "sub:How-to-write-stream" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to save the output stream from a biopiece to file, so you can read + in the stream again at a later time, you can do one of two things: +\end_layout + +\begin_layout LyX-Code + > +\end_layout + +\begin_layout Standard +All, the biopieces write the data stream to 'stdout' by default which can + be written to a file by redirecting 'stdout' to file using '>' , however, + if one of the biopieces for writing other formats is used then the both + the biopieces records as well as the result output will go to 'stdout' + in a mixture causing havock! To avoid this you must use the switch +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out that explictly tells the biopiece to write the output stream + to file: +\end_layout + +\begin_layout LyX-Code + --stream_out= +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out switch works with all biopieces. +\end_layout + +\begin_layout Subsection +How to terminate the data stream? +\end_layout + +\begin_layout Standard +The data stream is never stops unless the user want to save the stream or + by supplying the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch that will terminate the stream: +\end_layout + +\begin_layout LyX-Code + --no_stream +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch only works with those biopieces where it makes sense that + the user might want to terminale the data stream, +\emph on +i.e +\emph default +. + after an analysis step where the user wants to output the result, but not + the data stream. +\end_layout + +\begin_layout Subsection +How to write my results to file? +\begin_inset LatexCommand label +name "sub:How-to-write-result" + +\end_inset + + +\end_layout + +\begin_layout Standard +Saving the result of an analysis to file can be done implicitly or explicitly. + The implicit way: +\end_layout + +\begin_layout LyX-Code + --no_stream > +\end_layout + +\begin_layout Standard +If you use '>' to redirect 'stdout' to file then it is important to use + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch to avoid writing a mix of biopieces records and result + to the same file causing havock. + The safe way is to use the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch which explicetly tells the biopiece to write the result + to a given file: +\end_layout + +\begin_layout LyX-Code + --result_out= +\end_layout + +\begin_layout Standard +Using the above method will not terminate the stream, so it is possible + to pipe that into another biopiece generating different results: +\end_layout + +\begin_layout LyX-Code + --result_out= | --result_out= +\end_layout + +\begin_layout Standard +And still the data stream will continue unless terminated with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream: +\end_layout + +\begin_layout LyX-Code + --result_out= --no_stream +\end_layout + +\begin_layout Standard +Or written to file using implicitly or explicity +\begin_inset LatexCommand eqref +reference "sub:How-to-write-result" + +\end_inset + +. + The explicit way: +\end_layout + +\begin_layout LyX-Code + --result_out= --stream_out= +\end_layout + +\begin_layout Subsection +How to read data from multiple sources? +\end_layout + +\begin_layout Standard +To read multiple data sources, with the same type or different type of data + do: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= +\end_layout + +\begin_layout Standard +where type is the data type a specific biopiece reads. +\end_layout + +\begin_layout Section +Reading input +\end_layout + +\begin_layout Subsection +How to read biopieces input? +\end_layout + +\begin_layout Standard +See +\begin_inset LatexCommand eqref +reference "sub:How-to-read-stream" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +How to read in data? +\end_layout + +\begin_layout Standard +Data in different formats can be read with the appropriate biopiece for + that format. + The biopieces are typicalled named 'read_' such as +\series bold +read_fasta +\series default +, +\series bold +read_bed +\series default +, +\series bold +read_tab +\series default +, etc., and all behave in a similar manner. + Data can be read by supplying the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +data_in switch and a file name to the file containing the data: +\end_layout + +\begin_layout LyX-Code + --data_in= +\end_layout + +\begin_layout Standard +It is also possible to read in a saved biopieces stream (see +\begin_inset LatexCommand ref +reference "sub:How-to-read-stream" + +\end_inset + +) as well as reading data in one go: +\end_layout + +\begin_layout LyX-Code + --stream_in= --data_in= +\end_layout + +\begin_layout Standard +If you want to read data from several files you can do this: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= +\end_layout + +\begin_layout Standard +If you have several data files you can read in all explicitly with a comma + separated list: +\end_layout + +\begin_layout LyX-Code + --data_in=file1,file2,file3 +\end_layout + +\begin_layout Standard +And it is also possible to use file globbing +\begin_inset Foot +status open + +\begin_layout Standard +using the short option will only work if you quote the argument -i '*.fna' +\end_layout + +\end_inset + +: +\end_layout + +\begin_layout LyX-Code + --data_in=*.fna +\end_layout + +\begin_layout Standard +Or in a combination: +\end_layout + +\begin_layout LyX-Code + --data_in=file1,/dir/*.fna +\end_layout + +\begin_layout Standard +Finally, it is possible to read in data in different formats using the appropria +te biopiece for each format: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= ... +\end_layout + +\begin_layout Subsection +How to read FASTA input? +\end_layout + +\begin_layout Standard +Sequences in FASTA format can be read explicitly using +\series bold +read_fasta +\series default +: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= +\end_layout + +\begin_layout Subsection +How to read alignment input? +\end_layout + +\begin_layout Standard +If your alignment if FASTA formatted then you can +\series bold +read_align +\series default +. + It is also possible to use +\series bold +read_fasta +\series default + since the data is FASTA formatted, however, with +\series bold +read_fasta +\series default + the key ALIGN will be omitted. + The ALIGN key is used to determine which sequences belong to what alignment + which is required for +\series bold +write_align +\series default +. +\end_layout + +\begin_layout LyX-Code +read_align --data_in= +\end_layout + +\begin_layout Subsection +How to read tabular input? +\begin_inset LatexCommand label +name "sub:How-to-read-table" + +\end_inset + + +\end_layout + +\begin_layout Standard +Tabular input can be read with +\series bold +read_tab +\series default + which will read in all rows and chosen columns (separated by a given delimter) + from a table in text format. +\end_layout + +\begin_layout Standard +The table below: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Standard +Human +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +ATACGTCAG +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +23524 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Dog +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +AGCATGAC +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +2442 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Mouse +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +GACTG +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +234 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Cat +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +AAATGCA +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +2342 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +Can be read using the command: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= +\end_layout + +\begin_layout Standard +Which will result in four records, one for each row, where the keys V0, + V1, V2 are the default keys for the organism, sequence, and count, respectively. + It is possible to select a subset of colums to read by using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +cols switch which takes a comma separated list of columns numbers (first + column is designated 0) as argument. + So to read in only the sequence and the count so that the count comes before + the sequence do: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --cols=2,1 +\end_layout + +\begin_layout Standard +It is also possible to name the columns with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --cols=2,1 --keys=COUNT,SEQ +\end_layout + +\begin_layout Subsection +How to read BED input? +\end_layout + +\begin_layout Standard +The BED (Browser Extensible Data +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://genome.ucsc.edu/FAQ/FAQformat" + +\end_inset + + +\end_layout + +\end_inset + +) format is a tabular format for data pertaining to one of the Eukaryotic + genomes in the UCSC genome brower +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://genome.ucsc.edu/" + +\end_inset + + +\end_layout + +\end_inset + +. + The BED format consists of up to 12 columns, where the first three are + mandatory CHR, CHR_BEG, and CHR_END. + The mandatory columns and any of the optional columns can all be read in + easily with the +\series bold +read_bed +\series default + biopiece. +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= +\end_layout + +\begin_layout Standard +It is also possible to read the BED file with +\series bold +read_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-read-table" + +\end_inset + +), however, that will be more cumbersome because you need to specify the + keys: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --keys=CHR,CHR_BEG,CHR_END ... +\end_layout + +\begin_layout Subsection +How to read PSL input? +\end_layout + +\begin_layout Standard +The PSL format is the output from BLAT and contains 21 mandatory fields + that can be read with +\series bold +read_psl +\series default +: +\end_layout + +\begin_layout LyX-Code +read_psl --data_in= +\end_layout + +\begin_layout Section +Writing output +\end_layout + +\begin_layout Standard +All result output can be written explicitly to file using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch which all result generating biopieces have. + It is also possible to write the result to file implicetly by directing + 'stdout' to file using '>', however, that requires the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream swich to prevent a mixture of data stream and results in the file. + The explicit (and safe) way: +\end_layout + +\begin_layout LyX-Code +... + | --result_out= +\end_layout + +\begin_layout Standard +The implicit way: +\end_layout + +\begin_layout LyX-Code +... + | --no_stream > +\end_layout + +\begin_layout Subsection +How to write biopieces output? +\end_layout + +\begin_layout Standard +See +\begin_inset LatexCommand eqref +reference "sub:How-to-write-stream" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +How to write FASTA output? +\begin_inset LatexCommand label +name "sub:How-to-write-fasta" + +\end_inset + + +\end_layout + +\begin_layout Standard +FASTA output can be written with +\series bold +write_fasta +\series default +. +\end_layout + +\begin_layout LyX-Code +... + | write_fasta --result_out= +\end_layout + +\begin_layout Standard +It is also possible to wrap the sequences to a given width using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +wrap switch allthough wrapping of sequence is generally an evil thing: +\end_layout + +\begin_layout LyX-Code +... + | write_fasta --no_stream --wrap=80 +\end_layout + +\begin_layout Subsection +How to write alignment output? +\begin_inset LatexCommand label +name "sub:How-to-write-alignment" + +\end_inset + + +\end_layout + +\begin_layout Standard +Pretty alignments with ruler +\begin_inset Foot +status collapsed + +\begin_layout Standard +'.' for every 10 residues, ':' for every 50, and '|' for every 100 +\end_layout + +\end_inset + + and consensus sequence +\begin_inset Note Note +status collapsed + +\begin_layout Standard +which reminds me to make that an option. +\end_layout + +\end_inset + + can be created with +\series bold +write_align +\series default +, what also have the optional +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +wrap switch to break the alignment into blocks of a given width: +\end_layout + +\begin_layout LyX-Code +... + | write_align --result_out= --wrap=80 +\end_layout + +\begin_layout Standard +If the number of sequnces in the alignment is 2 then a pairwise alignment + will be output otherwise a multiple alignment. + And if the sequence type, determined automagically, is protein, then residues + and symbols (+,\InsetSpace ~ +:,\InsetSpace ~ +.) will be used to show consensus according to the Blosum62 + matrix. +\end_layout + +\begin_layout Subsection +How to write tabular output? +\begin_inset LatexCommand label +name "sub:How-to-write-tab" + +\end_inset + + +\end_layout + +\begin_layout Standard +Outputting the data stream as a table can be done with +\series bold +write_tab +\series default +, which will write generate one row per record with the values as columns. + If you supply the optional +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comment switch, when the first row in the table will be a 'comment' line + prefixed with a '#': +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --comment +\end_layout + +\begin_layout Standard +You can also change the delimiter from the default (tab) to +\emph on +e.g. + +\emph default + ',': +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --delimit=',' +\end_layout + +\begin_layout Standard +If you want the values output in a specific order you have to supply a comma + separated list using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch that will print only those keys in that order: +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --keys=SEQ_NAME,COUNT +\end_layout + +\begin_layout Standard +Alternatively, if you have some keys that you don't want in the tabular + output, use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_keys switch. + So to print all keys except SEQ and SEQ_TYPE do: +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --no_keys=SEQ,SEQ_TYPE +\end_layout + +\begin_layout Standard +Finally, if you have a stream containing a mix of different records types, + +\emph on +e.g. + +\emph default + records with sequences and records with matches, then you can use +\series bold +write_tab +\series default + to output all the records in tabluar format, however, the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comment, +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys, and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_keys switches will only respond to records of the first type encountered. + The reason is that outputting mixed records is probably not what you want + anyway, and you should remove all the unwanted records from the stream + before outputting the table: +\series bold +grab +\series default + is your friend (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to write a BED output? +\begin_inset LatexCommand label +name "sub:How-to-write-BED" + +\end_inset + + +\end_layout + +\begin_layout Standard +Data in BED format can be output if the records contain the mandatory keys + CHR, CHR_BEG, and CHR_END using +\series bold +write_bed +\series default +. + If the optional keys are also present, they will be output as well: +\end_layout + +\begin_layout LyX-Code +write_bed --result_out= +\end_layout + +\begin_layout Subsection +How to write PSL output? +\begin_inset LatexCommand label +name "sub:How-to-write-PSL" + +\end_inset + + +\end_layout + +\begin_layout Standard +Data in PSL format can be output using +\series bold +write_psl: +\end_layout + +\begin_layout LyX-Code +write_psl --result_out= +\end_layout + +\begin_layout Section +Manipulating Records +\end_layout + +\begin_layout Subsection +How to select a few records? +\begin_inset LatexCommand label +name "sub:How-to-select-a-few-records" + +\end_inset + + +\end_layout + +\begin_layout Standard +To quickly get an overview of your data you can limit the data stream to + show a few records. + This also very useful to test the pipeline with a few records if you are + setting up a complex analysis using several biopieces. + That way you can inspect that all goes well before analyzing and waiting + for the full data set. + All of the read_ biopieces have the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch which will take a number as argument and only that number of + records will be read. + So to read in the first 10 FASTA entries from a file: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna --num=10 +\end_layout + +\begin_layout Standard +Another way of doing this is to use +\series bold +head_records +\series default + will limit the stream to show the first 10 records (default): +\end_layout + +\begin_layout LyX-Code +... + | head_records +\end_layout + +\begin_layout Standard +Using +\series bold +head_records +\series default + directly after one of the read_ biopieces will be a lot slower than + using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch with the read_ biopieces, however, +\series bold +head_records +\series default + can also be used to limit the output from all the other biopieces. + It is also possible to give +\series bold +head_records +\series default + a number of records to show using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch. + So to display the first 100 records do: +\end_layout + +\begin_layout LyX-Code +... + | head_records --num=100 +\end_layout + +\begin_layout Subsection +How to select random records? +\begin_inset LatexCommand label +name "sub:How-to-select-random-records" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you want to inspect a number of random records from the stream this can + be done with the +\series bold +random_records +\series default + biopiece. + So if you have 1 mio records in the stream and you want to select 1000 + random records do: +\end_layout + +\begin_layout LyX-Code +... + | random_records --num=1000 +\end_layout + +\begin_layout Subsection +How to count all records in the data stream? +\end_layout + +\begin_layout Standard +To count all the records in the data stream use +\series bold +count_records +\series default +, which adds one record (which is not included in the count) to the data + stream. + So to count the number of sequences in a FASTA file you can do this: +\end_layout + +\begin_layout LyX-Code +cat test.fna | read_fasta | count_records --no_stream +\end_layout + +\begin_layout Standard +Which will write the last record containing the count to 'stdout': +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_records: 630 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout Standard +It is also possible to write the count to file using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch. +\end_layout + +\begin_layout Subsection +How to get the length of record values? +\begin_inset LatexCommand label +name "sub:How-to-get-value_length" + +\end_inset + + +\end_layout + +\begin_layout Standard +Use the +\series bold +length_vals +\series default + biopiece to get the length of each value for a comma separated list of + keys: +\end_layout + +\begin_layout LyX-Code +... + | length_vals --keys=HIT,PATTERN +\end_layout + +\begin_layout Subsection +How to grab specific records? +\begin_inset LatexCommand label +name "sub:How-to-grab" + +\end_inset + + +\end_layout + +\begin_layout Standard +The biopiece +\series bold +grab +\series default + is related to the Unix grep and locates records based on matching keys + and/or values using either a pattern, a Perl regex, or a numerical evaluation. + To easily +\series bold +grab +\series default + all records in the stream that has any mentioning of the pattern 'human' + just pipe the data stream through +\series bold +grab +\series default + like this: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human +\end_layout + +\begin_layout Standard +This will search for the pattern 'human' in all keys and all values. + The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch takes a comma separated list of patterns, so in order to + match multiple patterns do: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human,mouse +\end_layout + +\begin_layout Standard +It is also possible to use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in switch instead of +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern. + +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in is used to read a file with one pattern per line: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern_in=patterns.txt +\end_layout + +\begin_layout Standard +If you want the opposite result --- to find all records that does not match + the patterns, add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +invert switch, which not only works with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch, but also with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +regex and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +eval: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human --invert +\end_layout + +\begin_layout Standard +If you want to search the record keys only, +\emph on +e.g. + +\emph default + to find all records containing the key SEQ you can add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys_only switch. + This will prevent matching of SEQ in any record value, and in fact SEQ + is a not uncommon peptide sequence you could get an unwanted record. + Also, this will give an increase in speed since only the keys are searched: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=SEQ --keys_only +\end_layout + +\begin_layout Standard +However, if you are interested in finding the peptide sequence SEQ and not + the SEQ key, just add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +vals_only switch instead: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=SEQ --vals_only +\end_layout + +\begin_layout Standard +Also, if you want to grab for certain key/value pairs you can supply a comma + separated list of keys whos values will then be searched using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch. + This is handy if your records contain large genomic sequences and you dont + want to search the entire sequence for +\emph on +e.g. + +\emph default + the organism name --- it is much faster to tell +\series bold +grab +\series default + which keys to search the value for: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human --keys=SEQ_NAME +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Standard +It is also possible to invoke flexible matching using regex (regular expressions +) instead of simple pattern matching. + In +\series bold +grab +\series default + the regex engine is Perl based and allows use of different type of wild + cards, alternatives, +\emph on +etc +\emph default + +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://perldoc.perl.org/perlreref.html" + +\end_inset + + +\end_layout + +\end_inset + +. + If you want to +\series bold +grab +\series default + records withs the sequence ATCG or GCTA you can do this: +\end_layout + +\begin_layout LyX-Code +... + | grab --regex='ATCG|GCTA' +\end_layout + +\begin_layout Standard +Or if you want to find sequences beginning with ATCG: +\end_layout + +\begin_layout LyX-Code +... + | grab --regex='^ATCG' +\end_layout + +\begin_layout Standard +You can also use +\series bold +grab +\series default + to locate records that fulfill a numerical property using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +eval switch witch takes an expression in three parts. + The first part is the key that holds the value we want to evaluate, the + second part holds one the six operators: +\end_layout + +\begin_layout Enumerate +Greater than: > +\end_layout + +\begin_layout Enumerate +Greater than or equal to: >= +\end_layout + +\begin_layout Enumerate +Less than: < +\end_layout + +\begin_layout Enumerate +Less than or equal to: <= +\end_layout + +\begin_layout Enumerate +Equal to: = +\end_layout + +\begin_layout Enumerate +Not equal to: != +\end_layout + +\begin_layout Enumerate +String wise equal to: eq +\end_layout + +\begin_layout Enumerate +String wise not equal to: ne +\end_layout + +\begin_layout Standard +And finally comes the number used in the evaluation. + So to +\series bold +grab +\series default + all records with a sequence length greater than 30: +\end_layout + +\begin_layout LyX-Code +... + length_seq | grab --eval='SEQ_LEN > 30' +\end_layout + +\begin_layout Standard +If you want to locate all records containing the pattern 'human' and where + the sequence length is greater that 30, you do this by running the stream + through +\series bold +grab +\series default + twice: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern='human' | length_seq | grab --eval='SEQ_LEN > 30' +\end_layout + +\begin_layout Standard +Finally, it is possible to do fast matching of expressions from a file using + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact switch. + Each of these expressions has to be matched exactly over the entrie length, + which if useful if you have a file with accession numbers, that you want + to locate in the stream: +\end_layout + +\begin_layout LyX-Code +... + | grab --exact acc_no.txt | ... +\end_layout + +\begin_layout Standard +Using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact is much faster than using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in, because with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact the expression has to be complete matches, where +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in looks for subpatterns. +\end_layout + +\begin_layout Standard +NB! To get the best speed performance, use the most restrictive +\series bold +grab +\series default + first. +\end_layout + +\begin_layout Subsection +How to remove keys from records? +\end_layout + +\begin_layout Standard +To remove one or more specific keys from all records in the data stream + use +\series bold +remove_keys +\series default + like this: +\end_layout + +\begin_layout LyX-Code +... + | remove_keys --keys=SEQ,SEQ_NAME +\end_layout + +\begin_layout Standard +In the above example SEQ and SEQ_NAME will be removed from all records if + they exists in these. + If all keys are removed from a record, then the record will be removed. +\end_layout + +\begin_layout Subsection +How to rename keys in records? +\end_layout + +\begin_layout Standard +Sometimes you want to rename a record key, +\emph on +e.g. + +\emph default + if you have read in a two column table with sequence name and sequence + in each column (see +\begin_inset LatexCommand ref +reference "sub:How-to-read-table" + +\end_inset + +) without specifying the key names, then the sequence name will be called + V0 and the sequence V1 as default in the +\series bold +read_tab +\series default + biopiece. + To rename the V0 and V1 keys we need to run the stream through +\series bold +rename_keys +\series default + twice (one for each key to rename): +\end_layout + +\begin_layout LyX-Code +... + | rename_keys --keys=V0,SEQ_NAME | rename_keys --keys=V1,SEQ +\end_layout + +\begin_layout Standard +The first instance of +\series bold +rename_keys +\series default + replaces all the V0 keys with SEQ_NAME, and the second instance of +\series bold +rename_keys +\series default + replaces all the V1 keys with SEQ. + +\emph on +Et viola +\emph default + the data can now be used in the biopieces that requires these keys. +\end_layout + +\begin_layout Section +Manipulating Sequences +\end_layout + +\begin_layout Subsection +How to get sequence lengths? +\end_layout + +\begin_layout Standard +The length for sequences in records can be determined with +\series bold +length_seq +\series default +, which adds the key SEQ_LEN to each record with the sequence length as + the value. + It also generates an extra record that is emitted last with the key TOTAL_SEQ_L +EN showing the total length of all the sequences. +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_seq +\end_layout + +\begin_layout Standard +It is also possible to determine the sequence length using the generic tool + +\series bold +length_vals +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-get-value_length" + +\end_inset + +, which determines the length of the values for a given list of keys: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_vals --keys=SEQ +\end_layout + +\begin_layout Standard +To obtain the total length of all sequences use +\series bold +sum_vals +\series default + like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_vals --keys=SEQ +\end_layout + +\begin_layout LyX-Code +| sum_vals --keys=SEQ_LEN +\end_layout + +\begin_layout Standard +The biopiece +\series bold +analyze_seq +\series default + will also determine the length of each sequence (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-analyze" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to analyze sequence composition? +\begin_inset LatexCommand label +name "sub:How-to-analyze" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you want to find out the sequence type, composition, length, as well + as GC content, indel content and proportions of soft and hard masked sequence, + then use +\series bold +analyze_seq +\series default +. + This handy biopiece will determine all these things per sequence from which + it is easy to get an overview using the +\series bold +write_tab +\series default + biopiece to output a table (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +). + So in order to determine the sequence composition of a FASTA file with + just one entry containing the sequence 'ATCG' we just read the data with + +\series bold +read_fasta +\series default + and run the output through +\series bold +analyze_seq +\series default + which will add the analysis to the record like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq ... +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:D: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +MIX_INDEX: 0.55 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:W: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:G: 16 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SOFT_MASK%: 63.75 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:B: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:V: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +HARD_MASK%: 0.00 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:H: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:S: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:N: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:.: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +GC%: 35.00 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:A: 8 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:Y: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:M: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:T: 44 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ_TYPE: DNA +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:K: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:~: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ: TTTCAGTTTGGGACGGAGTAAGGCCTTCCtttttttttttttttttttttttttttttgagaccgagtcttgctc +tgtcg +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ_LEN: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +80 RES:R: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:C: 12 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:-: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:U: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Standard +Now to make a table of how may As, Ts, Cs, and Gs you can add the following: +\end_layout + +\begin_layout LyX-Code +... + | analyze_seq | write_tab --keys=RES:A,RES:T,RES:C,RES:G +\end_layout + +\begin_layout Standard +Or if you want to see the proportions of hard and soft masked sequence: +\end_layout + +\begin_layout LyX-Code +... + | analyse_seq | write_tab --keys=HARD_MASK%,SOFT_MASK% +\end_layout + +\begin_layout Standard +If you have a stack of sequences in one file and you want to determine the + mean GC content you can do it using the +\series bold +mean_vals +\series default + biopiece: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | mean_vals --keys=GC% +\end_layout + +\begin_layout Standard +Or if you want the total count of Ns you can use +\series bold +sum_vals +\series default + like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | sum_vals --keys=RES:N +\end_layout + +\begin_layout Standard +The MIX_INDEX key is calculated as the count of the most common residue + over the sequence length, and can be used as a cut-off for removing sequence + tags consisting of mostly one nucleotide: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | grab --eval='MIX_INDEX<0.85' +\end_layout + +\begin_layout Subsection +How to extract subsequences? +\begin_inset LatexCommand label +name "sub:How-to-extract" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to extract a subsequence from a longer sequence use the biopiece + extract_seq, which will replace the sequence in the record with the subsequence + (this behaviour should probably be modified to be dependant of a +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +replace or a +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_replace switch +\begin_inset Note Note +status collapsed + +\begin_layout Standard +also in split_seq +\end_layout + +\end_inset + +). + So to extract the first 20 residues from all sequences do (first residue + is designated 1): +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=1 --len=20 +\end_layout + +\begin_layout Standard +You can also specify a begin and end coordinate set: +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=20 --end=40 +\end_layout + +\begin_layout Standard +If you want the subsequences from position 20 to the sequence end do: +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=20 +\end_layout + +\begin_layout Standard +If you want to extract subsequences a given distance from the sequence end + you can do this by reversing the sequence with the biopiece +\series bold +reverse_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-reverse-seq" + +\end_inset + +, followed by +\series bold +extract_seq +\series default + to get the subsequence, and then +\series bold +reverse_seq +\series default + again to get the subsequence back in the original orientation: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | reverse_seq +\end_layout + +\begin_layout LyX-Code +| extract_seq --beg=10 --len=10 | reverse_seq +\end_layout + +\begin_layout Subsection +How to get genomic sequence? +\begin_inset LatexCommand label +name "sub:How-to-get-genomic-sequence" + +\end_inset + + +\end_layout + +\begin_layout Standard +The biopiece +\series bold +get_genomic_seq +\series default + can extract subsequences for a given genome specified with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch explicitly using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +beg and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +end/ +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +len switches: +\end_layout + +\begin_layout LyX-Code +get_genome_seq --genome= --beg=1 --len=100 +\end_layout + +\begin_layout Standard +Alternatively, +\series bold +get_genome_seq +\series default + can be used to append the corresponding sequence to BED, PSL, and BLAST + records: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | get_genome_seq --genome= +\end_layout + +\begin_layout Standard +It is also possible to include flaking sequence using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +flank switch. + So to include 50 nucleotides upstream and 50 nucleotides downstream for + each BED entry do: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | get_genome_seq --genome= --flank=50 +\end_layout + +\begin_layout Subsection +How to upper-case sequences? +\end_layout + +\begin_layout Standard +Sequences can be shifted from lower case to upper case using +\series bold +uppercase_seq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | uppercase_seq +\end_layout + +\begin_layout Subsection +How to reverse sequences? +\begin_inset LatexCommand label +name "sub:How-to-reverse-seq" + +\end_inset + + +\end_layout + +\begin_layout Standard +The order of residues in a sequence can be reversed using reverse_seq: +\end_layout + +\begin_layout LyX-Code +... + | reverse_seq +\end_layout + +\begin_layout Standard +Note that in order to reverse/complement a sequence you also need the +\series bold +complement_seq +\series default + biopiece (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-complement" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to complement sequences? +\begin_inset LatexCommand label +name "sub:How-to-complement" + +\end_inset + + +\end_layout + +\begin_layout Standard +DNA and RNA sequences can be complemented with +\series bold +complement_seq +\series default +, which automagically determines the sequence type: +\end_layout + +\begin_layout LyX-Code +... + | complement_seq +\end_layout + +\begin_layout Standard +Note that in order to reverse/complement a sequence you also need the +\series bold +reverse_seq +\series default + biopiece (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-reverse-seq" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to remove indels from sequnces? +\end_layout + +\begin_layout Standard +Indels can be removed from sequences with the +\series bold +remove_indels +\series default + biopiece. + This is useful if you have aligned some sequences (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-align" + +\end_inset + +) and extracted (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-extract" + +\end_inset + +) a block of subsequences from the alignment and you want to use these sequence + in a search where you need to remove the indels first. + '-', '~', and '.' are considered indels: +\end_layout + +\begin_layout LyX-Code +... + | remove_indels +\end_layout + +\begin_layout Subsection +How to shuffle sequences? +\end_layout + +\begin_layout Standard +All residues in sequences in the stream can be shuffled to random positions + using the +\series bold +shuffle_seq +\series default + biopiece: +\end_layout + +\begin_layout LyX-Code +... + | shuffle_seq +\end_layout + +\begin_layout Subsection +How to split sequences into overlapping subsequences? +\end_layout + +\begin_layout Standard +Sequences can be slit into overlapping subsequences with the +\series bold +split_seq +\series default + biopiece. +\end_layout + +\begin_layout LyX-Code +... + | split_seq --word_size=20 --uniq +\end_layout + +\begin_layout Subsection +How to determine the oligo frequency? +\end_layout + +\begin_layout Standard +In order to determine if any oligo usage is over represented in one or more + sequences you can determine the frequency of oligos of a given size with + +\series bold +oligo_freq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 +\end_layout + +\begin_layout Standard +And if you have more than one sequence and want to accumulate the frequences + you need the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +all switch: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 --all +\end_layout + +\begin_layout Standard +To get a meaningful result you need to write the resulting frequencies as + a table with +\series bold +write_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +), but first it is important to +\series bold +grab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +) the records with the frequencies to avoid full length sequences in the + table: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 --all | grab --pattern=OLIGO --keys_only +\end_layout + +\begin_layout LyX-Code +| write_tab --no_stream +\end_layout + +\begin_layout Standard +And the resulting frequency table can be sorted with Unix sort (man sort). +\end_layout + +\begin_layout Subsection +How to search for sequences in genomes? +\end_layout + +\begin_layout Standard +See the following biopiece: +\end_layout + +\begin_layout Itemize + +\series bold +patscan_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +blat_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +blast_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +vmatch_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to search sequences for a pattern? +\begin_inset LatexCommand label +name "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Standard +It is possible to search sequences in the data stream for patterns using + the +\series bold +patscan_seq +\series default + biopiece which utilizes the powerful scan_for_matches engine. + Consult the documentation for scan_for_matches in order to learn how to + define patterns (the documentation is included in Appendix\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sec:scan_for_matches-README" + +\end_inset + +). +\end_layout + +\begin_layout Standard +To search all sequences for a simple pattern consisting of the sequence + ATCGATCG allowing for 3 mismatches, 2 insertions and 1 deletion: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | patscan_seq --pattern='ATCGATCG[3,2,1]' +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch takes a comma seperated list of patterns, so if you want + to search for more that one pattern do: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern='ATCGATCG[3,2,1],GCTAGCTA[3,2,1]' +\end_layout + +\begin_layout Standard +It is also possible to have a list of different patterns to search for in + a file with one pattern per line. + In order to get +\series bold +patscan_seq +\series default + to read these patterns use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern_in= +\end_layout + +\begin_layout Standard +To also scan the complementary strand in nucleotide sequences ( +\series bold +patscan_seq +\series default + automagically determines the sequence type) you need to add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comp switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern= --comp +\end_layout + +\begin_layout Standard +It is also possible to use +\series bold +patscan_seq +\series default + to output those records that does not contain a certain pattern by using + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +invert switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern= --invert +\end_layout + +\begin_layout Standard +Finally, +\series bold +patscan_seq +\series default + can also scan for patterns in a given genome sequence, instead of sequences + in the stream, using the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch: +\end_layout + +\begin_layout LyX-Code +patscan --pattern= --genome= +\end_layout + +\begin_layout Subsection +How to use BLAT for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Standard +Sequences in the data stream can be matched against supported genomes using + +\series bold +blat_seq +\series default + which is a biopiece using BLAT as the name might suggest. + Currently only Mouse and Human genomes are available and it is not possible + to use OOC files since there is still a need for a local repository for + genome files. + Otherwise it is just: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | blat_seq --genome= +\end_layout + +\begin_layout Standard +The search results can then be written to file with +\series bold +write_psl +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-PSL" + +\end_inset + +) or +\series bold +write_bed +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-BED" + +\end_inset + +) allthough with +\series bold +write_bed +\series default + some information will be lost). + It is also possible to plot chromosome distribution of the search results + using +\series bold +plot_chrdist +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-chrdist" + +\end_inset + +) or the distribution of the match lengths using +\series bold +plot_lendist +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-lendist" + +\end_inset + +) or a karyogram with the hits using +\series bold +plot_karyogram +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-karyogram" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to use BLAST for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Standard +Two biopieces exist for blasting sequences: +\series bold +create_blast_db +\series default + is used to create the BLAST database required for BLAST which is queried + using the biopiece +\series bold +blast_seq +\series default +. + So in order to create a BLAST database from sequences in the data stream + you simple run: +\end_layout + +\begin_layout LyX-Code +... + | create_blast_db --database=my_database --no_stream +\end_layout + +\begin_layout Standard +The type of sequence to use for the database is automagically determined + by +\series bold +create_blast_db +\series default +, but don't have a mixture of peptide and nucleic acids sequences in the + stream. + The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database switch takes a path as argument, but will default to 'blastdb_ if not set. +\end_layout + +\begin_layout Standard +The resulting database can now be queried with sequences in another data + stream using +\series bold +blast_seq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | blast_seq --database=my_database +\end_layout + +\begin_layout Standard +Again, the sequence type is determined automagically and the appropriate + BLAST program is guessed (see below table), however, the program name can + be overruled with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +program switch. +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Standard +Subject sequence +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Query sequence +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Program guess +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastn +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastp +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastx +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +tblastn +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +Finally, it is also possible to use +\series bold +blast_seq +\series default + for blasting sequences agains a preformatted genome using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch instead of the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database switch: +\end_layout + +\begin_layout LyX-Code +... + | blast_seq --genome= +\end_layout + +\begin_layout Subsection +How to use Vmatch for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Standard +The powerful suffix array software package Vmatch +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.vmatch.de/" + +\end_inset + + +\end_layout + +\end_inset + + can be used for exact mapping of sequences against indexed genomes using + the biopiece +\series bold +vmatch_seq +\series default +, which will e.g. + map 700000 ESTs to the human genome locating all 160 mio hits in less than + an hour. + Only nucleotide sequences and sequences longer than 11 nucleotides will + be mapped. + It is recommended that sequences consisting of mostly one nucleotide type + are removed. + This can be done with the +\series bold +analyze_seq +\series default + biopiece +\begin_inset LatexCommand eqref +reference "sub:How-to-analyze" + +\end_inset + +. +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= +\end_layout + +\begin_layout Standard +It is also possible to allow for mismatches using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +hamming_dist switch. + So to allow for 2 mismatches: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=2 +\end_layout + +\begin_layout Standard +Or to allow for 10% mismathing nucleotides: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=10p +\end_layout + +\begin_layout Standard +To allow both indels and mismatches use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +edit_dist switch. + So to allow for one mismatch or one indel: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=1 +\end_layout + +\begin_layout Standard +Or to allow for 5% indels or mismatches: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=5p +\end_layout + +\begin_layout Standard +Note that using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +hamming_dist or +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +edit_dist greatly slows down vmatch considerably --- use with care. +\end_layout + +\begin_layout Standard +The resulting SCORE key can be replaced to hold the number of genome matches + of a given sequence (multi-mappers) is the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +count switch is given. +\end_layout + +\begin_layout Subsection +How to find all matches between sequences? +\begin_inset LatexCommand label +name "sub:How-to-find-matches" + +\end_inset + + +\end_layout + +\begin_layout Standard +All matches between two sequences can be determined with the biopiece +\series bold +match_seq +\series default +. + The match finding engine underneath the hood of +\series bold +match_seq +\series default + is the super fast suffix tree program MUMmer +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://mummer.sourceforge.net/" + +\end_inset + + +\end_layout + +\end_inset + +, which will locate all forward and reverse matches between huge sequences + in a matter of minutes (if the repeat count is not too high and if the + word size used is appropriate). + Matching two +\emph on +Helicobacter pylori +\emph default + genomes (1.7Mbp) takes around 10 seconds: +\end_layout + +\begin_layout LyX-Code +... + | match_seq --word_size=20 --direction=both +\end_layout + +\begin_layout Standard +The output from +\series bold +match_seq +\series default + can be used to generate a dot plot with +\series bold +plot_matches +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-generate-dotplot" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to align sequences? +\begin_inset LatexCommand label +name "sub:How-to-align" + +\end_inset + + +\end_layout + +\begin_layout Standard +Sequences in the stream can be aligned with the +\series bold +align_seq +\series default + biopiece that uses Muscle +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.drive5.com/muscle/muscle.html" + +\end_inset + + +\end_layout + +\end_inset + + as aligment engine. + Currently you cannot change any of the Muscle alignment parameters and + +\series bold +align_seq +\series default + will create an alignment based on the defaults (which are really good!): +\end_layout + +\begin_layout LyX-Code +... + | align_seq +\end_layout + +\begin_layout Standard +The aligned output can be written to file in FASTA format using +\series bold +write_fasta +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-fasta" + +\end_inset + +) or in pretty text using +\series bold +write_align +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-alignment" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to create a weight matrix? +\end_layout + +\begin_layout Standard +If you want a weight matrix to show the sequence composition of a stack + of sequences you can use the biopiece create_weight_matrix: +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix +\end_layout + +\begin_layout Standard +The result can be output in percent using the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +percent switch: +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix --percent +\end_layout + +\begin_layout Standard +The weight matrix can be written as tabular output with +\series bold +write_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +) after removeing the records containing SEQ with +\series bold +grab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +): +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix | grab --invert --keys=SEQ --keys_only +\end_layout + +\begin_layout LyX-Code +| write_tab --no_stream +\end_layout + +\begin_layout Standard +The V0 column will hold the residue, while the rest of the columns will + hold the frequencies for each sequence position. +\end_layout + +\begin_layout Section +Plotting +\end_layout + +\begin_layout Standard +There exists several biopieces for plotting. + Some of these are based on GNUplot +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.gnuplot.info/" + +\end_inset + + +\end_layout + +\end_inset + +, which is an extremely powerful platform to generate all sorts of plots + and even though GNUplot has quite a steep learning curve, the biopieces + utilizing GNUplot are simple to use. + GNUplot is able to output a lot of different formats (called terminals + in GNUplot), but the biopieces focusses on three formats only: +\end_layout + +\begin_layout Enumerate +The 'dumb' terminal is default to the GNUplot based biopieces and will output + a plot in crude ASCII text (Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dumb-terminal" + +\end_inset + +). + This is quite nice for a quick and dirty plot to get an overview of your + data . +\end_layout + +\begin_layout Enumerate +The 'post' or 'postscript' terminal output postscript code which is publication + grade graphics that can be viewed with applications such as Ghostview, + Photoshop, and Preview. +\end_layout + +\begin_layout Enumerate +The 'svg' terminal output's scalable vector graphics (SVG) which is a vector + based format. + SVG is great because you can edit the resulting plot using Photoshop or + Inkscape +\begin_inset Foot +status collapsed + +\begin_layout Standard +Inkscape is a really handy drawing program that is free and open source. + Availble at +\begin_inset LatexCommand htmlurl +target "http://www.inkscape.org" + +\end_inset + + +\end_layout + +\end_inset + + if you want to add additional labels, captions, arrows, and so on and then + save the result in different formats, such as postscript without loosing + resolution. +\end_layout + +\begin_layout Standard +The biopieces for plotting that are not based on GNUplot only output SVG + (that may change in the future). +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename lendist_ascii.png + lyxscale 70 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Dumb-terminal" + +\end_inset + +Dumb terminal +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +The output of a length distribution plot in the default 'dumb terminal' + to the terminal window. + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a histogram? +\begin_inset LatexCommand label +name "How-to-plot-histogram" + +\end_inset + + +\end_layout + +\begin_layout Standard +A generic histogram for a given value can be plotted with the biopiece +\series bold +plot_histogram +\series default + (Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Histogram" + +\end_inset + +): +\end_layout + +\begin_layout LyX-Code +... + | plot_histogram --key=TISSUE --no_stream +\end_layout + +\begin_layout Standard +(Figure missing) +\end_layout + +\begin_layout Standard +\noindent +\align left +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename histogram.png + lyxscale 70 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Histogram" + +\end_inset + +Histogram +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a length distribution? +\begin_inset LatexCommand label +name "sub:How-to-plot-lendist" + +\end_inset + + +\end_layout + +\begin_layout Standard +Plotting of length distributions, weather sequence lengths, patterns lengths, + hit lengths, +\emph on +etc. + +\emph default + is a really handy thing and can be done with the the biopiece +\series bold +plot_lendist +\series default +. + If you have a file with FASTA entries and want to plot the length distribution + you do it like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_seq +\end_layout + +\begin_layout LyX-Code +| plot_lendist --key=SEQ_LEN --no_stream +\end_layout + +\begin_layout Standard +The result will be written to the default dumb terminal and will look like + Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dumb-terminal" + +\end_inset + +. +\end_layout + +\begin_layout Standard +If you instead want the result in postscript format you can do: +\end_layout + +\begin_layout LyX-Code +... + | plot_lendist --key=SEQ_LEN --terminal=post --result_out=file.ps +\end_layout + +\begin_layout Standard +That will generate the plot and save it to file, but not interrupt the data + stream which can then be used in further analysis. + You can also save the plot implicetly using '>', however, it is then important + to terminate the stream with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch: +\end_layout + +\begin_layout LyX-Code +... + | plot_lendist --key=SEQ_LEN --terminal=post --no_stream > file.ps +\end_layout + +\begin_layout Standard +The resulting plot can be seen in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Length-distribution" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename lendist.ps + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Length-distribution" + +\end_inset + +Length distribution +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Length distribution of 630 piRNA like RNAs. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a chromosome distribution? +\begin_inset LatexCommand label +name "sub:How-to-plot-chrdist" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you have the result of a sequence search against a multi chromosome genome, + it is very practical to be able to plot the distribution of search hits + on the different chromosomes. + This can be done with +\series bold +plot_chrdist +\series default +: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | blat_genome | plot_chrdist --no_stream +\end_layout + +\begin_layout Standard +The above example will result in a crude plot using the 'dumb' terminal, + and if you want to mess around with the results from the BLAT search you + probably want to save the result to file first (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-PSL" + +\end_inset + +). + To plot the chromosome distribution from the saved search result you can + do: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in=file.bed | plot_chrdist --terminal=post --result_out=plot.ps +\end_layout + +\begin_layout Standard +That will result in the output show in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Chromosome-distribution" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename chrdist.ps + lyxscale 50 + width 12cm + rotateAngle 90 + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Chromosome-distribution" + +\end_inset + +Chromosome distribution +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to generate a dotplot? +\begin_inset LatexCommand label +name "sub:How-to-generate-dotplot" + +\end_inset + + +\end_layout + +\begin_layout Standard +A dotplot is a powerful way to get an overview of the size and location + of sequence insertions, deletions, and duplications between two sequences. + Generating a dotplot with biopieces is a two step process where you initially + find all matches between two sequences using the tool +\series bold +match_seq +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-find-matches" + +\end_inset + +) and plot the resulting matches with +\series bold +plot_matches +\series default +. + Matching and plotting two +\emph on +Helicobacter pylori +\emph default + genomes (1.7Mbp) takes around 10 seconds: +\end_layout + +\begin_layout LyX-Code +... + | match_seq | plot_matches --terminal=post --result_out=plot.ps +\end_layout + +\begin_layout Standard +The resulting dotplot is in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dotplot" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename dotplot.ps + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Dotplot" + +\end_inset + +Dotplot +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Forward matches are displayed in green while reverse matches are displayed + in red. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a sequence logo? +\end_layout + +\begin_layout Standard +Sequence logos can be generate with +\series bold +plot_seqlogo +\series default +. + The sequnce type is determined automagically and an entropy scale of 2 + bits and 4 bits is used for nucleotide and peptide sequences, respectively +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand htmlurl +target "http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html" + +\end_inset + + +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout LyX-Code +... + | plot_seqlogo --no_stream --result_out=seqlogo.svg +\end_layout + +\begin_layout Standard +An example of a sequence logo can be seen in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Sequence-logo" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename seqlogo.png + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Sequence-logo" + +\end_inset + +Sequence logo +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a karyogram? +\begin_inset LatexCommand label +name "sub:How-to-plot-karyogram" + +\end_inset + + +\end_layout + +\begin_layout Standard +To plot search hits on genomes use +\series bold +plot_karyogram +\series default +, which will output a nice karyogram in SVG graphics: +\end_layout + +\begin_layout LyX-Code +... + | plot_karyogram --result_out=karyogram.svg +\end_layout + +\begin_layout Standard +The banding data is taken from the UCSC genome browser database and currently + only Human and Mouse is supported. + Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Karyogram" + +\end_inset + + shows the distribution of piRNA like RNAs matched to the Human genome. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename karyogram.png + lyxscale 35 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Karyogram" + +\end_inset + +Karyogram +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Hits from a search of piRNA like RNAs in the Human genome is displayed as + short horizontal bars. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +Uploading Results +\end_layout + +\begin_layout Subsection +How do I display my results in the UCSC Genome Browser? +\end_layout + +\begin_layout Standard +Results from the list of biopieces below can be uploaded directly to a local + mirror of the UCSC Genome Browser using the biopiece +\series bold +upload_to_ucsc +\series default +: +\end_layout + +\begin_layout Itemize +patscan_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Itemize +blat_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Itemize +blast_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Itemize +vmatch_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Standard +The syntax for uploading data the most simple way requires two mandatory + switches: +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database, which is the UCSC database name (such as hg18, mm9, etc.) and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +table which should be the users initials followed by an underscore and a + short description of the data: +\end_layout + +\begin_layout LyX-Code +... + | upload_to_ucsc --database=hg18 --table=mah_snoRNAs +\end_layout + +\begin_layout Standard +The +\series bold +upload_to_ucsc +\series default + biopiece modifies the users ~/ucsc/my_tracks.ra file automagically (a backup + is created with the name ~/ucsc/my_tracks.ra~) with default values that + can be overridden using the following switches: +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +short_label - Short label for track - Default=database->table +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +long_label - Long label for track - Default=database->table +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +group - Track group name - Default= +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +priority - Track display priority - Default=1 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +color - Track color - Default=147,73,42 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +chunk_size - Chunks for loading - Default=10000000 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +visibility - Track visibility - Default=pack +\end_layout + +\begin_layout Standard +Also, data in BED or PSL format can be uploaded with +\series bold +upload_to_ucsc +\series default + as long as these reference to genomes and chromosomes existing in the UCSC + Genome Browser: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | upload_to_ucsc ... +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +read_psl --data_in= | upload_to_ucsc ... +\end_layout + +\begin_layout Section +Power Scripting +\end_layout + +\begin_layout Standard +It is possible to do commandline scripting of biopiece records using Perl. + Because a biopiece record essentially is a hash structure, you can pass + records to +\series bold +bioscript +\series default + command, which is a wrapper around the Perl executable that allows direct + manipulations of the records using the power of Perl. +\end_layout + +\begin_layout Standard +In the below example we replace in all records the value to the CHR key + with a forthrunning number: +\end_layout + +\begin_layout LyX-Code +... + | bioscript 'while($r=get_record( +\backslash +*STDIN)){$r->{CHR}=$i++; put_record($r)}' +\end_layout + +\begin_layout Standard +Something more useful would probably be to create custom FASTA headers. + E.g. + if we read in a BED file, lookup the genomic sequence, create a custom + FASTA header with +\series bold +bioscript +\series default + and output FASTA entries: +\end_layout + +\begin_layout LyX-Code +... + | bioscript 'while($r=get_record( +\backslash +*STDIN)){$r->{SEQ_NAME}= // +\end_layout + +\begin_layout LyX-Code +join("_",$r->{CHR},$r->{CHR_BEG},$r->{CHR_END}); put_record($r)}' +\end_layout + +\begin_layout Standard +And the output: +\end_layout + +\begin_layout LyX-Code +>chr2L_21567527_21567550 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_693380_693403 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_13859534_13859557 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_9005090_9005113 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_2106825_2106848 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_14649031_14649054 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout Section +Trouble shooting +\end_layout + +\begin_layout Standard +Shoot the messenger! +\end_layout + +\begin_layout Section +\start_of_appendix +Keys +\begin_inset LatexCommand label +name "sec:Keys" + +\end_inset + + +\end_layout + +\begin_layout Standard +HIT +\end_layout + +\begin_layout Standard +HIT_BEG +\end_layout + +\begin_layout Standard +HIT_END +\end_layout + +\begin_layout Standard +HIT_LEN +\end_layout + +\begin_layout Standard +HIT_NAME +\end_layout + +\begin_layout Standard +PATTERN +\end_layout + +\begin_layout Section +Switches +\begin_inset LatexCommand label +name "sec:Switches" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_in +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +data_in +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num +\end_layout + +\begin_layout Section +scan_for_matches README +\begin_inset LatexCommand label +name "sec:scan_for_matches-README" + +\end_inset + + +\end_layout + +\begin_layout LyX-Code + scan_for_matches: +\end_layout + +\begin_layout LyX-Code + A Program to Scan Nucleotide or Protein Sequences for Matching Patterns +\end_layout + +\begin_layout LyX-Code + Ross Overbeek +\end_layout + +\begin_layout LyX-Code + MCS +\end_layout + +\begin_layout LyX-Code + Argonne National Laboratory +\end_layout + +\begin_layout LyX-Code + Argonne, IL 60439 +\end_layout + +\begin_layout LyX-Code + USA +\end_layout + +\begin_layout LyX-Code +Scan_for_matches is a utility that we have written to search for +\end_layout + +\begin_layout LyX-Code +patterns in DNA and protein sequences. + I wrote most of the code, +\end_layout + +\begin_layout LyX-Code +although David Joerg and Morgan Price wrote sections of an +\end_layout + +\begin_layout LyX-Code +earlier version. + The whole notion of pattern matching has a rich +\end_layout + +\begin_layout LyX-Code +history, and we borrowed liberally from many sources. + However, it is +\end_layout + +\begin_layout LyX-Code +worth noting that we were strongly influenced by the elegant tools +\end_layout + +\begin_layout LyX-Code +developed and distributed by David Searls. + My intent is to make the +\end_layout + +\begin_layout LyX-Code +existing tool available to anyone in the research community that might +\end_layout + +\begin_layout LyX-Code +find it useful. + I will continue to try to fix bugs and make suggested +\end_layout + +\begin_layout LyX-Code +enhancements, at least until I feel that a superior tool exists. +\end_layout + +\begin_layout LyX-Code +Hence, I would appreciate it if all bug reports and suggestions are +\end_layout + +\begin_layout LyX-Code +directed to me at Overbeek@mcs.anl.gov. + +\end_layout + +\begin_layout LyX-Code +I will try to log all bug fixes and report them to users that send me +\end_layout + +\begin_layout LyX-Code +their email addresses. + I do not require that you give me your name +\end_layout + +\begin_layout LyX-Code +and address. + However, if you do give it to me, I will try to notify +\end_layout + +\begin_layout LyX-Code +you of serious problems as they are discovered. +\end_layout + +\begin_layout LyX-Code +Getting Started: +\end_layout + +\begin_layout LyX-Code + The distribution should contain at least the following programs: +\end_layout + +\begin_layout LyX-Code + README - This document +\end_layout + +\begin_layout LyX-Code + ggpunit.c - One of the two source files +\end_layout + +\begin_layout LyX-Code + scan_for_matches.c - The second source file +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + run_tests - A perl script to test things +\end_layout + +\begin_layout LyX-Code + show_hits - A handy perl script +\end_layout + +\begin_layout LyX-Code + test_dna_input - Test sequences for DNA +\end_layout + +\begin_layout LyX-Code + test_dna_patterns - Test patterns for DNA scan +\end_layout + +\begin_layout LyX-Code + test_output - Desired output from test +\end_layout + +\begin_layout LyX-Code + test_prot_input - Test protein sequences +\end_layout + +\begin_layout LyX-Code + test_prot_patterns - Test patterns for proteins +\end_layout + +\begin_layout LyX-Code + testit - a perl script used for test +\end_layout + +\begin_layout LyX-Code + Only the first three files are required. + The others are useful, +\end_layout + +\begin_layout LyX-Code + but only if you have Perl installed on your system. + If you do +\end_layout + +\begin_layout LyX-Code + have Perl, I suggest that you type +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + which perl +\end_layout + +\begin_layout LyX-Code + to find out where it installed. + On my system, I get the following +\end_layout + +\begin_layout LyX-Code + response: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + clone% which perl +\end_layout + +\begin_layout LyX-Code + /usr/local/bin/perl +\end_layout + +\begin_layout LyX-Code + indicating that Perl is installed in /usr/local/bin. + Anyway, once +\end_layout + +\begin_layout LyX-Code + you know where it is installed, edit the first line of files +\end_layout + +\begin_layout LyX-Code + testit +\end_layout + +\begin_layout LyX-Code + show_hits +\end_layout + +\begin_layout LyX-Code + replacing /usr/local/bin/perl with the appropriate location. + I +\end_layout + +\begin_layout LyX-Code + will assume that you can do this, although it is not critical (it +\end_layout + +\begin_layout LyX-Code + is needed only to test the installation and to use the "show_hits" +\end_layout + +\begin_layout LyX-Code + utility). + Perl is not required to actually install and run +\end_layout + +\begin_layout LyX-Code + scan_for_matches. + +\end_layout + +\begin_layout LyX-Code + If you do not have Perl, I suggest you get it and install it (it +\end_layout + +\begin_layout LyX-Code + is a wonderful utility). + Information about Perl and how to get it +\end_layout + +\begin_layout LyX-Code + can be found in the book "Programming Perl" by Larry Wall and +\end_layout + +\begin_layout LyX-Code + Randall L. + Schwartz, published by O'Reilly & Associates, Inc. +\end_layout + +\begin_layout LyX-Code + To get started, you will need to compile the program. + I do this +\end_layout + +\begin_layout LyX-Code + using +\end_layout + +\begin_layout LyX-Code + gcc -O -o scan_for_matches ggpunit.c scan_for_matches.c +\end_layout + +\begin_layout LyX-Code + If you do not use GNU C, use +\end_layout + +\begin_layout LyX-Code + cc -O -DCC -o scan_for_matches ggpunit.c scan_for_matches.c +\end_layout + +\begin_layout LyX-Code + which works on my Sun. + +\end_layout + +\begin_layout LyX-Code + Once you have compiled scan_for_matches, you can verify that it +\end_layout + +\begin_layout LyX-Code + works with +\end_layout + +\begin_layout LyX-Code + clone% run_tests tmp +\end_layout + +\begin_layout LyX-Code + clone% diff tmp test_output +\end_layout + +\begin_layout LyX-Code + You may get a few strange lines of the sort +\end_layout + +\begin_layout LyX-Code + clone% run_tests tmp +\end_layout + +\begin_layout LyX-Code + rm: tmp: No such file or directory +\end_layout + +\begin_layout LyX-Code + clone% diff tmp test_output +\end_layout + +\begin_layout LyX-Code + These should cause no concern. + However, if the "diff" shows that +\end_layout + +\begin_layout LyX-Code + tmp and test_output are different, contact me (you have a +\end_layout + +\begin_layout LyX-Code + problem). + +\end_layout + +\begin_layout LyX-Code + You should now be able to use scan_for_matches by following the +\end_layout + +\begin_layout LyX-Code + instructions given below (which is all the normal user should have +\end_layout + +\begin_layout LyX-Code + to understand, once things are installed properly). +\end_layout + +\begin_layout LyX-Code + ============================================================== +\end_layout + +\begin_layout LyX-Code +How to run scan_for_matches: +\end_layout + +\begin_layout LyX-Code + To run the program, you type need to create two files +\end_layout + +\begin_layout LyX-Code + 1. + the first file contains the pattern you wish to scan for; I'll +\end_layout + +\begin_layout LyX-Code + call this file pat_file in what follows (but any name is ok) +\end_layout + +\begin_layout LyX-Code + 2. + the second file contains a set of sequences to scan. + These +\end_layout + +\begin_layout LyX-Code + should be in "fasta format". + Just look at the contents of +\end_layout + +\begin_layout LyX-Code + test_dna_input to see examples of this format. + Basically, +\end_layout + +\begin_layout LyX-Code + each sequence begins with a line of the form +\end_layout + +\begin_layout LyX-Code + >sequence_id +\end_layout + +\begin_layout LyX-Code + and is followed by one or more lines containing the sequence. +\end_layout + +\begin_layout LyX-Code + Once these files have been created, you just use +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file < input_file +\end_layout + +\begin_layout LyX-Code + to scan all of the input sequences for the given pattern. + As an +\end_layout + +\begin_layout LyX-Code + example, suppose that pat_file contains a single line of the form +\end_layout + +\begin_layout LyX-Code + p1=4...7 3...8 ~p1 +\end_layout + +\begin_layout LyX-Code + Then, +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + should produce two "hits". + When I run this on my machine, I get +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + >tst1:[6,27] +\end_layout + +\begin_layout LyX-Code + cguaacc ggttaacc gguuacg +\end_layout + +\begin_layout LyX-Code + >tst2:[6,27] +\end_layout + +\begin_layout LyX-Code + CGUAACC GGTTAACC GGUUACG +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code +Simple Patterns Built by Matching Ranges and Reverse Complements +\end_layout + +\begin_layout LyX-Code + Let me first explain this simple pattern: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + p1=4...7 3...8 ~p1 +\end_layout + +\begin_layout LyX-Code + The pattern consists of three "pattern units" separated by spaces. +\end_layout + +\begin_layout LyX-Code + The first pattern unit is +\end_layout + +\begin_layout LyX-Code + p1=4...7 +\end_layout + +\begin_layout LyX-Code + which means "match 4 to 7 characters and call them p1". + The +\end_layout + +\begin_layout LyX-Code + second pattern unit is +\end_layout + +\begin_layout LyX-Code + 3...8 +\end_layout + +\begin_layout LyX-Code + which means "then match 3 to 8 characters". + The last pattern unit +\end_layout + +\begin_layout LyX-Code + is +\end_layout + +\begin_layout LyX-Code + ~p1 +\end_layout + +\begin_layout LyX-Code + which means "match the reverse complement of p1". + The first +\end_layout + +\begin_layout LyX-Code + reported hit is shown as +\end_layout + +\begin_layout LyX-Code + >tst1:[6,27] +\end_layout + +\begin_layout LyX-Code + cguaacc ggttaacc gguuacg +\end_layout + +\begin_layout LyX-Code + which states that characters 6 through 27 of sequence tst1 were +\end_layout + +\begin_layout LyX-Code + matched. + "cguaac" matched the first pattern unit, "ggttaacc" the +\end_layout + +\begin_layout LyX-Code + second, and "gguuacg" the third. + This is an example of a common +\end_layout + +\begin_layout LyX-Code + type of pattern used to search for sections of DNA or RNA that +\end_layout + +\begin_layout LyX-Code + would fold into a hairpin loop. +\end_layout + +\begin_layout LyX-Code +Searching Both Strands +\end_layout + +\begin_layout LyX-Code + Now for a short aside: scan_for_matches only searched the +\end_layout + +\begin_layout LyX-Code + sequences in the input file; it did not search the opposite +\end_layout + +\begin_layout LyX-Code + strand. + With a pattern of the sort we just used, there is not +\end_layout + +\begin_layout LyX-Code + need o search the opposite strand. + However, it is normally the +\end_layout + +\begin_layout LyX-Code + case that you will wish to search both the sequence and the +\end_layout + +\begin_layout LyX-Code + opposite strand (i.e., the reverse complement of the sequence). +\end_layout + +\begin_layout LyX-Code + To do that, you would just use the "-c" command line. + For example, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + Hits on the opposite strand will show a beginning location greater +\end_layout + +\begin_layout LyX-Code + than te end location of the match. +\end_layout + +\begin_layout LyX-Code +Defining Pairing Rules and Allowing Mismatches, Insertions, and Deletions +\end_layout + +\begin_layout LyX-Code + Let us stop now and ask "What additional features would one need to +\end_layout + +\begin_layout LyX-Code + really find the kinds of loop structures that characterize tRNAs, +\end_layout + +\begin_layout LyX-Code + rRNAs, and so forth?" I can immediately think of two: +\end_layout + +\begin_layout LyX-Code + a) you will need to be able to allow non-standard pairings +\end_layout + +\begin_layout LyX-Code + (those other than G-C and A-U), and +\end_layout + +\begin_layout LyX-Code + b) you will need to be able to tolerate some number of +\end_layout + +\begin_layout LyX-Code + mismatches and bulges. +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + Let me first show you how to handle non-standard "rules for +\end_layout + +\begin_layout LyX-Code + pairing in reverse complements". + Consider the following pattern, +\end_layout + +\begin_layout LyX-Code + which I show as two line (you may use as many lines as you like in +\end_layout + +\begin_layout LyX-Code + forming a pattern, although you can only break a pattern at points +\end_layout + +\begin_layout LyX-Code + where space would be legal): +\end_layout + +\begin_layout LyX-Code + r1={au,ua,gc,cg,gu,ug,ga,ag} +\end_layout + +\begin_layout LyX-Code + p1=2...3 0...4 p2=2...5 1...5 r1~p2 0...4 ~p1 +\end_layout + +\begin_layout LyX-Code + The first "pattern unit" does not actually match anything; rather, +\end_layout + +\begin_layout LyX-Code + it defines a "pairing rule" in which standard pairings are +\end_layout + +\begin_layout LyX-Code + allowed, as well as G-A and A-G (in case you wondered, Us and Ts +\end_layout + +\begin_layout LyX-Code + and upper and lower case can be used interchangably; for example +\end_layout + +\begin_layout LyX-Code + r1={AT,UA,gc,cg} could be used to define the "standard rule" for +\end_layout + +\begin_layout LyX-Code + pairings). + The second line consists of six pattern units which +\end_layout + +\begin_layout LyX-Code + may be interpreted as follows: +\end_layout + +\begin_layout LyX-Code + p1=2...3 match 2 or 3 characters (call it p1) +\end_layout + +\begin_layout LyX-Code + 0...4 match 0 to 4 characters +\end_layout + +\begin_layout LyX-Code + p2=2...5 match 2 to 5 characters (call it p2) +\end_layout + +\begin_layout LyX-Code + 1...5 match 1 to 5 characters +\end_layout + +\begin_layout LyX-Code + r1~p2 match the reverse complement of p2, +\end_layout + +\begin_layout LyX-Code + allowing G-A and A-G pairs +\end_layout + +\begin_layout LyX-Code + 0...4 match 0 to 4 characters +\end_layout + +\begin_layout LyX-Code + ~p1 match the reverse complement of p1 +\end_layout + +\begin_layout LyX-Code + allowing only G-C, C-G, A-T, and T-A pairs +\end_layout + +\begin_layout LyX-Code + Thus, r1~p2 means "match the reverse complement of p2 using rule r1". +\end_layout + +\begin_layout LyX-Code + Now let us consider the issue of tolerating mismatches and bulges. +\end_layout + +\begin_layout LyX-Code + You may add a "qualifier" to the pattern unit that gives the +\end_layout + +\begin_layout LyX-Code + tolerable number of "mismatches, deletions, and insertions". +\end_layout + +\begin_layout LyX-Code + Thus, +\end_layout + +\begin_layout LyX-Code + p1=10...10 3...8 ~p1[1,2,1] +\end_layout + +\begin_layout LyX-Code + means that the third pattern unit must match 10 characters, +\end_layout + +\begin_layout LyX-Code + allowing one "mismatch" (a pairing other than G-C, C-G, A-T, or +\end_layout + +\begin_layout LyX-Code + T-A), two deletions (a deletion is a character that occurs in p1, +\end_layout + +\begin_layout LyX-Code + but has been "deleted" from the string matched by ~p1), and one +\end_layout + +\begin_layout LyX-Code + insertion (an "insertion" is a character that occurs in the string +\end_layout + +\begin_layout LyX-Code + matched by ~p1, but not for which no corresponding character +\end_layout + +\begin_layout LyX-Code + occurs in p1). + In this case, the pattern would match +\end_layout + +\begin_layout LyX-Code + ACGTACGTAC GGGGGGGG GCGTTACCT +\end_layout + +\begin_layout LyX-Code + which is, you must admit, a fairly weak loop. + It is common to +\end_layout + +\begin_layout LyX-Code + allow mismatches, but you will find yourself using insertions and +\end_layout + +\begin_layout LyX-Code + deletions much more rarely. + In any event, you should note that +\end_layout + +\begin_layout LyX-Code + allowing mismatches, insertions, and deletions does force the +\end_layout + +\begin_layout LyX-Code + program to try many additional possible pairings, so it does slow +\end_layout + +\begin_layout LyX-Code + things down a bit. +\end_layout + +\begin_layout LyX-Code +How Patterns Are Matched +\end_layout + +\begin_layout LyX-Code + Now is as good a time as any to discuss the basic flow of control +\end_layout + +\begin_layout LyX-Code + when matching patterns. + Recall that a "pattern" is a sequence of +\end_layout + +\begin_layout LyX-Code + "pattern units". + Suppose that the pattern units were +\end_layout + +\begin_layout LyX-Code + u1 u2 u3 u4 ... + un +\end_layout + +\begin_layout LyX-Code + The scan of a sequence S begins by setting the current position +\end_layout + +\begin_layout LyX-Code + to 1. + Then, an attempt is made to match u1 starting at the +\end_layout + +\begin_layout LyX-Code + current position. + Each attempt to match a pattern unit can +\end_layout + +\begin_layout LyX-Code + succeed or fail. + If it succeeds, then an attempt is made to match +\end_layout + +\begin_layout LyX-Code + the next unit. + If it fails, then an attempt is made to find an +\end_layout + +\begin_layout LyX-Code + alternative match for the immediately preceding pattern unit. + If +\end_layout + +\begin_layout LyX-Code + this succeeds, then we proceed forward again to the next unit. + If +\end_layout + +\begin_layout LyX-Code + it fails we go back to the preceding unit. + This process is called +\end_layout + +\begin_layout LyX-Code + "backtracking". + If there are no previous units, then the current +\end_layout + +\begin_layout LyX-Code + position is incremented by one, and everything starts again. + This +\end_layout + +\begin_layout LyX-Code + proceeds until either the current position goes past the end of +\end_layout + +\begin_layout LyX-Code + the sequence or all of the pattern units succeed. + On success, +\end_layout + +\begin_layout LyX-Code + scan_for_matches reports the "hit", the current position is set +\end_layout + +\begin_layout LyX-Code + just past the hit, and an attempt is made to find another hit. +\end_layout + +\begin_layout LyX-Code + If you wish to limit the scan to simply finding a maximum of, say, +\end_layout + +\begin_layout LyX-Code + 10 hits, you can use the -n option (-n 10 would set the limit to +\end_layout + +\begin_layout LyX-Code + 10 reported hits). + For example, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c -n 1 pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + would search for just the first hit (and would stop searching the +\end_layout + +\begin_layout LyX-Code + current sequences or any that follow in the input file). +\end_layout + +\begin_layout LyX-Code +Searching for repeats: +\end_layout + +\begin_layout LyX-Code + In the last section, I discussed almost all of the details +\end_layout + +\begin_layout LyX-Code + required to allow you to look for repeats. + Consider the following +\end_layout + +\begin_layout LyX-Code + set of patterns: +\end_layout + +\begin_layout LyX-Code + p1=6...6 3...8 p1 (find exact 6 character repeat separated +\end_layout + +\begin_layout LyX-Code + by to 8 characters) +\end_layout + +\begin_layout LyX-Code + p1=6...6 3..8 p1[1,0,0] (allow one mismatch) +\end_layout + +\begin_layout LyX-Code + p1=3...3 p1[1,0,0] p1[1,0,0] p1[1,0,0] +\end_layout + +\begin_layout LyX-Code + (match 12 characters that are the remains +\end_layout + +\begin_layout LyX-Code + of a 3-character sequence occurring 4 times) +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + p1=4...8 0...3 p2=6...8 p1 0...3 p2 +\end_layout + +\begin_layout LyX-Code + (This would match things like +\end_layout + +\begin_layout LyX-Code + ATCT G TCTTT ATCT TG TCTTT +\end_layout + +\begin_layout LyX-Code + ) +\end_layout + +\begin_layout LyX-Code +Searching for particular sequences: +\end_layout + +\begin_layout LyX-Code + Occasionally, one wishes to match a specific, known sequence. +\end_layout + +\begin_layout LyX-Code + In such a case, you can just give the sequence (along with an +\end_layout + +\begin_layout LyX-Code + optional statement of the allowable mismatches, insertions, and +\end_layout + +\begin_layout LyX-Code + deletions). + Thus, +\end_layout + +\begin_layout LyX-Code + p1=6...8 GAGA ~p1 (match a hairpin with GAGA as the loop) +\end_layout + +\begin_layout LyX-Code + RRRRYYYY (match 4 purines followed by 4 pyrimidines) +\end_layout + +\begin_layout LyX-Code + TATAA[1,0,0] (match TATAA, allowing 1 mismatch) +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +Matches against a "weight matrix": +\end_layout + +\begin_layout LyX-Code + I will conclude my examples of the types of pattern units +\end_layout + +\begin_layout LyX-Code + available for matching against nucleotide sequences by discussing a +\end_layout + +\begin_layout LyX-Code + crude implemetation of matching using a "weight matrix". + While I +\end_layout + +\begin_layout LyX-Code + am less than overwhelmed with the syntax that I chose, I think that +\end_layout + +\begin_layout LyX-Code + the reader should be aware that I was thinking of generating +\end_layout + +\begin_layout LyX-Code + patterns containing such pattern units automatically from +\end_layout + +\begin_layout LyX-Code + alignments (and did not really plan on typing such things in by +\end_layout + +\begin_layout LyX-Code + hand very often). + Anyway, suppose that you wanted to match a +\end_layout + +\begin_layout LyX-Code + sequence of eight characters. + The "consensus" of these eight +\end_layout + +\begin_layout LyX-Code + characters is GRCACCGS, but the actual "frequencies of occurrence" +\end_layout + +\begin_layout LyX-Code + are given in the matrix below. + Thus, the first character is an A +\end_layout + +\begin_layout LyX-Code + 16% the time and a G 84% of the time. + The second is an A 57% of +\end_layout + +\begin_layout LyX-Code + the time, a C 10% of the time, a G 29% of the time, and a T 4% of +\end_layout + +\begin_layout LyX-Code + the time. + +\end_layout + +\begin_layout LyX-Code + C1 C2 C3 C4 C5 C6 C7 C8 +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + A 16 57 0 95 0 18 0 0 +\end_layout + +\begin_layout LyX-Code + C 0 10 80 0 100 60 0 50 +\end_layout + +\begin_layout LyX-Code + G 84 29 0 0 0 20 100 50 +\end_layout + +\begin_layout LyX-Code + T 0 4 20 5 0 2 0 0 +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + One could use the following pattern unit to search for inexact +\end_layout + +\begin_layout LyX-Code + matches related to such a "weight matrix": +\end_layout + +\begin_layout LyX-Code + {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), +\end_layout + +\begin_layout LyX-Code + (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450 +\end_layout + +\begin_layout LyX-Code + This pattern unit will attempt to match exactly eight characters. +\end_layout + +\begin_layout LyX-Code + For each character in the sequence, the entry in the corresponding +\end_layout + +\begin_layout LyX-Code + tuple is added to an accumulated sum. + If the sum is greater than +\end_layout + +\begin_layout LyX-Code + 450, the match succeeds; else it fails. +\end_layout + +\begin_layout LyX-Code + Recently, this feature was upgraded to allow ranges. + Thus, +\end_layout + +\begin_layout LyX-Code + 600 > {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), +\end_layout + +\begin_layout LyX-Code + (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450 +\end_layout + +\begin_layout LyX-Code + will work, as well. +\end_layout + +\begin_layout LyX-Code +Allowing Alternatives: +\end_layout + +\begin_layout LyX-Code + Very occasionally, you may wish to allow alternative pattern units +\end_layout + +\begin_layout LyX-Code + (i.e., "match either A or B"). + You can do this using something +\end_layout + +\begin_layout LyX-Code + like +\end_layout + +\begin_layout LyX-Code + ( GAGA | GCGCA) +\end_layout + +\begin_layout LyX-Code + which says "match either GAGA or GCGCA". + You may take +\end_layout + +\begin_layout LyX-Code + alternatives of a list of pattern units, for example +\end_layout + +\begin_layout LyX-Code + (p1=3...3 3...8 ~p1 | p1=5...5 4...4 ~p1 GGG) +\end_layout + +\begin_layout LyX-Code + would match one of two sequences of pattern units. + There is one +\end_layout + +\begin_layout LyX-Code + clumsy aspect of the syntax: to match a list of alternatives, you +\end_layout + +\begin_layout LyX-Code + need to fully the request. + Thus, +\end_layout + +\begin_layout LyX-Code + (GAGA | (GCGCA | TTCGA)) +\end_layout + +\begin_layout LyX-Code + would be needed to try the three alternatives. +\end_layout + +\begin_layout LyX-Code +One Minor Extension +\end_layout + +\begin_layout LyX-Code + Sometimes a pattern will contain a sequence of distinct ranges, +\end_layout + +\begin_layout LyX-Code + and you might wish to limit the sum of the lengths of the matched +\end_layout + +\begin_layout LyX-Code + subsequences. + For example, suppose that you basically wanted to +\end_layout + +\begin_layout LyX-Code + match something like +\end_layout + +\begin_layout LyX-Code + ARRYYTT p1=0...5 GCA[1,0,0] p2=1...6 ~p1 4...8 ~p2 p3=4...10 CCT +\end_layout + +\begin_layout LyX-Code + but that the sum of the lengths of p1, p2, and p3 must not exceed +\end_layout + +\begin_layout LyX-Code + eight characters. + To do this, you could add +\end_layout + +\begin_layout LyX-Code + length(p1+p2+p3) < 9 +\end_layout + +\begin_layout LyX-Code + as the last pattern unit. + It will just succeed or fail (but does +\end_layout + +\begin_layout LyX-Code + not actually match any characters in the sequence). +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +Matching Protein Sequences +\end_layout + +\begin_layout LyX-Code + Suppose that the input file contains protein sequences. + In this +\end_layout + +\begin_layout LyX-Code + case, you must invoke scan_for_matches with the "-p" option. + You +\end_layout + +\begin_layout LyX-Code + cannot use aspects of the language that relate directly to +\end_layout + +\begin_layout LyX-Code + nucleotide sequences (e.g., the -c command line option or pattern +\end_layout + +\begin_layout LyX-Code + constructs referring to the reverse complement of a previously +\end_layout + +\begin_layout LyX-Code + matched unit). + +\end_layout + +\begin_layout LyX-Code + You also have two additional constructs that allow you to match +\end_layout + +\begin_layout LyX-Code + either "one of a set of amino acids" or "any amino acid other than +\end_layout + +\begin_layout LyX-Code + those a given set". + For example, +\end_layout + +\begin_layout LyX-Code + p1=0...4 any(HQD) 1...3 notany(HK) p1 +\end_layout + +\begin_layout LyX-Code + would successfully match a string like +\end_layout + +\begin_layout LyX-Code + YWV D AA C YWV +\end_layout + +\begin_layout LyX-Code +Using the show_hits Utility +\end_layout + +\begin_layout LyX-Code + When viewing a large set of complex matches, you might find it +\end_layout + +\begin_layout LyX-Code + convenient to post-process the scan_for_matches output to get a +\end_layout + +\begin_layout LyX-Code + more readable version. + We provide a simple post-processor called +\end_layout + +\begin_layout LyX-Code + "show_hits". + To see its effect, just pipe the output of a +\end_layout + +\begin_layout LyX-Code + scan_for_matches into show_hits: +\end_layout + +\begin_layout LyX-Code + Normal Output: +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp +\end_layout + +\begin_layout LyX-Code + >tst1:[1,28] +\end_layout + +\begin_layout LyX-Code + gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + >tst1:[28,1] +\end_layout + +\begin_layout LyX-Code + gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + >tst2:[2,31] +\end_layout + +\begin_layout LyX-Code + CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + >tst2:[31,2] +\end_layout + +\begin_layout LyX-Code + CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + >tst3:[3,32] +\end_layout + +\begin_layout LyX-Code + gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + >tst3:[32,3] +\end_layout + +\begin_layout LyX-Code + gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + Piped Through show_hits: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp | show_hits +\end_layout + +\begin_layout LyX-Code + tst1:[1,28]: gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[28,1]: gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst2:[2,31]: CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + tst2:[31,2]: CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + tst3:[3,32]: gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst3:[32,3]: gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code + Optionally, you can specify which of the "fields" in the matches +\end_layout + +\begin_layout LyX-Code + you wish to sort on, and show_hits will sort them. + The field +\end_layout + +\begin_layout LyX-Code + numbers start with 0. + So, you might get something like +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp | show_hits 2 1 +\end_layout + +\begin_layout LyX-Code + tst2:[2,31]: CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + tst2:[31,2]: CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + tst3:[32,3]: gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[1,28]: gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[28,1]: gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst3:[3,32]: gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code + In this case, the hits have been sorted on fields 2 and 1 (that is, +\end_layout + +\begin_layout LyX-Code + the third and second matched subfields). +\end_layout + +\begin_layout LyX-Code + show_hits is just one possible little post-processor, and you +\end_layout + +\begin_layout LyX-Code + might well wish to write a customized one for yourself. +\end_layout + +\begin_layout LyX-Code +Reducing the Cost of a Search +\end_layout + +\begin_layout LyX-Code + The scan_for_matches utility uses a fairly simple search, and may +\end_layout + +\begin_layout LyX-Code + consume large amounts of CPU time for complex patterns. + Someday, +\end_layout + +\begin_layout LyX-Code + I may decide to optimize the code. + However, until then, let me +\end_layout + +\begin_layout LyX-Code + mention one useful technique. + +\end_layout + +\begin_layout LyX-Code + When you have a complex pattern that includes a number of varying +\end_layout + +\begin_layout LyX-Code + ranges, imprecise matches, and so forth, it is useful to +\end_layout + +\begin_layout LyX-Code + "pipeline" matches. + That is, form a simpler pattern that can be +\end_layout + +\begin_layout LyX-Code + used to scan through a large database extracting sections that +\end_layout + +\begin_layout LyX-Code + might be matched by the more complex pattern. + Let me illustrate +\end_layout + +\begin_layout LyX-Code + with a short example. + Suppose that you really wished to match the +\end_layout + +\begin_layout LyX-Code + pattern +\end_layout + +\begin_layout LyX-Code + p1=3...5 0...8 ~p1[1,1,0] p2=6...7 3...6 AGC 3...5 RYGC ~p2[1,0,0] +\end_layout + +\begin_layout LyX-Code + In this case, the pattern units AGC 3...5 RYGC can be used to rapidly +\end_layout + +\begin_layout LyX-Code + constrain the overall search. + You can preprocess the overall +\end_layout + +\begin_layout LyX-Code + database using the pattern: +\end_layout + +\begin_layout LyX-Code + 31...31 AGC 3...5 RYGC 7...7 +\end_layout + +\begin_layout LyX-Code + Put the complex pattern in pat_file1 and the simpler pattern in +\end_layout + +\begin_layout LyX-Code + pat_file2. + Then use, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c pat_file2 < nucleotide_database | +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file1 +\end_layout + +\begin_layout LyX-Code + The output will show things like +\end_layout + +\begin_layout LyX-Code + >seqid:[232,280][2,47] +\end_layout + +\begin_layout LyX-Code + matches pieces +\end_layout + +\begin_layout LyX-Code + Then, the actual section of the sequence that was matched can be +\end_layout + +\begin_layout LyX-Code + easily computed as [233,278] (remember, the positions start from +\end_layout + +\begin_layout LyX-Code + 1, not 0). +\end_layout + +\begin_layout LyX-Code + Let me finally add, you should do a few short experiments to see +\end_layout + +\begin_layout LyX-Code + whether or not such pipelining actually improves performance -- it +\end_layout + +\begin_layout LyX-Code + is not always obvious where the time is going, and I have +\end_layout + +\begin_layout LyX-Code + sometimes found that the added complexity of pipelining actually +\end_layout + +\begin_layout LyX-Code + slowed things up. + It gets its best improvements when there are +\end_layout + +\begin_layout LyX-Code + exact matches of more than just a few characters that can be +\end_layout + +\begin_layout LyX-Code + rapidly used to eliminate large sections of the database. +\end_layout + +\begin_layout LyX-Code +============= +\end_layout + +\begin_layout LyX-Code +Additions: +\end_layout + +\begin_layout LyX-Code +Feb 9, 1995: the pattern units ^ and $ now work as in normal regular +\end_layout + +\begin_layout LyX-Code + expressions. + That is +\end_layout + +\begin_layout LyX-Code + TTF $ +\end_layout + +\begin_layout LyX-Code + matches only TTF at the end of the string and +\end_layout + +\begin_layout LyX-Code + ^ TTF +\end_layout + +\begin_layout LyX-Code + matches only an initial TTF +\end_layout + +\begin_layout LyX-Code + The pattern unit +\end_layout + +\begin_layout LyX-Code + >>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<< +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +# Stuff that enables biotools. +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +export TOOLS_DIR="/home/m.hansen/tools" # Contains binaries for BLAST + and Vmatch. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +export INST_DIR="/home/m.hansen/maasha" # Contains scripts and modules. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +export DATA_DIR="/home/m.hansen/DATA" # Contains genomic data etc. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +export TMP_DIR="/home/m.hansen/maasha/tmp" # Required temporary directory. +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +export PATH="$PATH:$TOOLS_DIR/blast-2.2.17/bin:$TOOLS_DIR/vmatch.distribution" +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +export PATH="$INST_DIR/bin/:$INST_DIR/perl_scripts/:$INST_DIR/perl_scripts/b +iotools:$PATH" +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +export PERL5LIB="$PERL5LIB:$INST_DIR" +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +# Alias allowing power scripting with biotools +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +alias bioscript="perl -I $INST_DIR/Maasha -MBiotools=read_stream,get_recor +d,put_record -e" +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +# >>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<< +\end_layout + +\begin_layout Section +Getting Started +\end_layout + +\begin_layout Standard +The biotool +\series bold +list_biotools +\series default + lists all the biotools along with a description: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +list_biotools +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +align_seq Align sequences in stream using Muscle. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +analyze_seq Analysis the residue composition of each sequence + in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +analyze_vals Determine type, count, min, max, sum and mean for + values in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +blast_seq BLAST sequences in stream against a specified database. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +blat_seq BLAT sequences in stream against a specified genome. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +complement_seq Complement sequences in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_records Count the number of records in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_seq Count sequences in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_vals Count the number of times values of given keys exists + in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +create_blast_db Create a BLAST database from sequences in stream for + use with BLAST. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +... +\end_layout + +\begin_layout Standard +To list the biotools for writing different formats, you can use unix's grep + like this: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +list_biotools | grep write +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_align Write aligned sequences in pretty alignment format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_bed Write records from stream as BED lines. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_blast Write BLAST records from stream in BLAST tabular format + (-m8 and 9). +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_fasta Write sequences in FASTA format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_psl Write records from stream in PSL format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_tab Write records from stream as tab separated table. +\end_layout + +\begin_layout Standard +In order to find out how a specific biotool works, you just type the program + name without any arguments and press return and the usage of the biotool + will be displayed. + E.g. + +\series bold +read_fasta +\series default + : +\end_layout + +\begin_layout Standard +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +width "100col%" +special "none" +height "1in" +height_special "totalheight" +status open + +\begin_layout LyX-Code + +\size scriptsize +Program name: read_fasta +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Author: Martin Asser Hansen - Copyright (C) - All rights reserved +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Contact: mail@maasha.dk +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Date: August 2007 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/ +gpl.html) +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Description: Read FASTA entries. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Usage: read_fasta [options] -i +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Options: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-i | --data_in=] - Comma separated list of files + to read. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-n | --num=] - Limit number of records to read. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-I | --stream_in=] - Read input stream from file + - Default=STDIN +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-O | --stream_out=] - Write output stream to file + - Default=STDOUT +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Examples: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test.fna - Read FASTA entries from file. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test1.fna,test2,fna - Read FASTA entries from files. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i '*.fna' - Read FASTA entries from files. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test.fna -n 10 - Read first 10 FASTA entries from + file. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +The Data Stream +\end_layout + +\begin_layout Subsection +How to read the data stream from file? +\begin_inset LatexCommand label +name "sub:How-to-read-stream" + +\end_inset + + +\end_layout + +\begin_layout Standard +You want to read a data stream that you previously have saved to file in + biotools format. + This can be done implicetly or explicitly. + The implicit way uses the 'stdout' stream of the Unix terminal: +\end_layout + +\begin_layout LyX-Code +cat | +\end_layout + +\begin_layout Standard +cat is the Unix command that reads a file and output the result to 'stdout' + --- which in this case is piped to any biotool represented by the . + It is also possible to read the data stream using '<' to direct the 'stdout' + stream into the biotool like this: +\end_layout + +\begin_layout LyX-Code + < +\end_layout + +\begin_layout Standard +However, that will not work if you pipe more biotools together. + Then it is much safer to read the stream from a file explicitly like this: +\end_layout + +\begin_layout LyX-Code + --stream_in= +\end_layout + +\begin_layout Standard +Here the filename is explicetly given to the biotool with + the switch +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_in. + This switch works with all biotools. + It is also possible to read in data from multiple sources by repeating + the explicit read step: +\end_layout + +\begin_layout LyX-Code + --stream_in= | --stream_in= +\end_layout + +\begin_layout Subsection +How to write the data stream to file? +\begin_inset LatexCommand label +name "sub:How-to-write-stream" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to save the output stream from a biotool to file, so you can read + in the stream again at a later time, you can do one of two things: +\end_layout + +\begin_layout LyX-Code + > +\end_layout + +\begin_layout Standard +All, the biotools write the data stream to 'stdout' by default which can + be written to a file by redirecting 'stdout' to file using '>' , however, + if one of the biotools for writing other formats is used then the both + the biotools records as well as the result output will go to 'stdout' in + a mixture causing havock! To avoid this you must use the switch +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out that explictly tells the biotool to write the output stream to + file: +\end_layout + +\begin_layout LyX-Code + --stream_out= +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out switch works with all biotools. +\end_layout + +\begin_layout Subsection +How to terminate the data stream? +\end_layout + +\begin_layout Standard +The data stream is never stops unless the user want to save the stream or + by supplying the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch that will terminate the stream: +\end_layout + +\begin_layout LyX-Code + --no_stream +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch only works with those biotools where it makes sense that + the user might want to terminale the data stream, +\emph on +i.e +\emph default +. + after an analysis step where the user wants to output the result, but not + the data stream. +\end_layout + +\begin_layout Subsection +How to write my results to file? +\begin_inset LatexCommand label +name "sub:How-to-write-result" + +\end_inset + + +\end_layout + +\begin_layout Standard +Saving the result of an analysis to file can be done implicitly or explicitly. + The implicit way: +\end_layout + +\begin_layout LyX-Code + --no_stream > +\end_layout + +\begin_layout Standard +If you use '>' to redirect 'stdout' to file then it is important to use + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch to avoid writing a mix of biotools records and result to + the same file causing havock. + The safe way is to use the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch which explicetly tells the biotool to write the result + to a given file: +\end_layout + +\begin_layout LyX-Code + --result_out= +\end_layout + +\begin_layout Standard +Using the above method will not terminate the stream, so it is possible + to pipe that into another biotool generating different results: +\end_layout + +\begin_layout LyX-Code + --result_out= | --result_out= +\end_layout + +\begin_layout Standard +And still the data stream will continue unless terminated with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream: +\end_layout + +\begin_layout LyX-Code + --result_out= --no_stream +\end_layout + +\begin_layout Standard +Or written to file using implicitly or explicity +\begin_inset LatexCommand eqref +reference "sub:How-to-write-result" + +\end_inset + +. + The explicit way: +\end_layout + +\begin_layout LyX-Code + --result_out= --stream_out= +\end_layout + +\begin_layout Subsection +How to read data from multiple sources? +\end_layout + +\begin_layout Standard +To read multiple data sources, with the same type or different type of data + do: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= +\end_layout + +\begin_layout Standard +where type is the data type a specific biotool reads. +\end_layout + +\begin_layout Section +Reading input +\end_layout + +\begin_layout Subsection +How to read biotools input? +\end_layout + +\begin_layout Standard +See +\begin_inset LatexCommand eqref +reference "sub:How-to-read-stream" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +How to read in data? +\end_layout + +\begin_layout Standard +Data in different formats can be read with the appropriate biotool for that + format. + The biotools are typicalled named 'read_' such as +\series bold +read_fasta +\series default +, +\series bold +read_bed +\series default +, +\series bold +read_tab +\series default +, etc., and all behave in a similar manner. + Data can be read by supplying the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +data_in switch and a file name to the file containing the data: +\end_layout + +\begin_layout LyX-Code + --data_in= +\end_layout + +\begin_layout Standard +It is also possible to read in a saved biotools stream (see +\begin_inset LatexCommand ref +reference "sub:How-to-read-stream" + +\end_inset + +) as well as reading data in one go: +\end_layout + +\begin_layout LyX-Code + --stream_in= --data_in= +\end_layout + +\begin_layout Standard +If you want to read data from several files you can do this: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= +\end_layout + +\begin_layout Standard +If you have several data files you can read in all explicitly with a comma + separated list: +\end_layout + +\begin_layout LyX-Code + --data_in=file1,file2,file3 +\end_layout + +\begin_layout Standard +And it is also possible to use file globbing +\begin_inset Foot +status open + +\begin_layout Standard +using the short option will only work if you quote the argument -i '*.fna' +\end_layout + +\end_inset + +: +\end_layout + +\begin_layout LyX-Code + --data_in=*.fna +\end_layout + +\begin_layout Standard +Or in a combination: +\end_layout + +\begin_layout LyX-Code + --data_in=file1,/dir/*.fna +\end_layout + +\begin_layout Standard +Finally, it is possible to read in data in different formats using the appropria +te biotool for each format: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= ... +\end_layout + +\begin_layout Subsection +How to read FASTA input? +\end_layout + +\begin_layout Standard +Sequences in FASTA format can be read explicitly using +\series bold +read_fasta +\series default +: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= +\end_layout + +\begin_layout Subsection +How to read alignment input? +\end_layout + +\begin_layout Standard +If your alignment if FASTA formatted then you can +\series bold +read_align +\series default +. + It is also possible to use +\series bold +read_fasta +\series default + since the data is FASTA formatted, however, with +\series bold +read_fasta +\series default + the key ALIGN will be omitted. + The ALIGN key is used to determine which sequences belong to what alignment + which is required for +\series bold +write_align +\series default +. +\end_layout + +\begin_layout LyX-Code +read_align --data_in= +\end_layout + +\begin_layout Subsection +How to read tabular input? +\begin_inset LatexCommand label +name "sub:How-to-read-table" + +\end_inset + + +\end_layout + +\begin_layout Standard +Tabular input can be read with +\series bold +read_tab +\series default + which will read in all rows and chosen columns (separated by a given delimter) + from a table in text format. +\end_layout + +\begin_layout Standard +The table below: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Standard +Human +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +ATACGTCAG +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +23524 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Dog +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +AGCATGAC +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +2442 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Mouse +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +GACTG +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +234 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Cat +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +AAATGCA +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +2342 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +Can be read using the command: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= +\end_layout + +\begin_layout Standard +Which will result in four records, one for each row, where the keys V0, + V1, V2 are the default keys for the organism, sequence, and count, respectively. + It is possible to select a subset of colums to read by using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +cols switch which takes a comma separated list of columns numbers (first + column is designated 0) as argument. + So to read in only the sequence and the count so that the count comes before + the sequence do: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --cols=2,1 +\end_layout + +\begin_layout Standard +It is also possible to name the columns with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --cols=2,1 --keys=COUNT,SEQ +\end_layout + +\begin_layout Subsection +How to read BED input? +\end_layout + +\begin_layout Standard +The BED (Browser Extensible Data +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://genome.ucsc.edu/FAQ/FAQformat" + +\end_inset + + +\end_layout + +\end_inset + +) format is a tabular format for data pertaining to one of the Eukaryotic + genomes in the UCSC genome brower +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://genome.ucsc.edu/" + +\end_inset + + +\end_layout + +\end_inset + +. + The BED format consists of up to 12 columns, where the first three are + mandatory CHR, CHR_BEG, and CHR_END. + The mandatory columns and any of the optional columns can all be read in + easily with the +\series bold +read_bed +\series default + biotool. +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= +\end_layout + +\begin_layout Standard +It is also possible to read the BED file with +\series bold +read_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-read-table" + +\end_inset + +), however, that will be more cumbersome because you need to specify the + keys: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --keys=CHR,CHR_BEG,CHR_END ... +\end_layout + +\begin_layout Subsection +How to read PSL input? +\end_layout + +\begin_layout Standard +The PSL format is the output from BLAT and contains 21 mandatory fields + that can be read with +\series bold +read_psl +\series default +: +\end_layout + +\begin_layout LyX-Code +read_psl --data_in= +\end_layout + +\begin_layout Section +Writing output +\end_layout + +\begin_layout Standard +All result output can be written explicitly to file using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch which all result generating biotools have. + It is also possible to write the result to file implicetly by directing + 'stdout' to file using '>', however, that requires the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream swich to prevent a mixture of data stream and results in the file. + The explicit (and safe) way: +\end_layout + +\begin_layout LyX-Code +... + | --result_out= +\end_layout + +\begin_layout Standard +The implicit way: +\end_layout + +\begin_layout LyX-Code +... + | --no_stream > +\end_layout + +\begin_layout Subsection +How to write biotools output? +\end_layout + +\begin_layout Standard +See +\begin_inset LatexCommand eqref +reference "sub:How-to-write-stream" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +How to write FASTA output? +\begin_inset LatexCommand label +name "sub:How-to-write-fasta" + +\end_inset + + +\end_layout + +\begin_layout Standard +FASTA output can be written with +\series bold +write_fasta +\series default +. +\end_layout + +\begin_layout LyX-Code +... + | write_fasta --result_out= +\end_layout + +\begin_layout Standard +It is also possible to wrap the sequences to a given width using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +wrap switch allthough wrapping of sequence is generally an evil thing: +\end_layout + +\begin_layout LyX-Code +... + | write_fasta --no_stream --wrap=80 +\end_layout + +\begin_layout Subsection +How to write alignment output? +\begin_inset LatexCommand label +name "sub:How-to-write-alignment" + +\end_inset + + +\end_layout + +\begin_layout Standard +Pretty alignments with ruler +\begin_inset Foot +status collapsed + +\begin_layout Standard +'.' for every 10 residues, ':' for every 50, and '|' for every 100 +\end_layout + +\end_inset + + and consensus sequence +\begin_inset Note Note +status collapsed + +\begin_layout Standard +which reminds me to make that an option. +\end_layout + +\end_inset + + can be created with +\series bold +write_align +\series default +, what also have the optional +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +wrap switch to break the alignment into blocks of a given width: +\end_layout + +\begin_layout LyX-Code +... + | write_align --result_out= --wrap=80 +\end_layout + +\begin_layout Standard +If the number of sequnces in the alignment is 2 then a pairwise alignment + will be output otherwise a multiple alignment. + And if the sequence type, determined automagically, is protein, then residues + and symbols (+,\InsetSpace ~ +:,\InsetSpace ~ +.) will be used to show consensus according to the Blosum62 + matrix. +\end_layout + +\begin_layout Subsection +How to write tabular output? +\begin_inset LatexCommand label +name "sub:How-to-write-tab" + +\end_inset + + +\end_layout + +\begin_layout Standard +Outputting the data stream as a table can be done with +\series bold +write_tab +\series default +, which will write generate one row per record with the values as columns. + If you supply the optional +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comment switch, when the first row in the table will be a 'comment' line + prefixed with a '#': +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --comment +\end_layout + +\begin_layout Standard +You can also change the delimiter from the default (tab) to +\emph on +e.g. + +\emph default + ',': +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --delimit=',' +\end_layout + +\begin_layout Standard +If you want the values output in a specific order you have to supply a comma + separated list using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch that will print only those keys in that order: +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --keys=SEQ_NAME,COUNT +\end_layout + +\begin_layout Standard +Alternatively, if you have some keys that you don't want in the tabular + output, use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_keys switch. + So to print all keys except SEQ and SEQ_TYPE do: +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --no_keys=SEQ,SEQ_TYPE +\end_layout + +\begin_layout Standard +Finally, if you have a stream containing a mix of different records types, + +\emph on +e.g. + +\emph default + records with sequences and records with matches, then you can use +\series bold +write_tab +\series default + to output all the records in tabluar format, however, the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comment, +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys, and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_keys switches will only respond to records of the first type encountered. + The reason is that outputting mixed records is probably not what you want + anyway, and you should remove all the unwanted records from the stream + before outputting the table: +\series bold +grab +\series default + is your friend (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to write a BED output? +\begin_inset LatexCommand label +name "sub:How-to-write-BED" + +\end_inset + + +\end_layout + +\begin_layout Standard +Data in BED format can be output if the records contain the mandatory keys + CHR, CHR_BEG, and CHR_END using +\series bold +write_bed +\series default +. + If the optional keys are also present, they will be output as well: +\end_layout + +\begin_layout LyX-Code +write_bed --result_out= +\end_layout + +\begin_layout Subsection +How to write PSL output? +\begin_inset LatexCommand label +name "sub:How-to-write-PSL" + +\end_inset + + +\end_layout + +\begin_layout Standard +Data in PSL format can be output using +\series bold +write_psl: +\end_layout + +\begin_layout LyX-Code +write_psl --result_out= +\end_layout + +\begin_layout Section +Manipulating Records +\end_layout + +\begin_layout Subsection +How to select a few records? +\begin_inset LatexCommand label +name "sub:How-to-select-a-few-records" + +\end_inset + + +\end_layout + +\begin_layout Standard +To quickly get an overview of your data you can limit the data stream to + show a few records. + This also very useful to test the pipeline with a few records if you are + setting up a complex analysis using several biotools. + That way you can inspect that all goes well before analyzing and waiting + for the full data set. + All of the read_ biotools have the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch which will take a number as argument and only that number of + records will be read. + So to read in the first 10 FASTA entries from a file: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna --num=10 +\end_layout + +\begin_layout Standard +Another way of doing this is to use +\series bold +head_records +\series default + will limit the stream to show the first 10 records (default): +\end_layout + +\begin_layout LyX-Code +... + | head_records +\end_layout + +\begin_layout Standard +Using +\series bold +head_records +\series default + directly after one of the read_ biotools will be a lot slower than + using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch with the read_ biotools, however, +\series bold +head_records +\series default + can also be used to limit the output from all the other biotools. + It is also possible to give +\series bold +head_records +\series default + a number of records to show using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch. + So to display the first 100 records do: +\end_layout + +\begin_layout LyX-Code +... + | head_records --num=100 +\end_layout + +\begin_layout Subsection +How to select random records? +\begin_inset LatexCommand label +name "sub:How-to-select-random-records" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you want to inspect a number of random records from the stream this can + be done with the +\series bold +random_records +\series default + biotool. + So if you have 1 mio records in the stream and you want to select 1000 + random records do: +\end_layout + +\begin_layout LyX-Code +... + | random_records --num=1000 +\end_layout + +\begin_layout Subsection +How to count all records in the data stream? +\end_layout + +\begin_layout Standard +To count all the records in the data stream use +\series bold +count_records +\series default +, which adds one record (which is not included in the count) to the data + stream. + So to count the number of sequences in a FASTA file you can do this: +\end_layout + +\begin_layout LyX-Code +cat test.fna | read_fasta | count_records --no_stream +\end_layout + +\begin_layout Standard +Which will write the last record containing the count to 'stdout': +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_records: 630 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout Standard +It is also possible to write the count to file using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch. +\end_layout + +\begin_layout Subsection +How to get the length of record values? +\begin_inset LatexCommand label +name "sub:How-to-get-value_length" + +\end_inset + + +\end_layout + +\begin_layout Standard +Use the +\series bold +length_vals +\series default + biotool to get the length of each value for a comma separated list of keys: +\end_layout + +\begin_layout LyX-Code +... + | length_vals --keys=HIT,PATTERN +\end_layout + +\begin_layout Subsection +How to grab specific records? +\begin_inset LatexCommand label +name "sub:How-to-grab" + +\end_inset + + +\end_layout + +\begin_layout Standard +The biotool +\series bold +grab +\series default + is related to the Unix grep and locates records based on matching keys + and/or values using either a pattern, a Perl regex, or a numerical evaluation. + To easily +\series bold +grab +\series default + all records in the stream that has any mentioning of the pattern 'human' + just pipe the data stream through +\series bold +grab +\series default + like this: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human +\end_layout + +\begin_layout Standard +This will search for the pattern 'human' in all keys and all values. + The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch takes a comma separated list of patterns, so in order to + match multiple patterns do: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human,mouse +\end_layout + +\begin_layout Standard +It is also possible to use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in switch instead of +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern. + +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in is used to read a file with one pattern per line: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern_in=patterns.txt +\end_layout + +\begin_layout Standard +If you want the opposite result --- to find all records that does not match + the patterns, add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +invert switch, which not only works with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch, but also with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +regex and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +eval: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human --invert +\end_layout + +\begin_layout Standard +If you want to search the record keys only, +\emph on +e.g. + +\emph default + to find all records containing the key SEQ you can add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys_only switch. + This will prevent matching of SEQ in any record value, and in fact SEQ + is a not uncommon peptide sequence you could get an unwanted record. + Also, this will give an increase in speed since only the keys are searched: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=SEQ --keys_only +\end_layout + +\begin_layout Standard +However, if you are interested in finding the peptide sequence SEQ and not + the SEQ key, just add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +vals_only switch instead: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=SEQ --vals_only +\end_layout + +\begin_layout Standard +Also, if you want to grab for certain key/value pairs you can supply a comma + separated list of keys whos values will then be searched using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch. + This is handy if your records contain large genomic sequences and you dont + want to search the entire sequence for +\emph on +e.g. + +\emph default + the organism name --- it is much faster to tell +\series bold +grab +\series default + which keys to search the value for: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human --keys=SEQ_NAME +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Standard +It is also possible to invoke flexible matching using regex (regular expressions +) instead of simple pattern matching. + In +\series bold +grab +\series default + the regex engine is Perl based and allows use of different type of wild + cards, alternatives, +\emph on +etc +\emph default + +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://perldoc.perl.org/perlreref.html" + +\end_inset + + +\end_layout + +\end_inset + +. + If you want to +\series bold +grab +\series default + records withs the sequence ATCG or GCTA you can do this: +\end_layout + +\begin_layout LyX-Code +... + | grab --regex='ATCG|GCTA' +\end_layout + +\begin_layout Standard +Or if you want to find sequences beginning with ATCG: +\end_layout + +\begin_layout LyX-Code +... + | grab --regex='^ATCG' +\end_layout + +\begin_layout Standard +You can also use +\series bold +grab +\series default + to locate records that fulfill a numerical property using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +eval switch witch takes an expression in three parts. + The first part is the key that holds the value we want to evaluate, the + second part holds one the six operators: +\end_layout + +\begin_layout Enumerate +Greater than: > +\end_layout + +\begin_layout Enumerate +Greater than or equal to: >= +\end_layout + +\begin_layout Enumerate +Less than: < +\end_layout + +\begin_layout Enumerate +Less than or equal to: <= +\end_layout + +\begin_layout Enumerate +Equal to: = +\end_layout + +\begin_layout Enumerate +Not equal to: != +\end_layout + +\begin_layout Enumerate +String wise equal to: eq +\end_layout + +\begin_layout Enumerate +String wise not equal to: ne +\end_layout + +\begin_layout Standard +And finally comes the number used in the evaluation. + So to +\series bold +grab +\series default + all records with a sequence length greater than 30: +\end_layout + +\begin_layout LyX-Code +... + length_seq | grab --eval='SEQ_LEN > 30' +\end_layout + +\begin_layout Standard +If you want to locate all records containing the pattern 'human' and where + the sequence length is greater that 30, you do this by running the stream + through +\series bold +grab +\series default + twice: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern='human' | length_seq | grab --eval='SEQ_LEN > 30' +\end_layout + +\begin_layout Standard +Finally, it is possible to do fast matching of expressions from a file using + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact switch. + Each of these expressions has to be matched exactly over the entrie length, + which if useful if you have a file with accession numbers, that you want + to locate in the stream: +\end_layout + +\begin_layout LyX-Code +... + | grab --exact acc_no.txt | ... +\end_layout + +\begin_layout Standard +Using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact is much faster than using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in, because with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact the expression has to be complete matches, where +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in looks for subpatterns. +\end_layout + +\begin_layout Standard +NB! To get the best speed performance, use the most restrictive +\series bold +grab +\series default + first. +\end_layout + +\begin_layout Subsection +How to remove keys from records? +\end_layout + +\begin_layout Standard +To remove one or more specific keys from all records in the data stream + use +\series bold +remove_keys +\series default + like this: +\end_layout + +\begin_layout LyX-Code +... + | remove_keys --keys=SEQ,SEQ_NAME +\end_layout + +\begin_layout Standard +In the above example SEQ and SEQ_NAME will be removed from all records if + they exists in these. + If all keys are removed from a record, then the record will be removed. +\end_layout + +\begin_layout Subsection +How to rename keys in records? +\end_layout + +\begin_layout Standard +Sometimes you want to rename a record key, +\emph on +e.g. + +\emph default + if you have read in a two column table with sequence name and sequence + in each column (see +\begin_inset LatexCommand ref +reference "sub:How-to-read-table" + +\end_inset + +) without specifying the key names, then the sequence name will be called + V0 and the sequence V1 as default in the +\series bold +read_tab +\series default + biotool. + To rename the V0 and V1 keys we need to run the stream through +\series bold +rename_keys +\series default + twice (one for each key to rename): +\end_layout + +\begin_layout LyX-Code +... + | rename_keys --keys=V0,SEQ_NAME | rename_keys --keys=V1,SEQ +\end_layout + +\begin_layout Standard +The first instance of +\series bold +rename_keys +\series default + replaces all the V0 keys with SEQ_NAME, and the second instance of +\series bold +rename_keys +\series default + replaces all the V1 keys with SEQ. + +\emph on +Et viola +\emph default + the data can now be used in the biotools that requires these keys. +\end_layout + +\begin_layout Section +Manipulating Sequences +\end_layout + +\begin_layout Subsection +How to get sequence lengths? +\end_layout + +\begin_layout Standard +The length for sequences in records can be determined with +\series bold +length_seq +\series default +, which adds the key SEQ_LEN to each record with the sequence length as + the value. + It also generates an extra record that is emitted last with the key TOTAL_SEQ_L +EN showing the total length of all the sequences. +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_seq +\end_layout + +\begin_layout Standard +It is also possible to determine the sequence length using the generic tool + +\series bold +length_vals +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-get-value_length" + +\end_inset + +, which determines the length of the values for a given list of keys: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_vals --keys=SEQ +\end_layout + +\begin_layout Standard +To obtain the total length of all sequences use +\series bold +sum_vals +\series default + like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_vals --keys=SEQ +\end_layout + +\begin_layout LyX-Code +| sum_vals --keys=SEQ_LEN +\end_layout + +\begin_layout Standard +The biotool +\series bold +analyze_seq +\series default + will also determine the length of each sequence (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-analyze" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to analyze sequence composition? +\begin_inset LatexCommand label +name "sub:How-to-analyze" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you want to find out the sequence type, composition, length, as well + as GC content, indel content and proportions of soft and hard masked sequence, + then use +\series bold +analyze_seq +\series default +. + This handy biotool will determine all these things per sequence from which + it is easy to get an overview using the +\series bold +write_tab +\series default + biotool to output a table (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +). + So in order to determine the sequence composition of a FASTA file with + just one entry containing the sequence 'ATCG' we just read the data with + +\series bold +read_fasta +\series default + and run the output through +\series bold +analyze_seq +\series default + which will add the analysis to the record like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq ... +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:D: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +MIX_INDEX: 0.55 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:W: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:G: 16 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SOFT_MASK%: 63.75 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:B: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:V: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +HARD_MASK%: 0.00 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:H: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:S: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:N: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:.: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +GC%: 35.00 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:A: 8 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:Y: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:M: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:T: 44 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ_TYPE: DNA +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:K: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:~: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ: TTTCAGTTTGGGACGGAGTAAGGCCTTCCtttttttttttttttttttttttttttttgagaccgagtcttgctc +tgtcg +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ_LEN: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +80 RES:R: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:C: 12 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:-: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:U: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Standard +Now to make a table of how may As, Ts, Cs, and Gs you can add the following: +\end_layout + +\begin_layout LyX-Code +... + | analyze_seq | write_tab --keys=RES:A,RES:T,RES:C,RES:G +\end_layout + +\begin_layout Standard +Or if you want to see the proportions of hard and soft masked sequence: +\end_layout + +\begin_layout LyX-Code +... + | analyse_seq | write_tab --keys=HARD_MASK%,SOFT_MASK% +\end_layout + +\begin_layout Standard +If you have a stack of sequences in one file and you want to determine the + mean GC content you can do it using the +\series bold +mean_vals +\series default + biotool: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | mean_vals --keys=GC% +\end_layout + +\begin_layout Standard +Or if you want the total count of Ns you can use +\series bold +sum_vals +\series default + like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | sum_vals --keys=RES:N +\end_layout + +\begin_layout Standard +The MIX_INDEX key is calculated as the count of the most common residue + over the sequence length, and can be used as a cut-off for removing sequence + tags consisting of mostly one nucleotide: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | grab --eval='MIX_INDEX<0.85' +\end_layout + +\begin_layout Subsection +How to extract subsequences? +\begin_inset LatexCommand label +name "sub:How-to-extract" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to extract a subsequence from a longer sequence use the biotool + extract_seq, which will replace the sequence in the record with the subsequence + (this behaviour should probably be modified to be dependant of a +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +replace or a +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_replace switch +\begin_inset Note Note +status collapsed + +\begin_layout Standard +also in split_seq +\end_layout + +\end_inset + +). + So to extract the first 20 residues from all sequences do (first residue + is designated 1): +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=1 --len=20 +\end_layout + +\begin_layout Standard +You can also specify a begin and end coordinate set: +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=20 --end=40 +\end_layout + +\begin_layout Standard +If you want the subsequences from position 20 to the sequence end do: +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=20 +\end_layout + +\begin_layout Standard +If you want to extract subsequences a given distance from the sequence end + you can do this by reversing the sequence with the biotool +\series bold +reverse_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-reverse-seq" + +\end_inset + +, followed by +\series bold +extract_seq +\series default + to get the subsequence, and then +\series bold +reverse_seq +\series default + again to get the subsequence back in the original orientation: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | reverse_seq +\end_layout + +\begin_layout LyX-Code +| extract_seq --beg=10 --len=10 | reverse_seq +\end_layout + +\begin_layout Subsection +How to get genomic sequence? +\begin_inset LatexCommand label +name "sub:How-to-get-genomic-sequence" + +\end_inset + + +\end_layout + +\begin_layout Standard +The biotool +\series bold +get_genomic_seq +\series default + can extract subsequences for a given genome specified with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch explicitly using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +beg and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +end/ +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +len switches: +\end_layout + +\begin_layout LyX-Code +get_genome_seq --genome= --beg=1 --len=100 +\end_layout + +\begin_layout Standard +Alternatively, +\series bold +get_genome_seq +\series default + can be used to append the corresponding sequence to BED, PSL, and BLAST + records: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | get_genome_seq --genome= +\end_layout + +\begin_layout Standard +It is also possible to include flaking sequence using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +flank switch. + So to include 50 nucleotides upstream and 50 nucleotides downstream for + each BED entry do: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | get_genome_seq --genome= --flank=50 +\end_layout + +\begin_layout Subsection +How to upper-case sequences? +\end_layout + +\begin_layout Standard +Sequences can be shifted from lower case to upper case using +\series bold +uppercase_seq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | uppercase_seq +\end_layout + +\begin_layout Subsection +How to reverse sequences? +\begin_inset LatexCommand label +name "sub:How-to-reverse-seq" + +\end_inset + + +\end_layout + +\begin_layout Standard +The order of residues in a sequence can be reversed using reverse_seq: +\end_layout + +\begin_layout LyX-Code +... + | reverse_seq +\end_layout + +\begin_layout Standard +Note that in order to reverse/complement a sequence you also need the +\series bold +complement_seq +\series default + biotool (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-complement" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to complement sequences? +\begin_inset LatexCommand label +name "sub:How-to-complement" + +\end_inset + + +\end_layout + +\begin_layout Standard +DNA and RNA sequences can be complemented with +\series bold +complement_seq +\series default +, which automagically determines the sequence type: +\end_layout + +\begin_layout LyX-Code +... + | complement_seq +\end_layout + +\begin_layout Standard +Note that in order to reverse/complement a sequence you also need the +\series bold +reverse_seq +\series default + biotool (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-reverse-seq" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to remove indels from sequnces? +\end_layout + +\begin_layout Standard +Indels can be removed from sequences with the +\series bold +remove_indels +\series default + biotool. + This is useful if you have aligned some sequences (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-align" + +\end_inset + +) and extracted (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-extract" + +\end_inset + +) a block of subsequences from the alignment and you want to use these sequence + in a search where you need to remove the indels first. + '-', '~', and '.' are considered indels: +\end_layout + +\begin_layout LyX-Code +... + | remove_indels +\end_layout + +\begin_layout Subsection +How to shuffle sequences? +\end_layout + +\begin_layout Standard +All residues in sequences in the stream can be shuffled to random positions + using the +\series bold +shuffle_seq +\series default + biotool: +\end_layout + +\begin_layout LyX-Code +... + | shuffle_seq +\end_layout + +\begin_layout Subsection +How to split sequences into overlapping subsequences? +\end_layout + +\begin_layout Standard +Sequences can be slit into overlapping subsequences with the +\series bold +split_seq +\series default + biotool. +\end_layout + +\begin_layout LyX-Code +... + | split_seq --word_size=20 --uniq +\end_layout + +\begin_layout Subsection +How to determine the oligo frequency? +\end_layout + +\begin_layout Standard +In order to determine if any oligo usage is over represented in one or more + sequences you can determine the frequency of oligos of a given size with + +\series bold +oligo_freq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 +\end_layout + +\begin_layout Standard +And if you have more than one sequence and want to accumulate the frequences + you need the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +all switch: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 --all +\end_layout + +\begin_layout Standard +To get a meaningful result you need to write the resulting frequencies as + a table with +\series bold +write_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +), but first it is important to +\series bold +grab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +) the records with the frequencies to avoid full length sequences in the + table: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 --all | grab --pattern=OLIGO --keys_only +\end_layout + +\begin_layout LyX-Code +| write_tab --no_stream +\end_layout + +\begin_layout Standard +And the resulting frequency table can be sorted with Unix sort (man sort). +\end_layout + +\begin_layout Subsection +How to search for sequences in genomes? +\end_layout + +\begin_layout Standard +See the following biotool: +\end_layout + +\begin_layout Itemize + +\series bold +patscan_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +blat_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +blast_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +vmatch_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to search sequences for a pattern? +\begin_inset LatexCommand label +name "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Standard +It is possible to search sequences in the data stream for patterns using + the +\series bold +patscan_seq +\series default + biotool which utilizes the powerful scan_for_matches engine. + Consult the documentation for scan_for_matches in order to learn how to + define patterns (the documentation is included in Appendix\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sec:scan_for_matches-README" + +\end_inset + +). +\end_layout + +\begin_layout Standard +To search all sequences for a simple pattern consisting of the sequence + ATCGATCG allowing for 3 mismatches, 2 insertions and 1 deletion: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | patscan_seq --pattern='ATCGATCG[3,2,1]' +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch takes a comma seperated list of patterns, so if you want + to search for more that one pattern do: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern='ATCGATCG[3,2,1],GCTAGCTA[3,2,1]' +\end_layout + +\begin_layout Standard +It is also possible to have a list of different patterns to search for in + a file with one pattern per line. + In order to get +\series bold +patscan_seq +\series default + to read these patterns use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern_in= +\end_layout + +\begin_layout Standard +To also scan the complementary strand in nucleotide sequences ( +\series bold +patscan_seq +\series default + automagically determines the sequence type) you need to add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comp switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern= --comp +\end_layout + +\begin_layout Standard +It is also possible to use +\series bold +patscan_seq +\series default + to output those records that does not contain a certain pattern by using + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +invert switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern= --invert +\end_layout + +\begin_layout Standard +Finally, +\series bold +patscan_seq +\series default + can also scan for patterns in a given genome sequence, instead of sequences + in the stream, using the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch: +\end_layout + +\begin_layout LyX-Code +patscan --pattern= --genome= +\end_layout + +\begin_layout Subsection +How to use BLAT for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Standard +Sequences in the data stream can be matched against supported genomes using + +\series bold +blat_seq +\series default + which is a biotool using BLAT as the name might suggest. + Currently only Mouse and Human genomes are available and it is not possible + to use OOC files since there is still a need for a local repository for + genome files. + Otherwise it is just: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | blat_seq --genome= +\end_layout + +\begin_layout Standard +The search results can then be written to file with +\series bold +write_psl +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-PSL" + +\end_inset + +) or +\series bold +write_bed +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-BED" + +\end_inset + +) allthough with +\series bold +write_bed +\series default + some information will be lost). + It is also possible to plot chromosome distribution of the search results + using +\series bold +plot_chrdist +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-chrdist" + +\end_inset + +) or the distribution of the match lengths using +\series bold +plot_lendist +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-lendist" + +\end_inset + +) or a karyogram with the hits using +\series bold +plot_karyogram +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-karyogram" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to use BLAST for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Standard +Two biotools exist for blasting sequences: +\series bold +create_blast_db +\series default + is used to create the BLAST database required for BLAST which is queried + using the biotool +\series bold +blast_seq +\series default +. + So in order to create a BLAST database from sequences in the data stream + you simple run: +\end_layout + +\begin_layout LyX-Code +... + | create_blast_db --database=my_database --no_stream +\end_layout + +\begin_layout Standard +The type of sequence to use for the database is automagically determined + by +\series bold +create_blast_db +\series default +, but don't have a mixture of peptide and nucleic acids sequences in the + stream. + The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database switch takes a path as argument, but will default to 'blastdb_ if not set. +\end_layout + +\begin_layout Standard +The resulting database can now be queried with sequences in another data + stream using +\series bold +blast_seq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | blast_seq --database=my_database +\end_layout + +\begin_layout Standard +Again, the sequence type is determined automagically and the appropriate + BLAST program is guessed (see below table), however, the program name can + be overruled with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +program switch. +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Standard +Subject sequence +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Query sequence +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Program guess +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastn +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastp +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastx +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +tblastn +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +Finally, it is also possible to use +\series bold +blast_seq +\series default + for blasting sequences agains a preformatted genome using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch instead of the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database switch: +\end_layout + +\begin_layout LyX-Code +... + | blast_seq --genome= +\end_layout + +\begin_layout Subsection +How to use Vmatch for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Standard +The powerful suffix array software package Vmatch +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.vmatch.de/" + +\end_inset + + +\end_layout + +\end_inset + + can be used for exact mapping of sequences against indexed genomes using + the biotool +\series bold +vmatch_seq +\series default +, which will e.g. + map 700000 ESTs to the human genome locating all 160 mio hits in less than + an hour. + Only nucleotide sequences and sequences longer than 11 nucleotides will + be mapped. + It is recommended that sequences consisting of mostly one nucleotide type + are removed. + This can be done with the +\series bold +analyze_seq +\series default + biotool +\begin_inset LatexCommand eqref +reference "sub:How-to-analyze" + +\end_inset + +. +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= +\end_layout + +\begin_layout Standard +It is also possible to allow for mismatches using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +hamming_dist switch. + So to allow for 2 mismatches: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=2 +\end_layout + +\begin_layout Standard +Or to allow for 10% mismathing nucleotides: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=10p +\end_layout + +\begin_layout Standard +To allow both indels and mismatches use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +edit_dist switch. + So to allow for one mismatch or one indel: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=1 +\end_layout + +\begin_layout Standard +Or to allow for 5% indels or mismatches: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=5p +\end_layout + +\begin_layout Standard +Note that using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +hamming_dist or +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +edit_dist greatly slows down vmatch considerably --- use with care. +\end_layout + +\begin_layout Standard +The resulting SCORE key can be replaced to hold the number of genome matches + of a given sequence (multi-mappers) is the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +count switch is given. +\end_layout + +\begin_layout Subsection +How to find all matches between sequences? +\begin_inset LatexCommand label +name "sub:How-to-find-matches" + +\end_inset + + +\end_layout + +\begin_layout Standard +All matches between two sequences can be determined with the biotool +\series bold +match_seq +\series default +. + The match finding engine underneath the hood of +\series bold +match_seq +\series default + is the super fast suffix tree program MUMmer +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://mummer.sourceforge.net/" + +\end_inset + + +\end_layout + +\end_inset + +, which will locate all forward and reverse matches between huge sequences + in a matter of minutes (if the repeat count is not too high and if the + word size used is appropriate). + Matching two +\emph on +Helicobacter pylori +\emph default + genomes (1.7Mbp) takes around 10 seconds: +\end_layout + +\begin_layout LyX-Code +... + | match_seq --word_size=20 --direction=both +\end_layout + +\begin_layout Standard +The output from +\series bold +match_seq +\series default + can be used to generate a dot plot with +\series bold +plot_matches +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-generate-dotplot" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to align sequences? +\begin_inset LatexCommand label +name "sub:How-to-align" + +\end_inset + + +\end_layout + +\begin_layout Standard +Sequences in the stream can be aligned with the +\series bold +align_seq +\series default + biotool that uses Muscle +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.drive5.com/muscle/muscle.html" + +\end_inset + + +\end_layout + +\end_inset + + as aligment engine. + Currently you cannot change any of the Muscle alignment parameters and + +\series bold +align_seq +\series default + will create an alignment based on the defaults (which are really good!): +\end_layout + +\begin_layout LyX-Code +... + | align_seq +\end_layout + +\begin_layout Standard +The aligned output can be written to file in FASTA format using +\series bold +write_fasta +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-fasta" + +\end_inset + +) or in pretty text using +\series bold +write_align +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-alignment" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to create a weight matrix? +\end_layout + +\begin_layout Standard +If you want a weight matrix to show the sequence composition of a stack + of sequences you can use the biotool create_weight_matrix: +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix +\end_layout + +\begin_layout Standard +The result can be output in percent using the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +percent switch: +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix --percent +\end_layout + +\begin_layout Standard +The weight matrix can be written as tabular output with +\series bold +write_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +) after removeing the records containing SEQ with +\series bold +grab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +): +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix | grab --invert --keys=SEQ --keys_only +\end_layout + +\begin_layout LyX-Code +| write_tab --no_stream +\end_layout + +\begin_layout Standard +The V0 column will hold the residue, while the rest of the columns will + hold the frequencies for each sequence position. +\end_layout + +\begin_layout Section +Plotting +\end_layout + +\begin_layout Standard +There exists several biotools for plotting. + Some of these are based on GNUplot +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.gnuplot.info/" + +\end_inset + + +\end_layout + +\end_inset + +, which is an extremely powerful platform to generate all sorts of plots + and even though GNUplot has quite a steep learning curve, the biotools + utilizing GNUplot are simple to use. + GNUplot is able to output a lot of different formats (called terminals + in GNUplot), but the biotools focusses on three formats only: +\end_layout + +\begin_layout Enumerate +The 'dumb' terminal is default to the GNUplot based biotools and will output + a plot in crude ASCII text (Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dumb-terminal" + +\end_inset + +). + This is quite nice for a quick and dirty plot to get an overview of your + data . +\end_layout + +\begin_layout Enumerate +The 'post' or 'postscript' terminal output postscript code which is publication + grade graphics that can be viewed with applications such as Ghostview, + Photoshop, and Preview. +\end_layout + +\begin_layout Enumerate +The 'svg' terminal output's scalable vector graphics (SVG) which is a vector + based format. + SVG is great because you can edit the resulting plot using Photoshop or + Inkscape +\begin_inset Foot +status collapsed + +\begin_layout Standard +Inkscape is a really handy drawing program that is free and open source. + Availble at +\begin_inset LatexCommand htmlurl +target "http://www.inkscape.org" + +\end_inset + + +\end_layout + +\end_inset + + if you want to add additional labels, captions, arrows, and so on and then + save the result in different formats, such as postscript without loosing + resolution. +\end_layout + +\begin_layout Standard +The biotools for plotting that are not based on GNUplot only output SVG + (that may change in the future). +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename lendist_ascii.png + lyxscale 70 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Dumb-terminal" + +\end_inset + +Dumb terminal +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +The output of a length distribution plot in the default 'dumb terminal' + to the terminal window. + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a histogram? +\begin_inset LatexCommand label +name "How-to-plot-histogram" + +\end_inset + + +\end_layout + +\begin_layout Standard +A generic histogram for a given value can be plotted with the biotool +\series bold +plot_histogram +\series default + (Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Histogram" + +\end_inset + +): +\end_layout + +\begin_layout LyX-Code +... + | plot_histogram --key=TISSUE --no_stream +\end_layout + +\begin_layout Standard +(Figure missing) +\end_layout + +\begin_layout Standard +\noindent +\align left +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename histogram.png + lyxscale 70 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Histogram" + +\end_inset + +Histogram +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a length distribution? +\begin_inset LatexCommand label +name "sub:How-to-plot-lendist" + +\end_inset + + +\end_layout + +\begin_layout Standard +Plotting of length distributions, weather sequence lengths, patterns lengths, + hit lengths, +\emph on +etc. + +\emph default + is a really handy thing and can be done with the the biotool +\series bold +plot_lendist +\series default +. + If you have a file with FASTA entries and want to plot the length distribution + you do it like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_seq +\end_layout + +\begin_layout LyX-Code +| plot_lendist --key=SEQ_LEN --no_stream +\end_layout + +\begin_layout Standard +The result will be written to the default dumb terminal and will look like + Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dumb-terminal" + +\end_inset + +. +\end_layout + +\begin_layout Standard +If you instead want the result in postscript format you can do: +\end_layout + +\begin_layout LyX-Code +... + | plot_lendist --key=SEQ_LEN --terminal=post --result_out=file.ps +\end_layout + +\begin_layout Standard +That will generate the plot and save it to file, but not interrupt the data + stream which can then be used in further analysis. + You can also save the plot implicetly using '>', however, it is then important + to terminate the stream with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch: +\end_layout + +\begin_layout LyX-Code +... + | plot_lendist --key=SEQ_LEN --terminal=post --no_stream > file.ps +\end_layout + +\begin_layout Standard +The resulting plot can be seen in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Length-distribution" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename lendist.ps + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Length-distribution" + +\end_inset + +Length distribution +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Length distribution of 630 piRNA like RNAs. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a chromosome distribution? +\begin_inset LatexCommand label +name "sub:How-to-plot-chrdist" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you have the result of a sequence search against a multi chromosome genome, + it is very practical to be able to plot the distribution of search hits + on the different chromosomes. + This can be done with +\series bold +plot_chrdist +\series default +: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | blat_genome | plot_chrdist --no_stream +\end_layout + +\begin_layout Standard +The above example will result in a crude plot using the 'dumb' terminal, + and if you want to mess around with the results from the BLAT search you + probably want to save the result to file first (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-PSL" + +\end_inset + +). + To plot the chromosome distribution from the saved search result you can + do: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in=file.bed | plot_chrdist --terminal=post --result_out=plot.ps +\end_layout + +\begin_layout Standard +That will result in the output show in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Chromosome-distribution" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename chrdist.ps + lyxscale 50 + width 12cm + rotateAngle 90 + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Chromosome-distribution" + +\end_inset + +Chromosome distribution +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to generate a dotplot? +\begin_inset LatexCommand label +name "sub:How-to-generate-dotplot" + +\end_inset + + +\end_layout + +\begin_layout Standard +A dotplot is a powerful way to get an overview of the size and location + of sequence insertions, deletions, and duplications between two sequences. + Generating a dotplot with biotools is a two step process where you initially + find all matches between two sequences using the tool +\series bold +match_seq +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-find-matches" + +\end_inset + +) and plot the resulting matches with +\series bold +plot_matches +\series default +. + Matching and plotting two +\emph on +Helicobacter pylori +\emph default + genomes (1.7Mbp) takes around 10 seconds: +\end_layout + +\begin_layout LyX-Code +... + | match_seq | plot_matches --terminal=post --result_out=plot.ps +\end_layout + +\begin_layout Standard +The resulting dotplot is in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dotplot" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename dotplot.ps + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Dotplot" + +\end_inset + +Dotplot +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Forward matches are displayed in green while reverse matches are displayed + in red. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a sequence logo? +\end_layout + +\begin_layout Standard +Sequence logos can be generate with +\series bold +plot_seqlogo +\series default +. + The sequnce type is determined automagically and an entropy scale of 2 + bits and 4 bits is used for nucleotide and peptide sequences, respectively +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand htmlurl +target "http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html" + +\end_inset + + +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout LyX-Code +... + | plot_seqlogo --no_stream --result_out=seqlogo.svg +\end_layout + +\begin_layout Standard +An example of a sequence logo can be seen in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Sequence-logo" + +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename seqlogo.png + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Sequence-logo" + +\end_inset + +Sequence logo +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a karyogram? +\begin_inset LatexCommand label +name "sub:How-to-plot-karyogram" + +\end_inset + + +\end_layout + +\begin_layout Standard +To plot search hits on genomes use +\series bold +plot_karyogram +\series default +, which will output a nice karyogram in SVG graphics: +\end_layout + +\begin_layout LyX-Code +... + | plot_karyogram --result_out=karyogram.svg +\end_layout + +\begin_layout Standard +The banding data is taken from the UCSC genome browser database and currently + only Human and Mouse is supported. + Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Karyogram" + +\end_inset + + shows the distribution of piRNA like RNAs matched to the Human genome. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename karyogram.png + lyxscale 35 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Karyogram" + +\end_inset + +Karyogram +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Hits from a search of piRNA like RNAs in the Human genome is displayed as + short horizontal bars. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +Uploading Results +\end_layout + +\begin_layout Subsection +How do I display my results in the UCSC Genome Browser? +\end_layout + +\begin_layout Standard +Results from the list of biotools below can be uploaded directly to a local + mirror of the UCSC Genome Browser using the biotool +\series bold +upload_to_ucsc +\series default +: +\end_layout + +\begin_layout Itemize +patscan_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Itemize +blat_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Itemize +blast_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Itemize +vmatch_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Standard +The syntax for uploading data the most simple way requires two mandatory + switches: +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database, which is the UCSC database name (such as hg18, mm9, etc.) and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +table which should be the users initials followed by an underscore and a + short description of the data: +\end_layout + +\begin_layout LyX-Code +... + | upload_to_ucsc --database=hg18 --table=mah_snoRNAs +\end_layout + +\begin_layout Standard +The +\series bold +upload_to_ucsc +\series default + biotool modifies the users ~/ucsc/my_tracks.ra file automagically (a backup + is created with the name ~/ucsc/my_tracks.ra~) with default values that + can be overridden using the following switches: +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +short_label - Short label for track - Default=database->table +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +long_label - Long label for track - Default=database->table +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +group - Track group name - Default= +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +priority - Track display priority - Default=1 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +color - Track color - Default=147,73,42 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +chunk_size - Chunks for loading - Default=10000000 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +visibility - Track visibility - Default=pack +\end_layout + +\begin_layout Standard +Also, data in BED or PSL format can be uploaded with +\series bold +upload_to_ucsc +\series default + as long as these reference to genomes and chromosomes existing in the UCSC + Genome Browser: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | upload_to_ucsc ... +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +read_psl --data_in= | upload_to_ucsc ... +\end_layout + +\begin_layout Section +Power Scripting +\end_layout + +\begin_layout Standard +It is possible to do commandline scripting of biotool records using Perl. + Because a biotool record essentially is a hash structure, you can pass + records to +\series bold +bioscript +\series default + command, which is a wrapper around the Perl executable that allows direct + manipulations of the records using the power of Perl. +\end_layout + +\begin_layout Standard +In the below example we replace in all records the value to the CHR key + with a forthrunning number: +\end_layout + +\begin_layout LyX-Code +... + | bioscript 'while($r=get_record( +\backslash +*STDIN)){$r->{CHR}=$i++; put_record($r)}' +\end_layout + +\begin_layout Standard +Something more useful would probably be to create custom FASTA headers. + E.g. + if we read in a BED file, lookup the genomic sequence, create a custom + FASTA header with +\series bold +bioscript +\series default + and output FASTA entries: +\end_layout + +\begin_layout LyX-Code +... + | bioscript 'while($r=get_record( +\backslash +*STDIN)){$r->{SEQ_NAME}= // +\end_layout + +\begin_layout LyX-Code +join("_",$r->{CHR},$r->{CHR_BEG},$r->{CHR_END}); put_record($r)}' +\end_layout + +\begin_layout Standard +And the output: +\end_layout + +\begin_layout LyX-Code +>chr2L_21567527_21567550 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_693380_693403 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_13859534_13859557 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_9005090_9005113 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_2106825_2106848 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_14649031_14649054 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout Section +Trouble shooting +\end_layout + +\begin_layout Standard +Shoot the messenger! +\end_layout + +\begin_layout Section +\start_of_appendix +Keys +\begin_inset LatexCommand label +name "sec:Keys" + +\end_inset + + +\end_layout + +\begin_layout Standard +HIT +\end_layout + +\begin_layout Standard +HIT_BEG +\end_layout + +\begin_layout Standard +HIT_END +\end_layout + +\begin_layout Standard +HIT_LEN +\end_layout + +\begin_layout Standard +HIT_NAME +\end_layout + +\begin_layout Standard +PATTERN +\end_layout + +\begin_layout Section +Switches +\begin_inset LatexCommand label +name "sec:Switches" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_in +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +data_in +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num +\end_layout + +\begin_layout Section +scan_for_matches README +\begin_inset LatexCommand label +name "sec:scan_for_matches-README" + +\end_inset + + +\end_layout + +\begin_layout LyX-Code + scan_for_matches: +\end_layout + +\begin_layout LyX-Code + A Program to Scan Nucleotide or Protein Sequences for Matching Patterns +\end_layout + +\begin_layout LyX-Code + Ross Overbeek +\end_layout + +\begin_layout LyX-Code + MCS +\end_layout + +\begin_layout LyX-Code + Argonne National Laboratory +\end_layout + +\begin_layout LyX-Code + Argonne, IL 60439 +\end_layout + +\begin_layout LyX-Code + USA +\end_layout + +\begin_layout LyX-Code +Scan_for_matches is a utility that we have written to search for +\end_layout + +\begin_layout LyX-Code +patterns in DNA and protein sequences. + I wrote most of the code, +\end_layout + +\begin_layout LyX-Code +although David Joerg and Morgan Price wrote sections of an +\end_layout + +\begin_layout LyX-Code +earlier version. + The whole notion of pattern matching has a rich +\end_layout + +\begin_layout LyX-Code +history, and we borrowed liberally from many sources. + However, it is +\end_layout + +\begin_layout LyX-Code +worth noting that we were strongly influenced by the elegant tools +\end_layout + +\begin_layout LyX-Code +developed and distributed by David Searls. + My intent is to make the +\end_layout + +\begin_layout LyX-Code +existing tool available to anyone in the research community that might +\end_layout + +\begin_layout LyX-Code +find it useful. + I will continue to try to fix bugs and make suggested +\end_layout + +\begin_layout LyX-Code +enhancements, at least until I feel that a superior tool exists. +\end_layout + +\begin_layout LyX-Code +Hence, I would appreciate it if all bug reports and suggestions are +\end_layout + +\begin_layout LyX-Code +directed to me at Overbeek@mcs.anl.gov. + +\end_layout + +\begin_layout LyX-Code +I will try to log all bug fixes and report them to users that send me +\end_layout + +\begin_layout LyX-Code +their email addresses. + I do not require that you give me your name +\end_layout + +\begin_layout LyX-Code +and address. + However, if you do give it to me, I will try to notify +\end_layout + +\begin_layout LyX-Code +you of serious problems as they are discovered. +\end_layout + +\begin_layout LyX-Code +Getting Started: +\end_layout + +\begin_layout LyX-Code + The distribution should contain at least the following programs: +\end_layout + +\begin_layout LyX-Code + README - This document +\end_layout + +\begin_layout LyX-Code + ggpunit.c - One of the two source files +\end_layout + +\begin_layout LyX-Code + scan_for_matches.c - The second source file +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + run_tests - A perl script to test things +\end_layout + +\begin_layout LyX-Code + show_hits - A handy perl script +\end_layout + +\begin_layout LyX-Code + test_dna_input - Test sequences for DNA +\end_layout + +\begin_layout LyX-Code + test_dna_patterns - Test patterns for DNA scan +\end_layout + +\begin_layout LyX-Code + test_output - Desired output from test +\end_layout + +\begin_layout LyX-Code + test_prot_input - Test protein sequences +\end_layout + +\begin_layout LyX-Code + test_prot_patterns - Test patterns for proteins +\end_layout + +\begin_layout LyX-Code + testit - a perl script used for test +\end_layout + +\begin_layout LyX-Code + Only the first three files are required. + The others are useful, +\end_layout + +\begin_layout LyX-Code + but only if you have Perl installed on your system. + If you do +\end_layout + +\begin_layout LyX-Code + have Perl, I suggest that you type +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + which perl +\end_layout + +\begin_layout LyX-Code + to find out where it installed. + On my system, I get the following +\end_layout + +\begin_layout LyX-Code + response: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + clone% which perl +\end_layout + +\begin_layout LyX-Code + /usr/local/bin/perl +\end_layout + +\begin_layout LyX-Code + indicating that Perl is installed in /usr/local/bin. + Anyway, once +\end_layout + +\begin_layout LyX-Code + you know where it is installed, edit the first line of files +\end_layout + +\begin_layout LyX-Code + testit +\end_layout + +\begin_layout LyX-Code + show_hits +\end_layout + +\begin_layout LyX-Code + replacing /usr/local/bin/perl with the appropriate location. + I +\end_layout + +\begin_layout LyX-Code + will assume that you can do this, although it is not critical (it +\end_layout + +\begin_layout LyX-Code + is needed only to test the installation and to use the "show_hits" +\end_layout + +\begin_layout LyX-Code + utility). + Perl is not required to actually install and run +\end_layout + +\begin_layout LyX-Code + scan_for_matches. + +\end_layout + +\begin_layout LyX-Code + If you do not have Perl, I suggest you get it and install it (it +\end_layout + +\begin_layout LyX-Code + is a wonderful utility). + Information about Perl and how to get it +\end_layout + +\begin_layout LyX-Code + can be found in the book "Programming Perl" by Larry Wall and +\end_layout + +\begin_layout LyX-Code + Randall L. + Schwartz, published by O'Reilly & Associates, Inc. +\end_layout + +\begin_layout LyX-Code + To get started, you will need to compile the program. + I do this +\end_layout + +\begin_layout LyX-Code + using +\end_layout + +\begin_layout LyX-Code + gcc -O -o scan_for_matches ggpunit.c scan_for_matches.c +\end_layout + +\begin_layout LyX-Code + If you do not use GNU C, use +\end_layout + +\begin_layout LyX-Code + cc -O -DCC -o scan_for_matches ggpunit.c scan_for_matches.c +\end_layout + +\begin_layout LyX-Code + which works on my Sun. + +\end_layout + +\begin_layout LyX-Code + Once you have compiled scan_for_matches, you can verify that it +\end_layout + +\begin_layout LyX-Code + works with +\end_layout + +\begin_layout LyX-Code + clone% run_tests tmp +\end_layout + +\begin_layout LyX-Code + clone% diff tmp test_output +\end_layout + +\begin_layout LyX-Code + You may get a few strange lines of the sort +\end_layout + +\begin_layout LyX-Code + clone% run_tests tmp +\end_layout + +\begin_layout LyX-Code + rm: tmp: No such file or directory +\end_layout + +\begin_layout LyX-Code + clone% diff tmp test_output +\end_layout + +\begin_layout LyX-Code + These should cause no concern. + However, if the "diff" shows that +\end_layout + +\begin_layout LyX-Code + tmp and test_output are different, contact me (you have a +\end_layout + +\begin_layout LyX-Code + problem). + +\end_layout + +\begin_layout LyX-Code + You should now be able to use scan_for_matches by following the +\end_layout + +\begin_layout LyX-Code + instructions given below (which is all the normal user should have +\end_layout + +\begin_layout LyX-Code + to understand, once things are installed properly). +\end_layout + +\begin_layout LyX-Code + ============================================================== +\end_layout + +\begin_layout LyX-Code +How to run scan_for_matches: +\end_layout + +\begin_layout LyX-Code + To run the program, you type need to create two files +\end_layout + +\begin_layout LyX-Code + 1. + the first file contains the pattern you wish to scan for; I'll +\end_layout + +\begin_layout LyX-Code + call this file pat_file in what follows (but any name is ok) +\end_layout + +\begin_layout LyX-Code + 2. + the second file contains a set of sequences to scan. + These +\end_layout + +\begin_layout LyX-Code + should be in "fasta format". + Just look at the contents of +\end_layout + +\begin_layout LyX-Code + test_dna_input to see examples of this format. + Basically, +\end_layout + +\begin_layout LyX-Code + each sequence begins with a line of the form +\end_layout + +\begin_layout LyX-Code + >sequence_id +\end_layout + +\begin_layout LyX-Code + and is followed by one or more lines containing the sequence. +\end_layout + +\begin_layout LyX-Code + Once these files have been created, you just use +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file < input_file +\end_layout + +\begin_layout LyX-Code + to scan all of the input sequences for the given pattern. + As an +\end_layout + +\begin_layout LyX-Code + example, suppose that pat_file contains a single line of the form +\end_layout + +\begin_layout LyX-Code + p1=4...7 3...8 ~p1 +\end_layout + +\begin_layout LyX-Code + Then, +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + should produce two "hits". + When I run this on my machine, I get +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + >tst1:[6,27] +\end_layout + +\begin_layout LyX-Code + cguaacc ggttaacc gguuacg +\end_layout + +\begin_layout LyX-Code + >tst2:[6,27] +\end_layout + +\begin_layout LyX-Code + CGUAACC GGTTAACC GGUUACG +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code +Simple Patterns Built by Matching Ranges and Reverse Complements +\end_layout + +\begin_layout LyX-Code + Let me first explain this simple pattern: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + p1=4...7 3...8 ~p1 +\end_layout + +\begin_layout LyX-Code + The pattern consists of three "pattern units" separated by spaces. +\end_layout + +\begin_layout LyX-Code + The first pattern unit is +\end_layout + +\begin_layout LyX-Code + p1=4...7 +\end_layout + +\begin_layout LyX-Code + which means "match 4 to 7 characters and call them p1". + The +\end_layout + +\begin_layout LyX-Code + second pattern unit is +\end_layout + +\begin_layout LyX-Code + 3...8 +\end_layout + +\begin_layout LyX-Code + which means "then match 3 to 8 characters". + The last pattern unit +\end_layout + +\begin_layout LyX-Code + is +\end_layout + +\begin_layout LyX-Code + ~p1 +\end_layout + +\begin_layout LyX-Code + which means "match the reverse complement of p1". + The first +\end_layout + +\begin_layout LyX-Code + reported hit is shown as +\end_layout + +\begin_layout LyX-Code + >tst1:[6,27] +\end_layout + +\begin_layout LyX-Code + cguaacc ggttaacc gguuacg +\end_layout + +\begin_layout LyX-Code + which states that characters 6 through 27 of sequence tst1 were +\end_layout + +\begin_layout LyX-Code + matched. + "cguaac" matched the first pattern unit, "ggttaacc" the +\end_layout + +\begin_layout LyX-Code + second, and "gguuacg" the third. + This is an example of a common +\end_layout + +\begin_layout LyX-Code + type of pattern used to search for sections of DNA or RNA that +\end_layout + +\begin_layout LyX-Code + would fold into a hairpin loop. +\end_layout + +\begin_layout LyX-Code +Searching Both Strands +\end_layout + +\begin_layout LyX-Code + Now for a short aside: scan_for_matches only searched the +\end_layout + +\begin_layout LyX-Code + sequences in the input file; it did not search the opposite +\end_layout + +\begin_layout LyX-Code + strand. + With a pattern of the sort we just used, there is not +\end_layout + +\begin_layout LyX-Code + need o search the opposite strand. + However, it is normally the +\end_layout + +\begin_layout LyX-Code + case that you will wish to search both the sequence and the +\end_layout + +\begin_layout LyX-Code + opposite strand (i.e., the reverse complement of the sequence). +\end_layout + +\begin_layout LyX-Code + To do that, you would just use the "-c" command line. + For example, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + Hits on the opposite strand will show a beginning location greater +\end_layout + +\begin_layout LyX-Code + than te end location of the match. +\end_layout + +\begin_layout LyX-Code +Defining Pairing Rules and Allowing Mismatches, Insertions, and Deletions +\end_layout + +\begin_layout LyX-Code + Let us stop now and ask "What additional features would one need to +\end_layout + +\begin_layout LyX-Code + really find the kinds of loop structures that characterize tRNAs, +\end_layout + +\begin_layout LyX-Code + rRNAs, and so forth?" I can immediately think of two: +\end_layout + +\begin_layout LyX-Code + a) you will need to be able to allow non-standard pairings +\end_layout + +\begin_layout LyX-Code + (those other than G-C and A-U), and +\end_layout + +\begin_layout LyX-Code + b) you will need to be able to tolerate some number of +\end_layout + +\begin_layout LyX-Code + mismatches and bulges. +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + Let me first show you how to handle non-standard "rules for +\end_layout + +\begin_layout LyX-Code + pairing in reverse complements". + Consider the following pattern, +\end_layout + +\begin_layout LyX-Code + which I show as two line (you may use as many lines as you like in +\end_layout + +\begin_layout LyX-Code + forming a pattern, although you can only break a pattern at points +\end_layout + +\begin_layout LyX-Code + where space would be legal): +\end_layout + +\begin_layout LyX-Code + r1={au,ua,gc,cg,gu,ug,ga,ag} +\end_layout + +\begin_layout LyX-Code + p1=2...3 0...4 p2=2...5 1...5 r1~p2 0...4 ~p1 +\end_layout + +\begin_layout LyX-Code + The first "pattern unit" does not actually match anything; rather, +\end_layout + +\begin_layout LyX-Code + it defines a "pairing rule" in which standard pairings are +\end_layout + +\begin_layout LyX-Code + allowed, as well as G-A and A-G (in case you wondered, Us and Ts +\end_layout + +\begin_layout LyX-Code + and upper and lower case can be used interchangably; for example +\end_layout + +\begin_layout LyX-Code + r1={AT,UA,gc,cg} could be used to define the "standard rule" for +\end_layout + +\begin_layout LyX-Code + pairings). + The second line consists of six pattern units which +\end_layout + +\begin_layout LyX-Code + may be interpreted as follows: +\end_layout + +\begin_layout LyX-Code + p1=2...3 match 2 or 3 characters (call it p1) +\end_layout + +\begin_layout LyX-Code + 0...4 match 0 to 4 characters +\end_layout + +\begin_layout LyX-Code + p2=2...5 match 2 to 5 characters (call it p2) +\end_layout + +\begin_layout LyX-Code + 1...5 match 1 to 5 characters +\end_layout + +\begin_layout LyX-Code + r1~p2 match the reverse complement of p2, +\end_layout + +\begin_layout LyX-Code + allowing G-A and A-G pairs +\end_layout + +\begin_layout LyX-Code + 0...4 match 0 to 4 characters +\end_layout + +\begin_layout LyX-Code + ~p1 match the reverse complement of p1 +\end_layout + +\begin_layout LyX-Code + allowing only G-C, C-G, A-T, and T-A pairs +\end_layout + +\begin_layout LyX-Code + Thus, r1~p2 means "match the reverse complement of p2 using rule r1". +\end_layout + +\begin_layout LyX-Code + Now let us consider the issue of tolerating mismatches and bulges. +\end_layout + +\begin_layout LyX-Code + You may add a "qualifier" to the pattern unit that gives the +\end_layout + +\begin_layout LyX-Code + tolerable number of "mismatches, deletions, and insertions". +\end_layout + +\begin_layout LyX-Code + Thus, +\end_layout + +\begin_layout LyX-Code + p1=10...10 3...8 ~p1[1,2,1] +\end_layout + +\begin_layout LyX-Code + means that the third pattern unit must match 10 characters, +\end_layout + +\begin_layout LyX-Code + allowing one "mismatch" (a pairing other than G-C, C-G, A-T, or +\end_layout + +\begin_layout LyX-Code + T-A), two deletions (a deletion is a character that occurs in p1, +\end_layout + +\begin_layout LyX-Code + but has been "deleted" from the string matched by ~p1), and one +\end_layout + +\begin_layout LyX-Code + insertion (an "insertion" is a character that occurs in the string +\end_layout + +\begin_layout LyX-Code + matched by ~p1, but not for which no corresponding character +\end_layout + +\begin_layout LyX-Code + occurs in p1). + In this case, the pattern would match +\end_layout + +\begin_layout LyX-Code + ACGTACGTAC GGGGGGGG GCGTTACCT +\end_layout + +\begin_layout LyX-Code + which is, you must admit, a fairly weak loop. + It is common to +\end_layout + +\begin_layout LyX-Code + allow mismatches, but you will find yourself using insertions and +\end_layout + +\begin_layout LyX-Code + deletions much more rarely. + In any event, you should note that +\end_layout + +\begin_layout LyX-Code + allowing mismatches, insertions, and deletions does force the +\end_layout + +\begin_layout LyX-Code + program to try many additional possible pairings, so it does slow +\end_layout + +\begin_layout LyX-Code + things down a bit. +\end_layout + +\begin_layout LyX-Code +How Patterns Are Matched +\end_layout + +\begin_layout LyX-Code + Now is as good a time as any to discuss the basic flow of control +\end_layout + +\begin_layout LyX-Code + when matching patterns. + Recall that a "pattern" is a sequence of +\end_layout + +\begin_layout LyX-Code + "pattern units". + Suppose that the pattern units were +\end_layout + +\begin_layout LyX-Code + u1 u2 u3 u4 ... + un +\end_layout + +\begin_layout LyX-Code + The scan of a sequence S begins by setting the current position +\end_layout + +\begin_layout LyX-Code + to 1. + Then, an attempt is made to match u1 starting at the +\end_layout + +\begin_layout LyX-Code + current position. + Each attempt to match a pattern unit can +\end_layout + +\begin_layout LyX-Code + succeed or fail. + If it succeeds, then an attempt is made to match +\end_layout + +\begin_layout LyX-Code + the next unit. + If it fails, then an attempt is made to find an +\end_layout + +\begin_layout LyX-Code + alternative match for the immediately preceding pattern unit. + If +\end_layout + +\begin_layout LyX-Code + this succeeds, then we proceed forward again to the next unit. + If +\end_layout + +\begin_layout LyX-Code + it fails we go back to the preceding unit. + This process is called +\end_layout + +\begin_layout LyX-Code + "backtracking". + If there are no previous units, then the current +\end_layout + +\begin_layout LyX-Code + position is incremented by one, and everything starts again. + This +\end_layout + +\begin_layout LyX-Code + proceeds until either the current position goes past the end of +\end_layout + +\begin_layout LyX-Code + the sequence or all of the pattern units succeed. + On success, +\end_layout + +\begin_layout LyX-Code + scan_for_matches reports the "hit", the current position is set +\end_layout + +\begin_layout LyX-Code + just past the hit, and an attempt is made to find another hit. +\end_layout + +\begin_layout LyX-Code + If you wish to limit the scan to simply finding a maximum of, say, +\end_layout + +\begin_layout LyX-Code + 10 hits, you can use the -n option (-n 10 would set the limit to +\end_layout + +\begin_layout LyX-Code + 10 reported hits). + For example, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c -n 1 pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + would search for just the first hit (and would stop searching the +\end_layout + +\begin_layout LyX-Code + current sequences or any that follow in the input file). +\end_layout + +\begin_layout LyX-Code +Searching for repeats: +\end_layout + +\begin_layout LyX-Code + In the last section, I discussed almost all of the details +\end_layout + +\begin_layout LyX-Code + required to allow you to look for repeats. + Consider the following +\end_layout + +\begin_layout LyX-Code + set of patterns: +\end_layout + +\begin_layout LyX-Code + p1=6...6 3...8 p1 (find exact 6 character repeat separated +\end_layout + +\begin_layout LyX-Code + by to 8 characters) +\end_layout + +\begin_layout LyX-Code + p1=6...6 3..8 p1[1,0,0] (allow one mismatch) +\end_layout + +\begin_layout LyX-Code + p1=3...3 p1[1,0,0] p1[1,0,0] p1[1,0,0] +\end_layout + +\begin_layout LyX-Code + (match 12 characters that are the remains +\end_layout + +\begin_layout LyX-Code + of a 3-character sequence occurring 4 times) +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + p1=4...8 0...3 p2=6...8 p1 0...3 p2 +\end_layout + +\begin_layout LyX-Code + (This would match things like +\end_layout + +\begin_layout LyX-Code + ATCT G TCTTT ATCT TG TCTTT +\end_layout + +\begin_layout LyX-Code + ) +\end_layout + +\begin_layout LyX-Code +Searching for particular sequences: +\end_layout + +\begin_layout LyX-Code + Occasionally, one wishes to match a specific, known sequence. +\end_layout + +\begin_layout LyX-Code + In such a case, you can just give the sequence (along with an +\end_layout + +\begin_layout LyX-Code + optional statement of the allowable mismatches, insertions, and +\end_layout + +\begin_layout LyX-Code + deletions). + Thus, +\end_layout + +\begin_layout LyX-Code + p1=6...8 GAGA ~p1 (match a hairpin with GAGA as the loop) +\end_layout + +\begin_layout LyX-Code + RRRRYYYY (match 4 purines followed by 4 pyrimidines) +\end_layout + +\begin_layout LyX-Code + TATAA[1,0,0] (match TATAA, allowing 1 mismatch) +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +Matches against a "weight matrix": +\end_layout + +\begin_layout LyX-Code + I will conclude my examples of the types of pattern units +\end_layout + +\begin_layout LyX-Code + available for matching against nucleotide sequences by discussing a +\end_layout + +\begin_layout LyX-Code + crude implemetation of matching using a "weight matrix". + While I +\end_layout + +\begin_layout LyX-Code + am less than overwhelmed with the syntax that I chose, I think that +\end_layout + +\begin_layout LyX-Code + the reader should be aware that I was thinking of generating +\end_layout + +\begin_layout LyX-Code + patterns containing such pattern units automatically from +\end_layout + +\begin_layout LyX-Code + alignments (and did not really plan on typing such things in by +\end_layout + +\begin_layout LyX-Code + hand very often). + Anyway, suppose that you wanted to match a +\end_layout + +\begin_layout LyX-Code + sequence of eight characters. + The "consensus" of these eight +\end_layout + +\begin_layout LyX-Code + characters is GRCACCGS, but the actual "frequencies of occurrence" +\end_layout + +\begin_layout LyX-Code + are given in the matrix below. + Thus, the first character is an A +\end_layout + +\begin_layout LyX-Code + 16% the time and a G 84% of the time. + The second is an A 57% of +\end_layout + +\begin_layout LyX-Code + the time, a C 10% of the time, a G 29% of the time, and a T 4% of +\end_layout + +\begin_layout LyX-Code + the time. + +\end_layout + +\begin_layout LyX-Code + C1 C2 C3 C4 C5 C6 C7 C8 +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + A 16 57 0 95 0 18 0 0 +\end_layout + +\begin_layout LyX-Code + C 0 10 80 0 100 60 0 50 +\end_layout + +\begin_layout LyX-Code + G 84 29 0 0 0 20 100 50 +\end_layout + +\begin_layout LyX-Code + T 0 4 20 5 0 2 0 0 +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + One could use the following pattern unit to search for inexact +\end_layout + +\begin_layout LyX-Code + matches related to such a "weight matrix": +\end_layout + +\begin_layout LyX-Code + {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), +\end_layout + +\begin_layout LyX-Code + (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450 +\end_layout + +\begin_layout LyX-Code + This pattern unit will attempt to match exactly eight characters. +\end_layout + +\begin_layout LyX-Code + For each character in the sequence, the entry in the corresponding +\end_layout + +\begin_layout LyX-Code + tuple is added to an accumulated sum. + If the sum is greater than +\end_layout + +\begin_layout LyX-Code + 450, the match succeeds; else it fails. +\end_layout + +\begin_layout LyX-Code + Recently, this feature was upgraded to allow ranges. + Thus, +\end_layout + +\begin_layout LyX-Code + 600 > {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), +\end_layout + +\begin_layout LyX-Code + (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450 +\end_layout + +\begin_layout LyX-Code + will work, as well. +\end_layout + +\begin_layout LyX-Code +Allowing Alternatives: +\end_layout + +\begin_layout LyX-Code + Very occasionally, you may wish to allow alternative pattern units +\end_layout + +\begin_layout LyX-Code + (i.e., "match either A or B"). + You can do this using something +\end_layout + +\begin_layout LyX-Code + like +\end_layout + +\begin_layout LyX-Code + ( GAGA | GCGCA) +\end_layout + +\begin_layout LyX-Code + which says "match either GAGA or GCGCA". + You may take +\end_layout + +\begin_layout LyX-Code + alternatives of a list of pattern units, for example +\end_layout + +\begin_layout LyX-Code + (p1=3...3 3...8 ~p1 | p1=5...5 4...4 ~p1 GGG) +\end_layout + +\begin_layout LyX-Code + would match one of two sequences of pattern units. + There is one +\end_layout + +\begin_layout LyX-Code + clumsy aspect of the syntax: to match a list of alternatives, you +\end_layout + +\begin_layout LyX-Code + need to fully the request. + Thus, +\end_layout + +\begin_layout LyX-Code + (GAGA | (GCGCA | TTCGA)) +\end_layout + +\begin_layout LyX-Code + would be needed to try the three alternatives. +\end_layout + +\begin_layout LyX-Code +One Minor Extension +\end_layout + +\begin_layout LyX-Code + Sometimes a pattern will contain a sequence of distinct ranges, +\end_layout + +\begin_layout LyX-Code + and you might wish to limit the sum of the lengths of the matched +\end_layout + +\begin_layout LyX-Code + subsequences. + For example, suppose that you basically wanted to +\end_layout + +\begin_layout LyX-Code + match something like +\end_layout + +\begin_layout LyX-Code + ARRYYTT p1=0...5 GCA[1,0,0] p2=1...6 ~p1 4...8 ~p2 p3=4...10 CCT +\end_layout + +\begin_layout LyX-Code + but that the sum of the lengths of p1, p2, and p3 must not exceed +\end_layout + +\begin_layout LyX-Code + eight characters. + To do this, you could add +\end_layout + +\begin_layout LyX-Code + length(p1+p2+p3) < 9 +\end_layout + +\begin_layout LyX-Code + as the last pattern unit. + It will just succeed or fail (but does +\end_layout + +\begin_layout LyX-Code + not actually match any characters in the sequence). +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +Matching Protein Sequences +\end_layout + +\begin_layout LyX-Code + Suppose that the input file contains protein sequences. + In this +\end_layout + +\begin_layout LyX-Code + case, you must invoke scan_for_matches with the "-p" option. + You +\end_layout + +\begin_layout LyX-Code + cannot use aspects of the language that relate directly to +\end_layout + +\begin_layout LyX-Code + nucleotide sequences (e.g., the -c command line option or pattern +\end_layout + +\begin_layout LyX-Code + constructs referring to the reverse complement of a previously +\end_layout + +\begin_layout LyX-Code + matched unit). + +\end_layout + +\begin_layout LyX-Code + You also have two additional constructs that allow you to match +\end_layout + +\begin_layout LyX-Code + either "one of a set of amino acids" or "any amino acid other than +\end_layout + +\begin_layout LyX-Code + those a given set". + For example, +\end_layout + +\begin_layout LyX-Code + p1=0...4 any(HQD) 1...3 notany(HK) p1 +\end_layout + +\begin_layout LyX-Code + would successfully match a string like +\end_layout + +\begin_layout LyX-Code + YWV D AA C YWV +\end_layout + +\begin_layout LyX-Code +Using the show_hits Utility +\end_layout + +\begin_layout LyX-Code + When viewing a large set of complex matches, you might find it +\end_layout + +\begin_layout LyX-Code + convenient to post-process the scan_for_matches output to get a +\end_layout + +\begin_layout LyX-Code + more readable version. + We provide a simple post-processor called +\end_layout + +\begin_layout LyX-Code + "show_hits". + To see its effect, just pipe the output of a +\end_layout + +\begin_layout LyX-Code + scan_for_matches into show_hits: +\end_layout + +\begin_layout LyX-Code + Normal Output: +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp +\end_layout + +\begin_layout LyX-Code + >tst1:[1,28] +\end_layout + +\begin_layout LyX-Code + gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + >tst1:[28,1] +\end_layout + +\begin_layout LyX-Code + gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + >tst2:[2,31] +\end_layout + +\begin_layout LyX-Code + CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + >tst2:[31,2] +\end_layout + +\begin_layout LyX-Code + CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + >tst3:[3,32] +\end_layout + +\begin_layout LyX-Code + gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + >tst3:[32,3] +\end_layout + +\begin_layout LyX-Code + gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + Piped Through show_hits: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp | show_hits +\end_layout + +\begin_layout LyX-Code + tst1:[1,28]: gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[28,1]: gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst2:[2,31]: CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + tst2:[31,2]: CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + tst3:[3,32]: gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst3:[32,3]: gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code + Optionally, you can specify which of the "fields" in the matches +\end_layout + +\begin_layout LyX-Code + you wish to sort on, and show_hits will sort them. + The field +\end_layout + +\begin_layout LyX-Code + numbers start with 0. + So, you might get something like +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp | show_hits 2 1 +\end_layout + +\begin_layout LyX-Code + tst2:[2,31]: CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + tst2:[31,2]: CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + tst3:[32,3]: gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[1,28]: gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[28,1]: gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst3:[3,32]: gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code + In this case, the hits have been sorted on fields 2 and 1 (that is, +\end_layout + +\begin_layout LyX-Code + the third and second matched subfields). +\end_layout + +\begin_layout LyX-Code + show_hits is just one possible little post-processor, and you +\end_layout + +\begin_layout LyX-Code + might well wish to write a customized one for yourself. +\end_layout + +\begin_layout LyX-Code +Reducing the Cost of a Search +\end_layout + +\begin_layout LyX-Code + The scan_for_matches utility uses a fairly simple search, and may +\end_layout + +\begin_layout LyX-Code + consume large amounts of CPU time for complex patterns. + Someday, +\end_layout + +\begin_layout LyX-Code + I may decide to optimize the code. + However, until then, let me +\end_layout + +\begin_layout LyX-Code + mention one useful technique. + +\end_layout + +\begin_layout LyX-Code + When you have a complex pattern that includes a number of varying +\end_layout + +\begin_layout LyX-Code + ranges, imprecise matches, and so forth, it is useful to +\end_layout + +\begin_layout LyX-Code + "pipeline" matches. + That is, form a simpler pattern that can be +\end_layout + +\begin_layout LyX-Code + used to scan through a large database extracting sections that +\end_layout + +\begin_layout LyX-Code + might be matched by the more complex pattern. + Let me illustrate +\end_layout + +\begin_layout LyX-Code + with a short example. + Suppose that you really wished to match the +\end_layout + +\begin_layout LyX-Code + pattern +\end_layout + +\begin_layout LyX-Code + p1=3...5 0...8 ~p1[1,1,0] p2=6...7 3...6 AGC 3...5 RYGC ~p2[1,0,0] +\end_layout + +\begin_layout LyX-Code + In this case, the pattern units AGC 3...5 RYGC can be used to rapidly +\end_layout + +\begin_layout LyX-Code + constrain the overall search. + You can preprocess the overall +\end_layout + +\begin_layout LyX-Code + database using the pattern: +\end_layout + +\begin_layout LyX-Code + 31...31 AGC 3...5 RYGC 7...7 +\end_layout + +\begin_layout LyX-Code + Put the complex pattern in pat_file1 and the simpler pattern in +\end_layout + +\begin_layout LyX-Code + pat_file2. + Then use, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c pat_file2 < nucleotide_database | +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file1 +\end_layout + +\begin_layout LyX-Code + The output will show things like +\end_layout + +\begin_layout LyX-Code + >seqid:[232,280][2,47] +\end_layout + +\begin_layout LyX-Code + matches pieces +\end_layout + +\begin_layout LyX-Code + Then, the actual section of the sequence that was matched can be +\end_layout + +\begin_layout LyX-Code + easily computed as [233,278] (remember, the positions start from +\end_layout + +\begin_layout LyX-Code + 1, not 0). +\end_layout + +\begin_layout LyX-Code + Let me finally add, you should do a few short experiments to see +\end_layout + +\begin_layout LyX-Code + whether or not such pipelining actually improves performance -- it +\end_layout + +\begin_layout LyX-Code + is not always obvious where the time is going, and I have +\end_layout + +\begin_layout LyX-Code + sometimes found that the added complexity of pipelining actually +\end_layout + +\begin_layout LyX-Code + slowed things up. + It gets its best improvements when there are +\end_layout + +\begin_layout LyX-Code + exact matches of more than just a few characters that can be +\end_layout + +\begin_layout LyX-Code + rapidly used to eliminate large sections of the database. +\end_layout + +\begin_layout LyX-Code +============= +\end_layout + +\begin_layout LyX-Code +Additions: +\end_layout + +\begin_layout LyX-Code +Feb 9, 1995: the pattern units ^ and $ now work as in normal regular +\end_layout + +\begin_layout LyX-Code + expressions. + That is +\end_layout + +\begin_layout LyX-Code + TTF $ +\end_layout + +\begin_layout LyX-Code + matches only TTF at the end of the string and +\end_layout + +\begin_layout LyX-Code + ^ TTF +\end_layout + +\begin_layout LyX-Code + matches only an initial TTF +\end_layout + +\begin_layout LyX-Code + The pattern unit +\end_layout + +\begin_layout LyX-Code + ' followed by the sequence name on a line of +its own, followed by one or my lines of sequence until the next entry or the end of the file. The resulting biopiece record consists of the following record type: + +{{{ +SEQ_NAME: test +SEQ_LEN: 10 +SEQ: ATCGATCGAC +--- +}}} + +For more about the FASTA format: + +http://en.wikipedia.org/wiki/Fasta_format + +==Usage== + +{{{ +read_fasta [options] -i +}}} + +==Options== + +{{{ +[-i | --data_in=] - Comma separated list of files or glob expression to read. +[-n | --num=] - Limit number of records to read. +[-I | --stream_in=] - Read input stream from file - Default=STDIN +[-O | --stream_out=] - Write output stream to file - Default=STDOUT +}}} + +==Examples= + +To read all FASTA entries from a file: + +{{{ +read_fasta -i test.fna +}}} + +To read in only 10 records from a FASTA file: + +{{{ +read_fasta -n 10 -i test.fna +}}} + +To read all FASTA entries from multiple files: + +{{{ +read_fasta -i test1.fna,test2.fna +}}} + +To read FASTS entries from multiple files using a glob expression: + +{{{ +read_fasta -i '*.fna' +}}} + + +==Author== + +Martin Asser Hansen - Copyright (C) - All rights reserved. +mail@maasha.dk +August 2007 + +==License== + +GNU General Public License version 2 + +http://www.gnu.org/copyleft/gpl.html + +==Help== + +*read_fasta* is part of the Biopieces framework. + +http://code.google.com/p/biopieces/ + diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 3841595..e3b765c 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -23,7 +23,7 @@ package Maasha::Biopieces; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Routines for manipulation, parsing and emitting of human/machine readable biotool records. +# Routines for manipulation, parsing and emitting of human/machine readable biopieces records. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -5656,12 +5656,12 @@ sub read_stream if ( not -t STDIN ) { $fh = &Maasha::Common::read_stdin(); } elsif ( not $path ) { -# &Maasha::Common::error( qq(no data stream) ); + &Maasha::Common::error( qq(no data stream) ); } else { $fh = &Maasha::Common::read_open( $path ); } -# $fh->autoflush(1) if $fh; +# $fh->autoflush(1) if $fh; # Disable file buffer for debugging. return $fh; }