From: martinahansen Date: Thu, 26 Jun 2008 01:33:55 +0000 (+0000) Subject: Here we go X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=54de880e571a403a6a67f83f1edbe222482a1de6;p=biopieces.git Here we go git-svn-id: http://biopieces.googlecode.com/svn/trunk@5 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..e69de29 diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..a52a6e5 --- /dev/null +++ b/LICENCE @@ -0,0 +1,17 @@ +# Copyright (C) 2007-2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html diff --git a/bp_conf/README b/bp_conf/README new file mode 100644 index 0000000..cdc56d6 --- /dev/null +++ b/bp_conf/README @@ -0,0 +1,10 @@ +This directory contains configuration files for Biopieces. + + bashrc - contains environment settings for bash shell environment + + +Equivalent configuration files may be written for other shell types, tcsh, csh, etc. + + + +Martin A. Hansen, July 2008 diff --git a/bp_conf/bashrc b/bp_conf/bashrc new file mode 100644 index 0000000..60b0b78 --- /dev/null +++ b/bp_conf/bashrc @@ -0,0 +1,21 @@ +# >>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Stuff that enables biotools. 
+ +export TOOLS_DIR="/Users/m.hansen/tools" # Contains binaries for BLAST and Vmatch. +export INST_DIR="/Users/m.hansen/maasha" # Contains scripts and modules. +export DATA_DIR="/Users/m.hansen/DATA" # Contains genomic data etc. +export TMP_DIR="/Users/m.hansen/maasha/tmp" # Required temporary directory. +export LOG_DIR="/Users/m.hansen/maasha/log" # Log directory + +export PATH="$PATH:$TOOLS_DIR/blast-2.2.17/bin:$TOOLS_DIR/vmatch.distribution" +export PATH="$INST_DIR/bin/:$INST_DIR/perl/bin:$INST_DIR/perl_scripts/:$INST_DIR/biotools:$PATH" +export PERL5LIB="$PERL5LIB:$INST_DIR/perl_modules" + +# Alias allowing power scripting with biotools + +alias bioscript="perl -MMaasha::Biotools=read_stream,get_record,put_record -e" + + +# >>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/bp_data/README b/bp_data/README new file mode 100644 index 0000000..43cc76a --- /dev/null +++ b/bp_data/README @@ -0,0 +1,7 @@ +This directory contains ad hoc data files needed by Biopieces. + + human_cytobands.txt - Cytoband information for plot_karyogram + mouse_cytobands.txt - Cytoband information for plot_karyogram + + +Martin A. 
Hansen, July 2008 diff --git a/bp_data/human_cytobands.txt b/bp_data/human_cytobands.txt new file mode 100644 index 0000000..ebd9f1c --- /dev/null +++ b/bp_data/human_cytobands.txt @@ -0,0 +1,863 @@ +#chrom chromStart chromEnd name gieStain +chr1 0 2300000 p36.33 gneg +chr1 2300000 5300000 p36.32 gpos25 +chr1 5300000 7100000 p36.31 gneg +chr1 7100000 9200000 p36.23 gpos25 +chr1 9200000 12600000 p36.22 gneg +chr1 12600000 16100000 p36.21 gpos50 +chr1 16100000 20300000 p36.13 gneg +chr1 20300000 23800000 p36.12 gpos25 +chr1 23800000 27800000 p36.11 gneg +chr1 27800000 30000000 p35.3 gpos25 +chr1 30000000 32200000 p35.2 gneg +chr1 32200000 34400000 p35.1 gpos25 +chr1 34400000 39600000 p34.3 gneg +chr1 39600000 43900000 p34.2 gpos25 +chr1 43900000 46500000 p34.1 gneg +chr1 46500000 51300000 p33 gpos75 +chr1 51300000 56200000 p32.3 gneg +chr1 56200000 58700000 p32.2 gpos50 +chr1 58700000 60900000 p32.1 gneg +chr1 60900000 68700000 p31.3 gpos50 +chr1 68700000 69500000 p31.2 gneg +chr1 69500000 84700000 p31.1 gpos100 +chr1 84700000 88100000 p22.3 gneg +chr1 88100000 92000000 p22.2 gpos75 +chr1 92000000 94500000 p22.1 gneg +chr1 94500000 99400000 p21.3 gpos75 +chr1 99400000 102000000 p21.2 gneg +chr1 102000000 107000000 p21.1 gpos100 +chr1 107000000 111600000 p13.3 gneg +chr1 111600000 115900000 p13.2 gpos50 +chr1 115900000 117600000 p13.1 gneg +chr1 117600000 120700000 p12 gpos50 +chr1 120700000 121100000 p11.2 gneg +chr1 121100000 124300000 p11.1 acen +chr1 124300000 128000000 q11 acen +chr1 128000000 142400000 q12 gvar +chr1 142400000 148000000 q21.1 gneg +chr1 148000000 149600000 q21.2 gpos50 +chr1 149600000 153300000 q21.3 gneg +chr1 153300000 154800000 q22 gpos50 +chr1 154800000 157300000 q23.1 gneg +chr1 157300000 158800000 q23.2 gpos50 +chr1 158800000 163800000 q23.3 gneg +chr1 163800000 165500000 q24.1 gpos50 +chr1 165500000 169100000 q24.2 gneg +chr1 169100000 171200000 q24.3 gpos75 +chr1 171200000 174300000 q25.1 gneg +chr1 174300000 178600000 q25.2 gpos50 +chr1 
178600000 184000000 q25.3 gneg +chr1 184000000 189000000 q31.1 gpos100 +chr1 189000000 192100000 q31.2 gneg +chr1 192100000 197500000 q31.3 gpos100 +chr1 197500000 205300000 q32.1 gneg +chr1 205300000 209500000 q32.2 gpos25 +chr1 209500000 212100000 q32.3 gneg +chr1 212100000 222100000 q41 gpos100 +chr1 222100000 222700000 q42.11 gneg +chr1 222700000 225100000 q42.12 gpos25 +chr1 225100000 228800000 q42.13 gneg +chr1 228800000 232700000 q42.2 gpos50 +chr1 232700000 234600000 q42.3 gneg +chr1 234600000 241700000 q43 gpos75 +chr1 241700000 247249719 q44 gneg +chr2 0 4300000 p25.3 gneg +chr2 4300000 7000000 p25.2 gpos50 +chr2 7000000 12800000 p25.1 gneg +chr2 12800000 17000000 p24.3 gpos75 +chr2 17000000 19100000 p24.2 gneg +chr2 19100000 23900000 p24.1 gpos75 +chr2 23900000 27700000 p23.3 gneg +chr2 27700000 29800000 p23.2 gpos25 +chr2 29800000 31900000 p23.1 gneg +chr2 31900000 36400000 p22.3 gpos75 +chr2 36400000 38400000 p22.2 gneg +chr2 38400000 41600000 p22.1 gpos50 +chr2 41600000 47600000 p21 gneg +chr2 47600000 52700000 p16.3 gpos100 +chr2 52700000 54800000 p16.2 gneg +chr2 54800000 61100000 p16.1 gpos100 +chr2 61100000 64000000 p15 gneg +chr2 64000000 70500000 p14 gpos50 +chr2 70500000 72600000 p13.3 gneg +chr2 72600000 73900000 p13.2 gpos50 +chr2 73900000 75400000 p13.1 gneg +chr2 75400000 83700000 p12 gpos100 +chr2 83700000 91000000 p11.2 gneg +chr2 91000000 93300000 p11.1 acen +chr2 93300000 95700000 q11.1 acen +chr2 95700000 102100000 q11.2 gneg +chr2 102100000 105300000 q12.1 gpos50 +chr2 105300000 106700000 q12.2 gneg +chr2 106700000 108600000 q12.3 gpos25 +chr2 108600000 113800000 q13 gneg +chr2 113800000 118600000 q14.1 gpos50 +chr2 118600000 122100000 q14.2 gneg +chr2 122100000 129600000 q14.3 gpos50 +chr2 129600000 132200000 q21.1 gneg +chr2 132200000 134800000 q21.2 gpos25 +chr2 134800000 136600000 q21.3 gneg +chr2 136600000 142400000 q22.1 gpos100 +chr2 142400000 144700000 q22.2 gneg +chr2 144700000 148400000 q22.3 gpos100 +chr2 148400000 
149600000 q23.1 gneg +chr2 149600000 150300000 q23.2 gpos25 +chr2 150300000 154600000 q23.3 gneg +chr2 154600000 159600000 q24.1 gpos75 +chr2 159600000 163500000 q24.2 gneg +chr2 163500000 169500000 q24.3 gpos75 +chr2 169500000 177700000 q31.1 gneg +chr2 177700000 180400000 q31.2 gpos50 +chr2 180400000 182700000 q31.3 gneg +chr2 182700000 189100000 q32.1 gpos75 +chr2 189100000 191600000 q32.2 gneg +chr2 191600000 197100000 q32.3 gpos75 +chr2 197100000 203500000 q33.1 gneg +chr2 203500000 205600000 q33.2 gpos50 +chr2 205600000 209100000 q33.3 gneg +chr2 209100000 215100000 q34 gpos100 +chr2 215100000 221300000 q35 gneg +chr2 221300000 224900000 q36.1 gpos75 +chr2 224900000 225800000 q36.2 gneg +chr2 225800000 230700000 q36.3 gpos100 +chr2 230700000 235300000 q37.1 gneg +chr2 235300000 237000000 q37.2 gpos50 +chr2 237000000 242951149 q37.3 gneg +chr3 0 3500000 p26.3 gpos50 +chr3 3500000 5500000 p26.2 gneg +chr3 5500000 8700000 p26.1 gpos50 +chr3 8700000 11500000 p25.3 gneg +chr3 11500000 12400000 p25.2 gpos25 +chr3 12400000 14700000 p25.1 gneg +chr3 14700000 23800000 p24.3 gpos100 +chr3 23800000 26400000 p24.2 gneg +chr3 26400000 30800000 p24.1 gpos75 +chr3 30800000 32100000 p23 gneg +chr3 32100000 36500000 p22.3 gpos50 +chr3 36500000 39300000 p22.2 gneg +chr3 39300000 43600000 p22.1 gpos75 +chr3 43600000 44400000 p21.33 gneg +chr3 44400000 44700000 p21.32 gpos50 +chr3 44700000 51400000 p21.31 gneg +chr3 51400000 51700000 p21.2 gpos25 +chr3 51700000 54400000 p21.1 gneg +chr3 54400000 58500000 p14.3 gpos50 +chr3 58500000 63700000 p14.2 gneg +chr3 63700000 71800000 p14.1 gpos50 +chr3 71800000 74200000 p13 gneg +chr3 74200000 81800000 p12.3 gpos75 +chr3 81800000 83700000 p12.2 gneg +chr3 83700000 87200000 p12.1 gpos75 +chr3 87200000 89400000 p11.2 gneg +chr3 89400000 91700000 p11.1 acen +chr3 91700000 93200000 q11.1 acen +chr3 93200000 99800000 q11.2 gvar +chr3 99800000 101500000 q12.1 gneg +chr3 101500000 102500000 q12.2 gpos25 +chr3 102500000 104400000 q12.3 gneg 
+chr3 104400000 107800000 q13.11 gpos75 +chr3 107800000 109500000 q13.12 gneg +chr3 109500000 112800000 q13.13 gpos50 +chr3 112800000 115000000 q13.2 gneg +chr3 115000000 118800000 q13.31 gpos75 +chr3 118800000 120500000 q13.32 gneg +chr3 120500000 123400000 q13.33 gpos75 +chr3 123400000 125400000 q21.1 gneg +chr3 125400000 127700000 q21.2 gpos25 +chr3 127700000 131500000 q21.3 gneg +chr3 131500000 135700000 q22.1 gpos25 +chr3 135700000 137400000 q22.2 gneg +chr3 137400000 140400000 q22.3 gpos25 +chr3 140400000 144400000 q23 gneg +chr3 144400000 150400000 q24 gpos100 +chr3 150400000 153500000 q25.1 gneg +chr3 153500000 156300000 q25.2 gpos50 +chr3 156300000 158100000 q25.31 gneg +chr3 158100000 159900000 q25.32 gpos50 +chr3 159900000 161200000 q25.33 gneg +chr3 161200000 169200000 q26.1 gpos100 +chr3 169200000 172500000 q26.2 gneg +chr3 172500000 177300000 q26.31 gpos75 +chr3 177300000 180600000 q26.32 gneg +chr3 180600000 184200000 q26.33 gpos75 +chr3 184200000 186000000 q27.1 gneg +chr3 186000000 187500000 q27.2 gpos25 +chr3 187500000 189400000 q27.3 gneg +chr3 189400000 193800000 q28 gpos75 +chr3 193800000 199501827 q29 gneg +chr4 0 3100000 p16.3 gneg +chr4 3100000 5200000 p16.2 gpos25 +chr4 5200000 10900000 p16.1 gneg +chr4 10900000 15300000 p15.33 gpos50 +chr4 15300000 18500000 p15.32 gneg +chr4 18500000 23100000 p15.31 gpos75 +chr4 23100000 27900000 p15.2 gneg +chr4 27900000 35500000 p15.1 gpos100 +chr4 35500000 40900000 p14 gneg +chr4 40900000 45600000 p13 gpos50 +chr4 45600000 48700000 p12 gneg +chr4 48700000 50700000 p11 acen +chr4 50700000 52400000 q11 acen +chr4 52400000 59200000 q12 gneg +chr4 59200000 66300000 q13.1 gpos100 +chr4 66300000 70400000 q13.2 gneg +chr4 70400000 76500000 q13.3 gpos75 +chr4 76500000 79200000 q21.1 gneg +chr4 79200000 82600000 q21.21 gpos50 +chr4 82600000 84300000 q21.22 gneg +chr4 84300000 87100000 q21.23 gpos25 +chr4 87100000 88200000 q21.3 gneg +chr4 88200000 94000000 q22.1 gpos75 +chr4 94000000 95400000 q22.2 gneg +chr4 
95400000 99100000 q22.3 gpos75 +chr4 99100000 102500000 q23 gneg +chr4 102500000 107900000 q24 gpos50 +chr4 107900000 114100000 q25 gneg +chr4 114100000 120600000 q26 gpos75 +chr4 120600000 124000000 q27 gneg +chr4 124000000 129100000 q28.1 gpos50 +chr4 129100000 131300000 q28.2 gneg +chr4 131300000 139500000 q28.3 gpos100 +chr4 139500000 141700000 q31.1 gneg +chr4 141700000 145000000 q31.21 gpos25 +chr4 145000000 147700000 q31.22 gneg +chr4 147700000 151000000 q31.23 gpos25 +chr4 151000000 155100000 q31.3 gneg +chr4 155100000 161500000 q32.1 gpos100 +chr4 161500000 164500000 q32.2 gneg +chr4 164500000 170400000 q32.3 gpos100 +chr4 170400000 172200000 q33 gneg +chr4 172200000 176600000 q34.1 gpos75 +chr4 176600000 177800000 q34.2 gneg +chr4 177800000 182600000 q34.3 gpos100 +chr4 182600000 187300000 q35.1 gneg +chr4 187300000 191273063 q35.2 gpos25 +chr5 0 4400000 p15.33 gneg +chr5 4400000 6000000 p15.32 gpos25 +chr5 6000000 8200000 p15.31 gneg +chr5 8200000 15100000 p15.2 gpos50 +chr5 15100000 18500000 p15.1 gneg +chr5 18500000 23300000 p14.3 gpos100 +chr5 23300000 24700000 p14.2 gneg +chr5 24700000 29300000 p14.1 gpos100 +chr5 29300000 34400000 p13.3 gneg +chr5 34400000 38500000 p13.2 gpos25 +chr5 38500000 42400000 p13.1 gneg +chr5 42400000 45800000 p12 gpos50 +chr5 45800000 47700000 p11 acen +chr5 47700000 50500000 q11.1 acen +chr5 50500000 58900000 q11.2 gneg +chr5 58900000 63000000 q12.1 gpos75 +chr5 63000000 63700000 q12.2 gneg +chr5 63700000 66500000 q12.3 gpos75 +chr5 66500000 68400000 q13.1 gneg +chr5 68400000 73300000 q13.2 gpos50 +chr5 73300000 76400000 q13.3 gneg +chr5 76400000 81300000 q14.1 gpos50 +chr5 81300000 82800000 q14.2 gneg +chr5 82800000 91900000 q14.3 gpos100 +chr5 91900000 97300000 q15 gneg +chr5 97300000 102800000 q21.1 gpos100 +chr5 102800000 104500000 q21.2 gneg +chr5 104500000 109600000 q21.3 gpos100 +chr5 109600000 111500000 q22.1 gneg +chr5 111500000 113100000 q22.2 gpos50 +chr5 113100000 115200000 q22.3 gneg +chr5 115200000 121500000 
q23.1 gpos100 +chr5 121500000 127300000 q23.2 gneg +chr5 127300000 130400000 q23.3 gpos100 +chr5 130400000 135400000 q31.1 gneg +chr5 135400000 139000000 q31.2 gpos25 +chr5 139000000 143100000 q31.3 gneg +chr5 143100000 147200000 q32 gpos75 +chr5 147200000 152100000 q33.1 gneg +chr5 152100000 155600000 q33.2 gpos50 +chr5 155600000 159900000 q33.3 gneg +chr5 159900000 167400000 q34 gpos100 +chr5 167400000 172200000 q35.1 gneg +chr5 172200000 176500000 q35.2 gpos25 +chr5 176500000 180857866 q35.3 gneg +chr6 0 2300000 p25.3 gneg +chr6 2300000 4100000 p25.2 gpos25 +chr6 4100000 7000000 p25.1 gneg +chr6 7000000 10600000 p24.3 gpos50 +chr6 10600000 11200000 p24.2 gneg +chr6 11200000 13500000 p24.1 gpos25 +chr6 13500000 15500000 p23 gneg +chr6 15500000 23500000 p22.3 gpos75 +chr6 23500000 26100000 p22.2 gneg +chr6 26100000 29900000 p22.1 gpos50 +chr6 29900000 31900000 p21.33 gneg +chr6 31900000 33600000 p21.32 gpos25 +chr6 33600000 36800000 p21.31 gneg +chr6 36800000 40600000 p21.2 gpos25 +chr6 40600000 45200000 p21.1 gneg +chr6 45200000 51100000 p12.3 gpos100 +chr6 51100000 52600000 p12.2 gneg +chr6 52600000 57200000 p12.1 gpos100 +chr6 57200000 58400000 p11.2 gneg +chr6 58400000 60500000 p11.1 acen +chr6 60500000 63400000 q11.1 acen +chr6 63400000 63500000 q11.2 gneg +chr6 63500000 70000000 q12 gpos100 +chr6 70000000 75900000 q13 gneg +chr6 75900000 83900000 q14.1 gpos50 +chr6 83900000 84700000 q14.2 gneg +chr6 84700000 87500000 q14.3 gpos50 +chr6 87500000 92100000 q15 gneg +chr6 92100000 98700000 q16.1 gpos100 +chr6 98700000 99900000 q16.2 gneg +chr6 99900000 104800000 q16.3 gpos100 +chr6 104800000 113900000 q21 gneg +chr6 113900000 117100000 q22.1 gpos75 +chr6 117100000 118600000 q22.2 gneg +chr6 118600000 126200000 q22.31 gpos100 +chr6 126200000 127300000 q22.32 gneg +chr6 127300000 130400000 q22.33 gpos75 +chr6 130400000 131300000 q23.1 gneg +chr6 131300000 135200000 q23.2 gpos50 +chr6 135200000 139100000 q23.3 gneg +chr6 139100000 142900000 q24.1 gpos75 +chr6 
142900000 145700000 q24.2 gneg +chr6 145700000 149100000 q24.3 gpos75 +chr6 149100000 152600000 q25.1 gneg +chr6 152600000 155600000 q25.2 gpos50 +chr6 155600000 160900000 q25.3 gneg +chr6 160900000 164400000 q26 gpos50 +chr6 164400000 170899992 q27 gneg +chr7 0 2100000 p22.3 gneg +chr7 2100000 4500000 p22.2 gpos25 +chr7 4500000 7200000 p22.1 gneg +chr7 7200000 13300000 p21.3 gpos100 +chr7 13300000 15200000 p21.2 gneg +chr7 15200000 19500000 p21.1 gpos100 +chr7 19500000 24900000 p15.3 gneg +chr7 24900000 28000000 p15.2 gpos50 +chr7 28000000 31800000 p15.1 gneg +chr7 31800000 35600000 p14.3 gpos75 +chr7 35600000 37500000 p14.2 gneg +chr7 37500000 43300000 p14.1 gpos75 +chr7 43300000 46600000 p13 gneg +chr7 46600000 49800000 p12.3 gpos75 +chr7 49800000 50900000 p12.2 gneg +chr7 50900000 53900000 p12.1 gpos75 +chr7 53900000 57400000 p11.2 gneg +chr7 57400000 59100000 p11.1 acen +chr7 59100000 61100000 q11.1 acen +chr7 61100000 66100000 q11.21 gneg +chr7 66100000 71800000 q11.22 gpos50 +chr7 71800000 77400000 q11.23 gneg +chr7 77400000 86200000 q21.11 gpos100 +chr7 86200000 88000000 q21.12 gneg +chr7 88000000 90900000 q21.13 gpos75 +chr7 90900000 92600000 q21.2 gneg +chr7 92600000 97900000 q21.3 gpos75 +chr7 97900000 104400000 q22.1 gneg +chr7 104400000 105900000 q22.2 gpos50 +chr7 105900000 107200000 q22.3 gneg +chr7 107200000 114400000 q31.1 gpos75 +chr7 114400000 117200000 q31.2 gneg +chr7 117200000 120900000 q31.31 gpos75 +chr7 120900000 123600000 q31.32 gneg +chr7 123600000 126900000 q31.33 gpos75 +chr7 126900000 129000000 q32.1 gneg +chr7 129000000 130100000 q32.2 gpos25 +chr7 130100000 132400000 q32.3 gneg +chr7 132400000 137300000 q33 gpos50 +chr7 137300000 142800000 q34 gneg +chr7 142800000 147500000 q35 gpos75 +chr7 147500000 152200000 q36.1 gneg +chr7 152200000 154700000 q36.2 gpos25 +chr7 154700000 158821424 q36.3 gneg +chr8 0 2200000 p23.3 gneg +chr8 2200000 6200000 p23.2 gpos75 +chr8 6200000 12700000 p23.1 gneg +chr8 12700000 19100000 p22 gpos100 +chr8 
19100000 23400000 p21.3 gneg +chr8 23400000 27400000 p21.2 gpos50 +chr8 27400000 29700000 p21.1 gneg +chr8 29700000 38500000 p12 gpos75 +chr8 38500000 39500000 p11.23 gneg +chr8 39500000 39900000 p11.22 gpos25 +chr8 39900000 43200000 p11.21 gneg +chr8 43200000 45200000 p11.1 acen +chr8 45200000 48100000 q11.1 acen +chr8 48100000 50400000 q11.21 gneg +chr8 50400000 52800000 q11.22 gpos75 +chr8 52800000 55600000 q11.23 gneg +chr8 55600000 61700000 q12.1 gpos50 +chr8 61700000 62400000 q12.2 gneg +chr8 62400000 66100000 q12.3 gpos50 +chr8 66100000 68100000 q13.1 gneg +chr8 68100000 70600000 q13.2 gpos50 +chr8 70600000 74000000 q13.3 gneg +chr8 74000000 78500000 q21.11 gpos100 +chr8 78500000 80300000 q21.12 gneg +chr8 80300000 84900000 q21.13 gpos75 +chr8 84900000 87200000 q21.2 gneg +chr8 87200000 93500000 q21.3 gpos100 +chr8 93500000 99100000 q22.1 gneg +chr8 99100000 101600000 q22.2 gpos25 +chr8 101600000 106100000 q22.3 gneg +chr8 106100000 110600000 q23.1 gpos75 +chr8 110600000 112200000 q23.2 gneg +chr8 112200000 117700000 q23.3 gpos100 +chr8 117700000 119200000 q24.11 gneg +chr8 119200000 122500000 q24.12 gpos50 +chr8 122500000 127300000 q24.13 gneg +chr8 127300000 131500000 q24.21 gpos50 +chr8 131500000 136500000 q24.22 gneg +chr8 136500000 140000000 q24.23 gpos75 +chr8 140000000 146274826 q24.3 gneg +chr9 0 2200000 p24.3 gneg +chr9 2200000 4600000 p24.2 gpos25 +chr9 4600000 9000000 p24.1 gneg +chr9 9000000 14100000 p23 gpos75 +chr9 14100000 16600000 p22.3 gneg +chr9 16600000 18500000 p22.2 gpos25 +chr9 18500000 19900000 p22.1 gneg +chr9 19900000 25500000 p21.3 gpos100 +chr9 25500000 28100000 p21.2 gneg +chr9 28100000 32800000 p21.1 gpos100 +chr9 32800000 36300000 p13.3 gneg +chr9 36300000 38000000 p13.2 gpos25 +chr9 38000000 40200000 p13.1 gneg +chr9 40200000 42400000 p12 gpos50 +chr9 42400000 46700000 p11.2 gneg +chr9 46700000 51800000 p11.1 acen +chr9 51800000 60300000 q11 acen +chr9 60300000 70000000 q12 gvar +chr9 70000000 70500000 q13 gneg +chr9 70500000 
72700000 q21.11 gpos25 +chr9 72700000 73100000 q21.12 gneg +chr9 73100000 79300000 q21.13 gpos50 +chr9 79300000 80300000 q21.2 gneg +chr9 80300000 83400000 q21.31 gpos50 +chr9 83400000 86100000 q21.32 gneg +chr9 86100000 89600000 q21.33 gpos50 +chr9 89600000 91000000 q22.1 gneg +chr9 91000000 93000000 q22.2 gpos25 +chr9 93000000 95600000 q22.31 gneg +chr9 95600000 98200000 q22.32 gpos25 +chr9 98200000 101600000 q22.33 gneg +chr9 101600000 107200000 q31.1 gpos100 +chr9 107200000 110300000 q31.2 gneg +chr9 110300000 113900000 q31.3 gpos25 +chr9 113900000 116700000 q32 gneg +chr9 116700000 122000000 q33.1 gpos75 +chr9 122000000 125800000 q33.2 gneg +chr9 125800000 129300000 q33.3 gpos25 +chr9 129300000 132500000 q34.11 gneg +chr9 132500000 132800000 q34.12 gpos25 +chr9 132800000 134900000 q34.13 gneg +chr9 134900000 136600000 q34.2 gpos25 +chr9 136600000 140273252 q34.3 gneg +chrX 0 4300000 p22.33 gneg +chrX 4300000 6000000 p22.32 gpos50 +chrX 6000000 9500000 p22.31 gneg +chrX 9500000 17100000 p22.2 gpos50 +chrX 17100000 19200000 p22.13 gneg +chrX 19200000 21800000 p22.12 gpos50 +chrX 21800000 24900000 p22.11 gneg +chrX 24900000 29400000 p21.3 gpos100 +chrX 29400000 31500000 p21.2 gneg +chrX 31500000 37500000 p21.1 gpos100 +chrX 37500000 42300000 p11.4 gneg +chrX 42300000 47300000 p11.3 gpos75 +chrX 47300000 49700000 p11.23 gneg +chrX 49700000 54700000 p11.22 gpos25 +chrX 54700000 56600000 p11.21 gneg +chrX 56600000 59500000 p11.1 acen +chrX 59500000 65000000 q11.1 acen +chrX 65000000 65100000 q11.2 gneg +chrX 65100000 67700000 q12 gpos50 +chrX 67700000 72200000 q13.1 gneg +chrX 72200000 73800000 q13.2 gpos50 +chrX 73800000 76000000 q13.3 gneg +chrX 76000000 84500000 q21.1 gpos100 +chrX 84500000 86200000 q21.2 gneg +chrX 86200000 91900000 q21.31 gpos100 +chrX 91900000 93500000 q21.32 gneg +chrX 93500000 98200000 q21.33 gpos75 +chrX 98200000 102500000 q22.1 gneg +chrX 102500000 103600000 q22.2 gpos50 +chrX 103600000 110500000 q22.3 gneg +chrX 110500000 116800000 q23 
gpos75 +chrX 116800000 120700000 q24 gneg +chrX 120700000 129800000 q25 gpos100 +chrX 129800000 130300000 q26.1 gneg +chrX 130300000 133500000 q26.2 gpos25 +chrX 133500000 137800000 q26.3 gneg +chrX 137800000 140100000 q27.1 gpos75 +chrX 140100000 141900000 q27.2 gneg +chrX 141900000 146900000 q27.3 gpos100 +chrX 146900000 154913754 q28 gneg +chrY 0 1700000 p11.32 gneg +chrY 1700000 3300000 p11.31 gpos50 +chrY 3300000 11200000 p11.2 gneg +chrY 11200000 11300000 p11.1 acen +chrY 11300000 12500000 q11.1 acen +chrY 12500000 14300000 q11.21 gneg +chrY 14300000 19000000 q11.221 gpos50 +chrY 19000000 21300000 q11.222 gneg +chrY 21300000 25400000 q11.223 gpos50 +chrY 25400000 27200000 q11.23 gneg +chrY 27200000 57772954 q12 gvar +chr10 0 3000000 p15.3 gneg +chr10 3000000 3800000 p15.2 gpos25 +chr10 3800000 6700000 p15.1 gneg +chr10 6700000 12300000 p14 gpos75 +chr10 12300000 17300000 p13 gneg +chr10 17300000 19900000 p12.33 gpos75 +chr10 19900000 20500000 p12.32 gneg +chr10 20500000 22800000 p12.31 gpos75 +chr10 22800000 24100000 p12.2 gneg +chr10 24100000 28300000 p12.1 gpos50 +chr10 28300000 31400000 p11.23 gneg +chr10 31400000 34500000 p11.22 gpos25 +chr10 34500000 38800000 p11.21 gneg +chr10 38800000 40300000 p11.1 acen +chr10 40300000 42100000 q11.1 acen +chr10 42100000 46100000 q11.21 gneg +chr10 46100000 50100000 q11.22 gpos25 +chr10 50100000 53300000 q11.23 gneg +chr10 53300000 61200000 q21.1 gpos100 +chr10 61200000 64800000 q21.2 gneg +chr10 64800000 71300000 q21.3 gpos100 +chr10 71300000 74600000 q22.1 gneg +chr10 74600000 77400000 q22.2 gpos50 +chr10 77400000 82000000 q22.3 gneg +chr10 82000000 87900000 q23.1 gpos100 +chr10 87900000 89600000 q23.2 gneg +chr10 89600000 92900000 q23.31 gpos75 +chr10 92900000 94200000 q23.32 gneg +chr10 94200000 98000000 q23.33 gpos50 +chr10 98000000 99400000 q24.1 gneg +chr10 99400000 102000000 q24.2 gpos50 +chr10 102000000 103000000 q24.31 gneg +chr10 103000000 104900000 q24.32 gpos25 +chr10 104900000 105700000 q24.33 gneg 
+chr10 105700000 111800000 q25.1 gpos100 +chr10 111800000 114900000 q25.2 gneg +chr10 114900000 119100000 q25.3 gpos75 +chr10 119100000 121700000 q26.11 gneg +chr10 121700000 123100000 q26.12 gpos50 +chr10 123100000 127400000 q26.13 gneg +chr10 127400000 130500000 q26.2 gpos50 +chr10 130500000 135374737 q26.3 gneg +chr11 0 2800000 p15.5 gneg +chr11 2800000 10700000 p15.4 gpos50 +chr11 10700000 12600000 p15.3 gneg +chr11 12600000 16100000 p15.2 gpos50 +chr11 16100000 21600000 p15.1 gneg +chr11 21600000 26000000 p14.3 gpos100 +chr11 26000000 27200000 p14.2 gneg +chr11 27200000 31000000 p14.1 gpos75 +chr11 31000000 36400000 p13 gneg +chr11 36400000 43400000 p12 gpos100 +chr11 43400000 48800000 p11.2 gneg +chr11 48800000 51400000 p11.12 gpos75 +chr11 51400000 52900000 p11.11 acen +chr11 52900000 56400000 q11 acen +chr11 56400000 59700000 q12.1 gpos75 +chr11 59700000 61400000 q12.2 gneg +chr11 61400000 63100000 q12.3 gpos25 +chr11 63100000 67100000 q13.1 gneg +chr11 67100000 69200000 q13.2 gpos25 +chr11 69200000 70700000 q13.3 gneg +chr11 70700000 74900000 q13.4 gpos50 +chr11 74900000 76700000 q13.5 gneg +chr11 76700000 85300000 q14.1 gpos100 +chr11 85300000 87900000 q14.2 gneg +chr11 87900000 92300000 q14.3 gpos100 +chr11 92300000 96700000 q21 gneg +chr11 96700000 101600000 q22.1 gpos100 +chr11 101600000 102400000 q22.2 gneg +chr11 102400000 110000000 q22.3 gpos100 +chr11 110000000 112800000 q23.1 gneg +chr11 112800000 115400000 q23.2 gpos50 +chr11 115400000 120700000 q23.3 gneg +chr11 120700000 123500000 q24.1 gpos50 +chr11 123500000 127400000 q24.2 gneg +chr11 127400000 130300000 q24.3 gpos50 +chr11 130300000 134452384 q25 gneg +chr12 0 3100000 p13.33 gneg +chr12 3100000 5300000 p13.32 gpos25 +chr12 5300000 10000000 p13.31 gneg +chr12 10000000 12600000 p13.2 gpos75 +chr12 12600000 14800000 p13.1 gneg +chr12 14800000 19900000 p12.3 gpos100 +chr12 19900000 21200000 p12.2 gneg +chr12 21200000 26300000 p12.1 gpos100 +chr12 26300000 27700000 p11.23 gneg +chr12 27700000 
30600000 p11.22 gpos50 +chr12 30600000 33200000 p11.21 gneg +chr12 33200000 35400000 p11.1 acen +chr12 35400000 36500000 q11 acen +chr12 36500000 44600000 q12 gpos100 +chr12 44600000 47400000 q13.11 gneg +chr12 47400000 48400000 q13.12 gpos25 +chr12 48400000 53100000 q13.13 gneg +chr12 53100000 55200000 q13.2 gpos25 +chr12 55200000 56300000 q13.3 gneg +chr12 56300000 61400000 q14.1 gpos75 +chr12 61400000 63400000 q14.2 gneg +chr12 63400000 66000000 q14.3 gpos50 +chr12 66000000 69800000 q15 gneg +chr12 69800000 74100000 q21.1 gpos75 +chr12 74100000 78700000 q21.2 gneg +chr12 78700000 85100000 q21.31 gpos100 +chr12 85100000 87500000 q21.32 gneg +chr12 87500000 91200000 q21.33 gpos100 +chr12 91200000 94800000 q22 gneg +chr12 94800000 100000000 q23.1 gpos75 +chr12 100000000 102400000 q23.2 gneg +chr12 102400000 107500000 q23.3 gpos50 +chr12 107500000 110200000 q24.11 gneg +chr12 110200000 110800000 q24.12 gpos25 +chr12 110800000 112800000 q24.13 gneg +chr12 112800000 115300000 q24.21 gpos50 +chr12 115300000 116700000 q24.22 gneg +chr12 116700000 119100000 q24.23 gpos50 +chr12 119100000 124500000 q24.31 gneg +chr12 124500000 128700000 q24.32 gpos50 +chr12 128700000 132349534 q24.33 gneg +chr13 0 3800000 p13 gvar +chr13 3800000 8300000 p12 stalk +chr13 8300000 13500000 p11.2 gvar +chr13 13500000 16000000 p11.1 acen +chr13 16000000 18400000 q11 acen +chr13 18400000 22200000 q12.11 gneg +chr13 22200000 24400000 q12.12 gpos25 +chr13 24400000 26700000 q12.13 gneg +chr13 26700000 27800000 q12.2 gpos25 +chr13 27800000 31100000 q12.3 gneg +chr13 31100000 32900000 q13.1 gpos50 +chr13 32900000 34700000 q13.2 gneg +chr13 34700000 39500000 q13.3 gpos75 +chr13 39500000 44300000 q14.11 gneg +chr13 44300000 45900000 q14.12 gpos25 +chr13 45900000 46200000 q14.13 gneg +chr13 46200000 48900000 q14.2 gpos50 +chr13 48900000 52200000 q14.3 gneg +chr13 52200000 57600000 q21.1 gpos100 +chr13 57600000 60500000 q21.2 gneg +chr13 60500000 64100000 q21.31 gpos75 +chr13 64100000 67200000 q21.32 
gneg +chr13 67200000 72100000 q21.33 gpos100 +chr13 72100000 74200000 q22.1 gneg +chr13 74200000 76000000 q22.2 gpos50 +chr13 76000000 77800000 q22.3 gneg +chr13 77800000 86500000 q31.1 gpos100 +chr13 86500000 88800000 q31.2 gneg +chr13 88800000 93800000 q31.3 gpos100 +chr13 93800000 97000000 q32.1 gneg +chr13 97000000 98100000 q32.2 gpos25 +chr13 98100000 100500000 q32.3 gneg +chr13 100500000 103700000 q33.1 gpos100 +chr13 103700000 105800000 q33.2 gneg +chr13 105800000 109100000 q33.3 gpos100 +chr13 109100000 114142980 q34 gneg +chr14 0 3100000 p13 gvar +chr14 3100000 6700000 p12 stalk +chr14 6700000 13600000 p11.2 gvar +chr14 13600000 15600000 p11.1 acen +chr14 15600000 19100000 q11.1 acen +chr14 19100000 23600000 q11.2 gneg +chr14 23600000 31800000 q12 gpos100 +chr14 31800000 34100000 q13.1 gneg +chr14 34100000 35600000 q13.2 gpos50 +chr14 35600000 36900000 q13.3 gneg +chr14 36900000 41000000 q21.1 gpos100 +chr14 41000000 43200000 q21.2 gneg +chr14 43200000 48300000 q21.3 gpos100 +chr14 48300000 52300000 q22.1 gneg +chr14 52300000 54400000 q22.2 gpos25 +chr14 54400000 55800000 q22.3 gneg +chr14 55800000 61200000 q23.1 gpos75 +chr14 61200000 64000000 q23.2 gneg +chr14 64000000 67000000 q23.3 gpos50 +chr14 67000000 69300000 q24.1 gneg +chr14 69300000 72900000 q24.2 gpos50 +chr14 72900000 78400000 q24.3 gneg +chr14 78400000 82600000 q31.1 gpos100 +chr14 82600000 84000000 q31.2 gneg +chr14 84000000 88900000 q31.3 gpos100 +chr14 88900000 90500000 q32.11 gneg +chr14 90500000 92800000 q32.12 gpos25 +chr14 92800000 95400000 q32.13 gneg +chr14 95400000 100400000 q32.2 gpos50 +chr14 100400000 102200000 q32.31 gneg +chr14 102200000 103000000 q32.32 gpos50 +chr14 103000000 106368585 q32.33 gneg +chr15 0 3500000 p13 gvar +chr15 3500000 7900000 p12 stalk +chr15 7900000 14100000 p11.2 gvar +chr15 14100000 17000000 p11.1 acen +chr15 17000000 18400000 q11.1 acen +chr15 18400000 23300000 q11.2 gneg +chr15 23300000 25700000 q12 gpos50 +chr15 25700000 28000000 q13.1 gneg +chr15 
28000000 29000000 q13.2 gpos50 +chr15 29000000 31400000 q13.3 gneg +chr15 31400000 37900000 q14 gpos75 +chr15 37900000 40700000 q15.1 gneg +chr15 40700000 41400000 q15.2 gpos25 +chr15 41400000 42700000 q15.3 gneg +chr15 42700000 47600000 q21.1 gpos75 +chr15 47600000 51100000 q21.2 gneg +chr15 51100000 55800000 q21.3 gpos75 +chr15 55800000 57100000 q22.1 gneg +chr15 57100000 61500000 q22.2 gpos25 +chr15 61500000 64900000 q22.31 gneg +chr15 64900000 65000000 q22.32 gpos25 +chr15 65000000 65300000 q22.33 gneg +chr15 65300000 70400000 q23 gpos25 +chr15 70400000 73100000 q24.1 gneg +chr15 73100000 74400000 q24.2 gpos25 +chr15 74400000 76100000 q24.3 gneg +chr15 76100000 79500000 q25.1 gpos50 +chr15 79500000 83000000 q25.2 gneg +chr15 83000000 86900000 q25.3 gpos50 +chr15 86900000 92100000 q26.1 gneg +chr15 92100000 96300000 q26.2 gpos50 +chr15 96300000 100338915 q26.3 gneg +chr16 0 6300000 p13.3 gneg +chr16 6300000 10300000 p13.2 gpos50 +chr16 10300000 12500000 p13.13 gneg +chr16 12500000 14700000 p13.12 gpos50 +chr16 14700000 16700000 p13.11 gneg +chr16 16700000 20500000 p12.3 gpos50 +chr16 20500000 21700000 p12.2 gneg +chr16 21700000 27600000 p12.1 gpos50 +chr16 27600000 34400000 p11.2 gneg +chr16 34400000 38200000 p11.1 acen +chr16 38200000 40700000 q11.1 acen +chr16 40700000 45500000 q11.2 gvar +chr16 45500000 51200000 q12.1 gneg +chr16 51200000 54500000 q12.2 gpos50 +chr16 54500000 56700000 q13 gneg +chr16 56700000 65200000 q21 gpos100 +chr16 65200000 69400000 q22.1 gneg +chr16 69400000 69800000 q22.2 gpos50 +chr16 69800000 73300000 q22.3 gneg +chr16 73300000 78200000 q23.1 gpos75 +chr16 78200000 80500000 q23.2 gneg +chr16 80500000 82700000 q23.3 gpos50 +chr16 82700000 85600000 q24.1 gneg +chr16 85600000 87200000 q24.2 gpos25 +chr16 87200000 88827254 q24.3 gneg +chr17 0 3600000 p13.3 gneg +chr17 3600000 6800000 p13.2 gpos50 +chr17 6800000 11200000 p13.1 gneg +chr17 11200000 15900000 p12 gpos75 +chr17 15900000 22100000 p11.2 gneg +chr17 22100000 22200000 p11.1 acen 
+chr17 22200000 23200000 q11.1 acen +chr17 23200000 28800000 q11.2 gneg +chr17 28800000 35400000 q12 gpos50 +chr17 35400000 35600000 q21.1 gneg +chr17 35600000 37800000 q21.2 gpos25 +chr17 37800000 41900000 q21.31 gneg +chr17 41900000 44800000 q21.32 gpos25 +chr17 44800000 47600000 q21.33 gneg +chr17 47600000 54900000 q22 gpos75 +chr17 54900000 55600000 q23.1 gneg +chr17 55600000 58400000 q23.2 gpos75 +chr17 58400000 59900000 q23.3 gneg +chr17 59900000 61600000 q24.1 gpos50 +chr17 61600000 64600000 q24.2 gneg +chr17 64600000 68400000 q24.3 gpos75 +chr17 68400000 72200000 q25.1 gneg +chr17 72200000 72900000 q25.2 gpos25 +chr17 72900000 78774742 q25.3 gneg +chr18 0 2900000 p11.32 gneg +chr18 2900000 7200000 p11.31 gpos50 +chr18 7200000 8500000 p11.23 gneg +chr18 8500000 10900000 p11.22 gpos25 +chr18 10900000 15400000 p11.21 gneg +chr18 15400000 16100000 p11.1 acen +chr18 16100000 17300000 q11.1 acen +chr18 17300000 23300000 q11.2 gneg +chr18 23300000 31000000 q12.1 gpos100 +chr18 31000000 35500000 q12.2 gneg +chr18 35500000 41800000 q12.3 gpos75 +chr18 41800000 46400000 q21.1 gneg +chr18 46400000 52000000 q21.2 gpos75 +chr18 52000000 54400000 q21.31 gneg +chr18 54400000 57100000 q21.32 gpos50 +chr18 57100000 59800000 q21.33 gneg +chr18 59800000 64900000 q22.1 gpos100 +chr18 64900000 66900000 q22.2 gneg +chr18 66900000 71300000 q22.3 gpos25 +chr18 71300000 76117153 q23 gneg +chr19 0 6900000 p13.3 gneg +chr19 6900000 12600000 p13.2 gpos25 +chr19 12600000 13800000 p13.13 gneg +chr19 13800000 16100000 p13.12 gpos25 +chr19 16100000 19800000 p13.11 gneg +chr19 19800000 26700000 p12 gvar +chr19 26700000 28500000 p11 acen +chr19 28500000 30200000 q11 acen +chr19 30200000 37100000 q12 gvar +chr19 37100000 40300000 q13.11 gneg +chr19 40300000 43000000 q13.12 gpos25 +chr19 43000000 43400000 q13.13 gneg +chr19 43400000 47800000 q13.2 gpos25 +chr19 47800000 50000000 q13.31 gneg +chr19 50000000 53800000 q13.32 gpos25 +chr19 53800000 57600000 q13.33 gneg +chr19 57600000 59100000 
q13.41 gpos25 +chr19 59100000 61400000 q13.42 gneg +chr19 61400000 63811651 q13.43 gpos25 +chr20 0 5000000 p13 gneg +chr20 5000000 9000000 p12.3 gpos75 +chr20 9000000 11900000 p12.2 gneg +chr20 11900000 17800000 p12.1 gpos75 +chr20 17800000 21200000 p11.23 gneg +chr20 21200000 22300000 p11.22 gpos25 +chr20 22300000 25700000 p11.21 gneg +chr20 25700000 27100000 p11.1 acen +chr20 27100000 28400000 q11.1 acen +chr20 28400000 31500000 q11.21 gneg +chr20 31500000 33900000 q11.22 gpos25 +chr20 33900000 37100000 q11.23 gneg +chr20 37100000 41100000 q12 gpos75 +chr20 41100000 41600000 q13.11 gneg +chr20 41600000 45800000 q13.12 gpos25 +chr20 45800000 49200000 q13.13 gneg +chr20 49200000 54400000 q13.2 gpos75 +chr20 54400000 55900000 q13.31 gneg +chr20 55900000 57900000 q13.32 gpos50 +chr20 57900000 62435964 q13.33 gneg +chr21 0 2900000 p13 gvar +chr21 2900000 6300000 p12 stalk +chr21 6300000 10000000 p11.2 gvar +chr21 10000000 12300000 p11.1 acen +chr21 12300000 13200000 q11.1 acen +chr21 13200000 15300000 q11.2 gneg +chr21 15300000 22900000 q21.1 gpos100 +chr21 22900000 25800000 q21.2 gneg +chr21 25800000 30500000 q21.3 gpos75 +chr21 30500000 34700000 q22.11 gneg +chr21 34700000 36700000 q22.12 gpos50 +chr21 36700000 38600000 q22.13 gneg +chr21 38600000 41400000 q22.2 gpos50 +chr21 41400000 46944323 q22.3 gneg +chr22 0 3000000 p13 gvar +chr22 3000000 6600000 p12 stalk +chr22 6600000 9600000 p11.2 gvar +chr22 9600000 11800000 p11.1 acen +chr22 11800000 16300000 q11.1 acen +chr22 16300000 20500000 q11.21 gneg +chr22 20500000 21800000 q11.22 gpos25 +chr22 21800000 24300000 q11.23 gneg +chr22 24300000 27900000 q12.1 gpos50 +chr22 27900000 30500000 q12.2 gneg +chr22 30500000 35900000 q12.3 gpos50 +chr22 35900000 39300000 q13.1 gneg +chr22 39300000 42600000 q13.2 gpos50 +chr22 42600000 47000000 q13.31 gneg +chr22 47000000 48200000 q13.32 gpos50 +chr22 48200000 49691432 q13.33 gneg diff --git a/bp_data/mouse_cytobands.txt b/bp_data/mouse_cytobands.txt new file mode 100644 index 
0000000..b2e9cbb --- /dev/null +++ b/bp_data/mouse_cytobands.txt @@ -0,0 +1,404 @@ +#chrom chromStart chromEnd name gieStain +chr1 0 8918386 qA1 gpos100 +chr1 8918386 12386647 qA2 gneg +chr1 12386647 20314102 qA3 gpos33 +chr1 20314102 22295965 qA4 gneg +chr1 22295965 31214352 qA5 gpos100 +chr1 31214352 43601000 qB gneg +chr1 43601000 54996715 qC1.1 gpos66 +chr1 54996715 56483113 qC1.2 gneg +chr1 56483113 59951375 qC1.3 gpos75 +chr1 59951375 65896965 qC2 gneg +chr1 65896965 75310818 qC3 gpos33 +chr1 75310818 80760943 qC4 gneg +chr1 80760943 88192931 qC5 gpos33 +chr1 88192931 100579579 qD gneg +chr1 100579579 103552375 qE1.1 gpos33 +chr1 103552375 104543306 qE1.2 gneg +chr1 104543306 113461693 qE2.1 gpos100 +chr1 113461693 114948091 qE2.2 gneg +chr1 114948091 126839272 qE2.3 gpos100 +chr1 126839272 129812068 qE3 gneg +chr1 129812068 140712318 qE4 gpos66 +chr1 140712318 148639772 qF gneg +chr1 148639772 153098966 qG1 gpos100 +chr1 153098966 154089897 qG2 gneg +chr1 154089897 158549091 qG3 gpos100 +chr1 158549091 161521886 qH1 gneg +chr1 161521886 165485613 qH2.1 gpos33 +chr1 165485613 166972011 qH2.2 gneg +chr1 166972011 171431204 qH2.3 gpos33 +chr1 171431204 177376795 qH3 gneg +chr1 177376795 183322386 qH4 gpos33 +chr1 183322386 189763443 qH5 gneg +chr1 189763443 197195432 qH6 gpos33 +chr2 0 14052687 qA1 gpos100 +chr2 14052687 16394801 qA2 gneg +chr2 16394801 29042220 qA3 gpos33 +chr2 29042220 48247559 qB gneg +chr2 48247559 60426554 qC1.1 gpos100 +chr2 60426554 60894977 qC1.2 gneg +chr2 60894977 68389744 qC1.3 gpos100 +chr2 68389744 71668704 qC2 gneg +chr2 71668704 81037162 qC3 gpos66 +chr2 81037162 88531928 qD gneg +chr2 88531928 101179347 qE1 gpos100 +chr2 101179347 104926730 qE2 gneg +chr2 104926730 112889920 qE3 gpos33 +chr2 112889920 115700457 qE4 gneg +chr2 115700457 123663646 qE5 gpos66 +chr2 123663646 131626836 qF1 gneg +chr2 131626836 134437373 qF2 gpos33 +chr2 134437373 140995294 qF3 gneg +chr2 140995294 146616369 qG1 gpos100 +chr2 146616369 147553214 qG2 
gneg +chr2 147553214 152237444 qG3 gpos100 +chr2 152237444 158795364 qH1 gneg +chr2 158795364 163011170 qH2 gpos33 +chr2 163011170 173316474 qH3 gneg +chr2 173316474 181748087 qH4 gpos33 +chr3 0 18490218 qA1 gpos100 +chr3 18490218 20436557 qA2 gneg +chr3 20436557 35520683 qA3 gpos66 +chr3 35520683 46712131 qB gneg +chr3 46712131 56443825 qC gpos100 +chr3 56443825 60823088 qD gneg +chr3 60823088 69581612 qE1 gpos33 +chr3 69581612 72501120 qE2 gneg +chr3 72501120 83692569 qE3 gpos100 +chr3 83692569 92937678 qF1 gneg +chr3 92937678 97316940 qF2.1 gpos33 +chr3 97316940 106075465 qF2.2 gneg +chr3 106075465 108021804 qF2.3 gpos33 +chr3 108021804 114833990 qF3 gneg +chr3 114833990 126512023 qG1 gpos100 +chr3 126512023 128458361 qG2 gneg +chr3 128458361 138190056 qG3 gpos66 +chr3 138190056 143542487 qH1 gneg +chr3 143542487 147921750 qH2 gpos33 +chr3 147921750 153760766 qH3 gneg +chr3 153760766 159599783 qH4 gpos33 +chr4 0 14799183 qA1 gpos100 +chr4 14799183 17663541 qA2 gneg +chr4 17663541 28166187 qA3 gpos100 +chr4 28166187 30075759 qA4 gneg +chr4 30075759 43442763 qA5 gpos66 +chr4 43442763 51558444 qB1 gneg +chr4 51558444 54900195 qB2 gpos33 +chr4 54900195 63015876 qB3 gneg +chr4 63015876 69221985 qC1 gpos33 +chr4 69221985 71608950 qC2 gneg +chr4 71608950 83543776 qC3 gpos100 +chr4 83543776 89272492 qC4 gneg +chr4 89272492 96910780 qC5 gpos66 +chr4 96910780 105026461 qC6 gneg +chr4 105026461 110277784 qC7 gpos66 +chr4 110277784 116961286 qD1 gneg +chr4 116961286 119825644 qD2.1 gpos33 +chr4 119825644 130328290 qD2.2 gneg +chr4 130328290 133192648 qD2.3 gpos33 +chr4 133192648 140830936 qD3 gneg +chr4 140830936 147037045 qE1 gpos100 +chr4 147037045 155630120 qE2 gneg +chr5 0 14964098 qA1 gpos100 +chr5 14964098 16412236 qA2 gneg +chr5 16412236 25583780 qA3 gpos66 +chr5 25583780 33789899 qB1 gneg +chr5 33789899 35720750 qB2 gpos33 +chr5 35720750 50684848 qB3 gneg +chr5 50684848 58890967 qC1 gpos33 +chr5 58890967 61304531 qC2 gneg +chr5 61304531 71924213 qC3.1 gpos100 +chr5 
71924213 73855065 qC3.2 gneg +chr5 73855065 78199480 qC3.3 gpos66 +chr5 78199480 82061183 qD gneg +chr5 82061183 91715440 qE1 gpos100 +chr5 91715440 94129004 qE2 gneg +chr5 94129004 99921558 qE3 gpos33 +chr5 99921558 102335123 qE4 gneg +chr5 102335123 108127677 qE5 gpos33 +chr5 108127677 125505339 qF gneg +chr5 125505339 127436191 qG1.1 gpos33 +chr5 127436191 128401616 qG1.2 gneg +chr5 128401616 131297893 qG1.3 gpos33 +chr5 131297893 146744704 qG2 gneg +chr5 146744704 152537259 qG3 gpos33 +chr6 0 16613004 qA1 gpos100 +chr6 16613004 21499181 qA2 gneg +chr6 21499181 27362595 qA3.1 gpos100 +chr6 27362595 28339830 qA3.2 gneg +chr6 28339830 34203243 qA3.3 gpos100 +chr6 34203243 41532510 qB1 gneg +chr6 41532510 44464216 qB2.1 gpos66 +chr6 44464216 45930070 qB2.2 gneg +chr6 45930070 50816247 qB2.3 gpos66 +chr6 50816247 62543074 qB3 gneg +chr6 62543074 74269900 qC1 gpos100 +chr6 74269900 76712989 qC2 gneg +chr6 76712989 85996727 qC3 gpos66 +chr6 85996727 94303229 qD1 gneg +chr6 94303229 95769082 qD2 gpos33 +chr6 95769082 103098349 qD3 gneg +chr6 103098349 108473144 qE1 gpos100 +chr6 108473144 109450380 qE2 gneg +chr6 109450380 116779646 qE3 gpos100 +chr6 116779646 122643059 qF1 gneg +chr6 122643059 125086148 qF2 gpos33 +chr6 125086148 131926797 qF3 gneg +chr6 131926797 139256063 qG1 gpos66 +chr6 139256063 142676388 qG2 gneg +chr6 142676388 149517037 qG3 gpos33 +chr7 0 15943333 qA1 gpos100 +chr7 15943333 19131999 qA2 gneg +chr7 19131999 29760888 qA3 gpos33 +chr7 29760888 36138221 qB1 gneg +chr7 36138221 39326888 qB2 gpos33 +chr7 39326888 49955776 qB3 gneg +chr7 49955776 56864554 qB4 gpos33 +chr7 56864554 63773332 qB5 gneg +chr7 63773332 74933665 qC gpos100 +chr7 74933665 80779554 qD1 gneg +chr7 80779554 84499665 qD2 gpos66 +chr7 84499665 94597109 qD3 gneg +chr7 94597109 104694553 qE1 gpos100 +chr7 104694553 107351775 qE2 gneg +chr7 107351775 116917775 qE3 gpos33 +chr7 116917775 124357997 qF1 gneg +chr7 124357997 129140997 qF2 gpos33 +chr7 129140997 144021442 qF3 gneg +chr7 
144021442 147741553 qF4 gpos33 +chr7 147741553 152524553 qF5 gneg +chr8 0 16228701 qA1.1 gpos100 +chr8 16228701 17183331 qA1.2 gneg +chr8 17183331 20524534 qA1.3 gpos33 +chr8 20524534 30070829 qA2 gneg +chr8 30070829 34366662 qA3 gpos33 +chr8 34366662 44867586 qA4 gneg +chr8 44867586 49163419 qB1.1 gpos66 +chr8 49163419 51072678 qB1.2 gneg +chr8 51072678 56800455 qB1.3 gpos66 +chr8 56800455 60618973 qB2 gneg +chr8 60618973 68256009 qB3.1 gpos100 +chr8 68256009 69210638 qB3.2 gneg +chr8 69210638 75893045 qB3.3 gpos100 +chr8 75893045 81620822 qC1 gneg +chr8 81620822 86393969 qC2 gpos33 +chr8 86393969 91644432 qC3 gneg +chr8 91644432 93076376 qC4 gpos33 +chr8 93076376 97372209 qC5 gneg +chr8 97372209 105009245 qD1 gpos100 +chr8 105009245 105963874 qD2 gneg +chr8 105963874 112646281 qD3 gpos33 +chr8 112646281 126011094 qE1 gneg +chr8 126011094 131738871 qE2 gpos33 +chr9 0 14352094 qA1 gpos100 +chr9 14352094 19444773 qA2 gneg +chr9 19444773 24074481 qA3 gpos33 +chr9 24074481 37963604 qA4 gneg +chr9 37963604 43982225 qA5.1 gpos66 +chr9 43982225 46297079 qA5.2 gneg +chr9 46297079 54630553 qA5.3 gpos66 +chr9 54630553 62964027 qB gneg +chr9 62964027 69445618 qC gpos33 +chr9 69445618 77316122 qD gneg +chr9 77316122 82408800 qE1 gpos33 +chr9 82408800 84260683 qE2 gneg +chr9 84260683 90742275 qE3.1 gpos100 +chr9 90742275 91205245 qE3.2 gneg +chr9 91205245 100464661 qE3.3 gpos100 +chr9 100464661 101390603 qE4 gpos66 +chr9 101390603 107872194 qF1 gneg +chr9 107872194 110650019 qF2 gpos33 +chr9 110650019 119446464 qF3 gneg +chr9 119446464 124076172 qF4 gpos33 +chrX 0 15368327 qA1.1 gpos100 +chrX 15368327 17769628 qA1.2 gneg +chrX 17769628 20651189 qA1.3 gpos33 +chrX 20651189 27374832 qA2 gneg +chrX 27374832 32657694 qA3.1 gpos66 +chrX 32657694 33618215 qA3.2 gneg +chrX 33618215 38901077 qA3.3 gpos66 +chrX 38901077 46585241 qA4 gneg +chrX 46585241 54749664 qA5 gpos66 +chrX 54749664 61473308 qA6 gneg +chrX 61473308 67716690 qA7.1 gpos66 +chrX 67716690 69157471 qA7.2 gneg +chrX 
69157471 75400854 qA7.3 gpos66 +chrX 75400854 80203456 qB gneg +chrX 80203456 88848140 qC1 gpos100 +chrX 88848140 89808660 qC2 gneg +chrX 89808660 98453344 qC3 gpos100 +chrX 98453344 107098028 qD gneg +chrX 107098028 117183493 qE1 gpos100 +chrX 117183493 118144013 qE2 gneg +chrX 118144013 131591300 qE3 gpos100 +chrX 131591300 137834682 qF1 gneg +chrX 137834682 145038586 qF2 gpos33 +chrX 145038586 152242489 qF3 gneg +chrX 152242489 159446392 qF4 gpos33 +chrX 159446392 166650296 qF5 gneg +chrY 0 3578074 qA1 gpos100 +chrY 3578074 5665285 qA2 gpos66 +chrY 5665285 7851886 qB gpos33 +chrY 7851886 9442142 qC1 gpos100 +chrY 9442142 10734224 qC2 gpos33 +chrY 10734224 12523262 qC3 gpos100 +chrY 12523262 14411690 qD gpos33 +chrY 14411690 15902555 qE gpos66 +chr10 0 12754055 qA1 gpos100 +chr10 12754055 17659461 qA2 gneg +chr10 17659461 23545948 qA3 gpos33 +chr10 23545948 33356759 qA4 gneg +chr10 33356759 41205409 qB1 gpos100 +chr10 41205409 48072977 qB2 gneg +chr10 48072977 55921626 qB3 gpos100 +chr10 55921626 63770276 qB4 gneg +chr10 63770276 67694600 qB5.1 gpos100 +chr10 67694600 68185141 qB5.2 gneg +chr10 68185141 74562168 qB5.3 gpos100 +chr10 74562168 88787845 qC1 gneg +chr10 88787845 95655414 qC2 gpos33 +chr10 95655414 98598657 qC3 gneg +chr10 98598657 111352712 qD1 gpos100 +chr10 111352712 124106767 qD2 gneg +chr10 124106767 129993255 qD3 gpos33 +chr11 0 13021480 qA1 gpos100 +chr11 13021480 17206956 qA2 gneg +chr11 17206956 21857485 qA3.1 gpos100 +chr11 21857485 25577908 qA3.2 gneg +chr11 25577908 30228437 qA3.3 gpos100 +chr11 30228437 36274125 qA4 gneg +chr11 36274125 43249918 qA5 gpos100 +chr11 43249918 47900447 qB1.1 gneg +chr11 47900447 49760658 qB1.2 gpos33 +chr11 49760658 59991822 qB1.3 gneg +chr11 59991822 62782139 qB2 gpos33 +chr11 62782139 70688038 qB3 gneg +chr11 70688038 73943408 qB4 gpos33 +chr11 73943408 81849307 qB5 gneg +chr11 81849307 90220259 qC gpos100 +chr11 90220259 102311634 qD gneg +chr11 102311634 110217533 qE1 gpos66 +chr11 110217533 121843856 qE2 
gneg +chr12 0 17766671 qA1.1 gpos100 +chr12 17766671 21320005 qA1.2 gneg +chr12 21320005 26205839 qA1.3 gpos66 +chr12 26205839 31980007 qA2 gneg +chr12 31980007 39530843 qA3 gpos33 +chr12 39530843 44416677 qB1 gneg +chr12 44416677 45305011 qB2 gpos33 +chr12 45305011 52411679 qB3 gneg +chr12 52411679 66625016 qC1 gpos100 +chr12 66625016 71955017 qC2 gneg +chr12 71955017 81726686 qC3 gpos100 +chr12 81726686 86168354 qD1 gneg +chr12 86168354 89277522 qD2 gpos33 +chr12 89277522 96384190 qD3 gneg +chr12 96384190 107044193 qE gpos100 +chr12 107044193 115483361 qF1 gneg +chr12 115483361 121257530 qF2 gpos66 +chr13 0 16267960 qA1 gpos100 +chr13 16267960 21197645 qA2 gneg +chr13 21197645 29578109 qA3.1 gpos66 +chr13 29578109 33028888 qA3.2 gneg +chr13 33028888 41409353 qA3.3 gpos33 +chr13 41409353 44367164 qA4 gneg +chr13 44367164 52747628 qA5 gpos33 +chr13 52747628 59156219 qB1 gneg +chr13 59156219 61621061 qB2 gpos33 +chr13 61621061 69508557 qB3 gneg +chr13 69508557 78381990 qC1 gpos33 +chr13 78381990 80846832 qC2 gneg +chr13 80846832 94649950 qC3 gpos100 +chr13 94649950 106481194 qD1 gneg +chr13 106481194 110424942 qD2.1 gpos33 +chr13 110424942 116340564 qD2.2 gneg +chr13 116340564 120284312 qD2.3 gpos33 +chr14 0 15023383 qA1 gpos100 +chr14 15023383 19530398 qA2 gneg +chr14 19530398 30046767 qA3 gpos33 +chr14 30046767 43567812 qB gneg +chr14 43567812 52081063 qC1 gpos100 +chr14 52081063 55085740 qC2 gneg +chr14 55085740 60093534 qC3 gpos66 +chr14 60093534 69107564 qD1 gneg +chr14 69107564 73113800 qD2 gpos33 +chr14 73113800 85132507 qD3 gneg +chr14 85132507 89138743 qE1 gpos66 +chr14 89138743 99154332 qE2.1 gpos100 +chr14 99154332 100155891 qE2.2 gneg +chr14 100155891 107667583 qE2.3 gpos100 +chr14 107667583 111173039 qE3 gneg +chr14 111173039 121188628 qE4 gpos100 +chr14 121188628 125194864 qE5 gneg +chr15 0 16413299 qA1 gpos100 +chr15 16413299 24164024 qA2 gneg +chr15 24164024 29635124 qB1 gpos33 +chr15 29635124 31914749 qB2 gneg +chr15 31914749 42856949 qB3.1 gpos100 
+chr15 42856949 44680649 qB3.2 gneg +chr15 44680649 49695824 qB3.3 gpos66 +chr15 49695824 53343224 qC gneg +chr15 53343224 66109124 qD1 gpos100 +chr15 66109124 68388749 qD2 gneg +chr15 68388749 77051324 qD3 gpos66 +chr15 77051324 83434274 qE1 gneg +chr15 83434274 86625749 qE2 gpos33 +chr15 86625749 95288324 qE3 gneg +chr15 95288324 100759424 qF1 gpos66 +chr15 100759424 101671274 qF2 gneg +chr15 101671274 103494974 qF3 gpos33 +chr16 0 15450152 qA1 gpos100 +chr16 15450152 16386525 qA2 gneg +chr16 16386525 20600202 qA3 gpos33 +chr16 20600202 26218440 qB1 gneg +chr16 26218440 32304863 qB2 gpos33 +chr16 32304863 38391287 qB3 gneg +chr16 38391287 44945897 qB4 gpos33 +chr16 44945897 53841439 qB5 gneg +chr16 53841439 58055117 qC1.1 gpos66 +chr16 58055117 58991490 qC1.2 gneg +chr16 58991490 66950659 qC1.3 gpos66 +chr16 66950659 70696150 qC2 gneg +chr16 70696150 79123506 qC3.1 gpos100 +chr16 79123506 79591692 qC3.2 gneg +chr16 79591692 91764540 qC3.3 gpos100 +chr16 91764540 98319150 qC4 gneg +chr17 0 13984976 qA1 gpos100 +chr17 13984976 16170128 qA2 gneg +chr17 16170128 17481220 qA3.1 gpos33 +chr17 17481220 21851525 qA3.2 gneg +chr17 21851525 31466196 qA3.3 gpos66 +chr17 31466196 40206806 qB1 gneg +chr17 40206806 41517898 qB2 gpos33 +chr17 41517898 45888203 qB3 gneg +chr17 45888203 55939905 qC gpos66 +chr17 55939905 60310210 qD gneg +chr17 60310210 68176759 qE1.1 gpos100 +chr17 68176759 69050820 qE1.2 gneg +chr17 69050820 73421125 qE1.3 gpos100 +chr17 73421125 78665491 qE2 gneg +chr17 78665491 83035796 qE3 gpos33 +chr17 83035796 89154223 qE4 gneg +chr17 89154223 95272651 qE5 gpos33 +chr18 0 19420992 qA1 gpos100 +chr18 19420992 29553684 qA2 gneg +chr18 29553684 35464421 qB1 gpos66 +chr18 35464421 37153203 qB2 gneg +chr18 37153203 45597113 qB3 gpos100 +chr18 45597113 49819068 qC gneg +chr18 49819068 54041023 qD1 gpos100 +chr18 54041023 54463218 qD2 gneg +chr18 54463218 60796150 qD3 gpos100 +chr18 60796150 67973474 qE1 gneg +chr18 67973474 75150797 qE2 gpos33 +chr18 75150797 
83594707 qE3 gneg +chr18 83594707 90772031 qE4 gpos33 +chr19 0 16655891 qA gpos100 +chr19 16655891 25593199 qB gneg +chr19 25593199 34936748 qC1 gpos66 +chr19 34936748 38186678 qC2 gneg +chr19 38186678 47530227 qC3 gpos66 +chr19 47530227 51592639 qD1 gneg +chr19 51592639 58904982 qD2 gpos33 +chr19 58904982 61342430 qD3 gneg diff --git a/bp_doc/biopieces_cookbook.lyx b/bp_doc/biopieces_cookbook.lyx new file mode 100644 index 0000000..1dc2694 --- /dev/null +++ b/bp_doc/biopieces_cookbook.lyx @@ -0,0 +1,7258 @@ +#LyX 1.5.1 created this file. For more info see http://www.lyx.org/ +\lyxformat 276 +\begin_document +\begin_header +\textclass scrartcl +\begin_preamble +\usepackage[colorlinks=true, urlcolor=blue, linkcolor=black]{hyperref} +\end_preamble +\language english +\inputencoding auto +\font_roman default +\font_sans default +\font_typewriter default +\font_default_family default +\font_sc false +\font_osf false +\font_sf_scale 100 +\font_tt_scale 100 +\graphics default +\paperfontsize default +\spacing single +\papersize default +\use_geometry false +\use_amsmath 1 +\use_esint 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation skip +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes false +\output_changes false +\author "" +\author "" +\end_header + +\begin_body + +\begin_layout Title +Biopieces Cookbook +\end_layout + +\begin_layout Author +Martin Asser Hansen +\end_layout + +\begin_layout Publishers +John Mattick Group +\newline +Institute for Molecular Bioscience +\newline +University of Queensland +\newline +Aust +ralia +\newline +E-mail: mail@maasha.dk +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Standard + + +\backslash +thispagestyle{empty} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard + +\newpage + +\end_layout + +\begin_layout Standard +\begin_inset 
LatexCommand tableofcontents + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset FloatList figure + +\end_inset + + +\end_layout + +\begin_layout Standard + +\newpage + +\end_layout + +\begin_layout Section +Introduction +\end_layout + +\begin_layout Standard +Biopieces is a collection of bioinformatic tools that can be linked together + (piped as we shall call it) in a very flexible manner to perform both simple + and complex tasks. + The fundamental idea is that biopieces work on a data stream that will + only terminate at the end of an analysis and that this data stream can + be passed through several different biopieces, each performing one specific + task. + The advantage of this approach is that a user can perform simple and complex + tasks without having to write advanced code. + Moreover, since the data format used to pass data between biopieces is + text based, biopieces can be written by different developers in their favorite + programming language --- and still the biopieces will be able to work together. +\end_layout + +\begin_layout Standard +In the most simple form biopieces can be piped together on the command line + like this (using the pipe character '|'): +\end_layout + +\begin_layout LyX-Code +read_data | calculate_something | write_result +\end_layout + +\begin_layout Standard +However, a more comprehensive analysis could be composed: +\end_layout + +\begin_layout LyX-Code +read_data | select_entries | convert_entries | search_database | +\end_layout + +\begin_layout LyX-Code +evaluate_results | plot_diagram | plot_another_diagram | +\end_layout + +\begin_layout LyX-Code +load_to_database +\end_layout + +\begin_layout Standard +The data stream that is piped through the biopieces consists of records + of key/value pairs in the same way a hash does in order to keep as simple + a structure as possible. 
+ An example record can be seen below: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +REC_TYPE: PATSCAN +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +MATCH: AGATCAAGTG +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +S_BEG: 7 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +S_END: 16 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +ALIGN_LEN: 9 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +S_ID: piR-t6 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +STRAND: + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +PATTERN: AGATCAAGTG +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout Standard +The ' +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\backslash +/- +\end_layout + +\end_inset + +' denotes the delimiter of the records, and each key is a word followed + by a ':' and a white-space and then the value. + By convention the biopieces only use upper case keys (a list of used keys + can be seen in Appendix\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sec:Keys" + +\end_inset + +). + Since the records basically are hash structures this means that the order + of the keys in the stream is arbitrary, and in the above example it is + pure coincidence that S_BEG is displayed before S_END, however, when + the order of the keys is important, the biopieces will automagically see + to that. +\end_layout + +\begin_layout Standard +All of the biopieces are able to read and write a data stream to and from + file as long as the records are in the biopieces format. 
+ This means that if you are undertaking a lengthy analysis where one of + the steps is time consuming, you may save the stream after this step, and + subsequently start one or more analyses from that last step +\begin_inset Foot +status collapsed + +\begin_layout Standard +It is a goal that the biopieces at some point will be able to dump the data + stream to file in case one of the tools fails critically. +\end_layout + +\end_inset + +. + If you are running a lengthy analysis it is highly recommended that you + create a small test sample of the data and run that through the pipeline + --- and once you are satisfied with the result proceed with the full data + set (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-select-a-few-records" + +\end_inset + +). +\end_layout + +\begin_layout Standard +All of the biopieces can be supplied with long arguments prefixed with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + + switches or single character arguments prefixed with - switches that can + be grouped together (e.g. + -xok). + In this cookbook only the long switches are used to emphasize what these + switches do. +\end_layout + +\begin_layout Section +Setup +\end_layout + +\begin_layout Standard +In order to get the biopieces to work, you need to add environment settings + to include the code binaries, scripts, and modules that constitute the + biopieces package. + Assuming that you are using bash, add the following to your '~/.bashrc' + file using your favorite editor. + After the changes have been saved you need to either run 'source ~/.bashrc' + or relogin. 
+\end_layout + +\begin_layout LyX-Code + +\size scriptsize +if [ -f "/home/m.hansen/maasha/conf/bashrc" ]; then +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + source "/home/m.hansen/maasha/conf/bashrc" +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +fi +\end_layout + +\begin_layout Section +Getting Started +\end_layout + +\begin_layout Standard +The biopiece +\series bold +list_biopieces +\series default + lists all the biopieces along with a description: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +list_biopieces +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +align_seq Align sequences in stream using Muscle. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +analyze_seq Analysis the residue composition of each sequence + in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +analyze_vals Determine type, count, min, max, sum and mean for + values in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +blast_seq BLAST sequences in stream against a specified database. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +blat_seq BLAT sequences in stream against a specified genome. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +complement_seq Complement sequences in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_records Count the number of records in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_seq Count sequences in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_vals Count the number of times values of given keys exists + in stream. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +create_blast_db Create a BLAST database from sequences in stream for + use with BLAST. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +... 
+\end_layout + +\begin_layout Standard +To list the biopieces for writing different formats, you can use unix's + grep like this: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +list_biopieces | grep write +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_align Write aligned sequences in pretty alignment format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_bed Write records from stream as BED lines. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_blast Write BLAST records from stream in BLAST tabular format + (-m8 and 9). +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_fasta Write sequences in FASTA format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_psl Write records from stream in PSL format. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +write_tab Write records from stream as tab separated table. +\end_layout + +\begin_layout Standard +In order to find out how a specific biopiece works, you just type the program + name without any arguments and press return and the usage of the biopiece + will be displayed. + E.g. 
+ +\series bold +read_fasta +\series default + : +\end_layout + +\begin_layout Standard +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +width "100col%" +special "none" +height "1in" +height_special "totalheight" +status open + +\begin_layout LyX-Code + +\size scriptsize +Program name: read_fasta +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Author: Martin Asser Hansen - Copyright (C) - All rights reserved +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Contact: mail@maasha.dk +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Date: August 2007 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/ +gpl.html) +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Description: Read FASTA entries. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Usage: read_fasta [options] -i +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Options: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-i | --data_in=] - Comma separated list of files + to read. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-n | --num=] - Limit number of records to read. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-I | --stream_in=] - Read input stream from file + - Default=STDIN +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-O | --stream_out=] - Write output stream to file + - Default=STDOUT +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Examples: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test.fna - Read FASTA entries from file. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test1.fna,test2,fna - Read FASTA entries from files. 
+\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i '*.fna' - Read FASTA entries from files. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test.fna -n 10 - Read first 10 FASTA entries from + file. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +The Data Stream +\end_layout + +\begin_layout Subsection +How to read the data stream from file? +\begin_inset LatexCommand label +name "sub:How-to-read-stream" + +\end_inset + + +\end_layout + +\begin_layout Standard +You want to read a data stream that you previously have saved to file in + biopieces format. + This can be done implicitly or explicitly. + The implicit way uses the 'stdout' stream of the Unix terminal: +\end_layout + +\begin_layout LyX-Code +cat <file> | <biopiece> +\end_layout + +\begin_layout Standard +cat is the Unix command that reads a file and outputs the result to 'stdout' + --- which in this case is piped to any biopiece represented by the <biopiece>. + It is also possible to read the data stream using '<' to direct the file to + the 'stdin' stream of the biopiece like this: +\end_layout + +\begin_layout LyX-Code +<biopiece> < <file> +\end_layout + +\begin_layout Standard +However, that will not work if you pipe more biopieces together. + Then it is much safer to read the stream from a file explicitly like this: +\end_layout + +\begin_layout LyX-Code +<biopiece> --stream_in=<file> +\end_layout + +\begin_layout Standard +Here the filename is explicitly given to the biopiece + with the switch +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_in. + This switch works with all biopieces. + It is also possible to read in data from multiple sources by repeating + the explicit read step: +\end_layout + +\begin_layout LyX-Code +<biopiece> --stream_in=<file1> | <biopiece> --stream_in=<file2> +\end_layout + +\begin_layout Subsection +How to write the data stream to file? 
+\begin_inset LatexCommand label +name "sub:How-to-write-stream" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to save the output stream from a biopiece to file, so you can read + in the stream again at a later time, you can do one of two things: +\end_layout + +\begin_layout LyX-Code +<biopiece> > <file> +\end_layout + +\begin_layout Standard +All the biopieces write the data stream to 'stdout' by default which can + be written to a file by redirecting 'stdout' to file using '>'; however, + if one of the biopieces for writing other formats is used then both + the biopieces records as well as the result output will go to 'stdout' + in a mixture causing havoc! To avoid this you must use the switch +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out that explicitly tells the biopiece to write the output stream + to file: +\end_layout + +\begin_layout LyX-Code +<biopiece> --stream_out=<file> +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out switch works with all biopieces. +\end_layout + +\begin_layout Subsection +How to terminate the data stream? +\end_layout + +\begin_layout Standard +The data stream never stops unless the user wants to save the stream or + supplies the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch that will terminate the stream: +\end_layout + +\begin_layout LyX-Code +<biopiece> --no_stream +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch only works with those biopieces where it makes sense that + the user might want to terminate the data stream, +\emph on +i.e +\emph default +. 
+ after an analysis step where the user wants to output the result, but not + the data stream. +\end_layout + +\begin_layout Subsection +How to write my results to file? +\begin_inset LatexCommand label +name "sub:How-to-write-result" + +\end_inset + + +\end_layout + +\begin_layout Standard +Saving the result of an analysis to file can be done implicitly or explicitly. + The implicit way: +\end_layout + +\begin_layout LyX-Code +<biopiece> --no_stream > <file> +\end_layout + +\begin_layout Standard +If you use '>' to redirect 'stdout' to file then it is important to use + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch to avoid writing a mix of biopieces records and result + to the same file causing havoc. + The safe way is to use the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch which explicitly tells the biopiece to write the result + to a given file: +\end_layout + +\begin_layout LyX-Code +<biopiece> --result_out=<file> +\end_layout + +\begin_layout Standard +Using the above method will not terminate the stream, so it is possible + to pipe that into another biopiece generating different results: +\end_layout + +\begin_layout LyX-Code +<biopiece> --result_out=<file1> | <biopiece> --result_out=<file2> +\end_layout + +\begin_layout Standard +And still the data stream will continue unless terminated with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream: +\end_layout + +\begin_layout LyX-Code +<biopiece> --result_out=<file> --no_stream +\end_layout + +\begin_layout Standard +Or written to file implicitly or explicitly +\begin_inset LatexCommand eqref +reference "sub:How-to-write-stream" + +\end_inset + +. + The explicit way: +\end_layout + +\begin_layout LyX-Code +<biopiece> --result_out=<file1> --stream_out=<file2> +\end_layout + +\begin_layout Subsection +How to read data from multiple sources? 
+\end_layout + +\begin_layout Standard +To read multiple data sources, with the same type or different type of data + do: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= +\end_layout + +\begin_layout Standard +where type is the data type a specific biopiece reads. +\end_layout + +\begin_layout Section +Reading input +\end_layout + +\begin_layout Subsection +How to read biopieces input? +\end_layout + +\begin_layout Standard +See +\begin_inset LatexCommand eqref +reference "sub:How-to-read-stream" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +How to read in data? +\end_layout + +\begin_layout Standard +Data in different formats can be read with the appropriate biopiece for + that format. + The biopieces are typicalled named 'read_' such as +\series bold +read_fasta +\series default +, +\series bold +read_bed +\series default +, +\series bold +read_tab +\series default +, etc., and all behave in a similar manner. + Data can be read by supplying the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +data_in switch and a file name to the file containing the data: +\end_layout + +\begin_layout LyX-Code + --data_in= +\end_layout + +\begin_layout Standard +It is also possible to read in a saved biopieces stream (see +\begin_inset LatexCommand ref +reference "sub:How-to-read-stream" + +\end_inset + +) as well as reading data in one go: +\end_layout + +\begin_layout LyX-Code + --stream_in= --data_in= +\end_layout + +\begin_layout Standard +If you want to read data from several files you can do this: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= +\end_layout + +\begin_layout Standard +If you have several data files you can read in all explicitly with a comma + separated list: +\end_layout + +\begin_layout LyX-Code + --data_in=file1,file2,file3 +\end_layout + +\begin_layout Standard +And it is also possible to use file globbing +\begin_inset Foot +status open + 
+\begin_layout Standard +using the short option will only work if you quote the argument -i '*.fna' +\end_layout + +\end_inset + +: +\end_layout + +\begin_layout LyX-Code + --data_in=*.fna +\end_layout + +\begin_layout Standard +Or in a combination: +\end_layout + +\begin_layout LyX-Code + --data_in=file1,/dir/*.fna +\end_layout + +\begin_layout Standard +Finally, it is possible to read in data in different formats using the appropria +te biopiece for each format: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= ... +\end_layout + +\begin_layout Subsection +How to read FASTA input? +\end_layout + +\begin_layout Standard +Sequences in FASTA format can be read explicitly using +\series bold +read_fasta +\series default +: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= +\end_layout + +\begin_layout Subsection +How to read alignment input? +\end_layout + +\begin_layout Standard +If your alignment is FASTA formatted then you can use +\series bold +read_align +\series default +. + It is also possible to use +\series bold +read_fasta +\series default + since the data is FASTA formatted, however, with +\series bold +read_fasta +\series default + the key ALIGN will be omitted. + The ALIGN key is used to determine which sequences belong to what alignment + which is required for +\series bold +write_align +\series default +. +\end_layout + +\begin_layout LyX-Code +read_align --data_in= +\end_layout + +\begin_layout Subsection +How to read tabular input? +\begin_inset LatexCommand label +name "sub:How-to-read-table" + +\end_inset + + +\end_layout + +\begin_layout Standard +Tabular input can be read with +\series bold +read_tab +\series default + which will read in all rows and chosen columns (separated by a given delimiter) + from a table in text format. 
+\end_layout + +\begin_layout Standard +The table below: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Standard +Human +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +ATACGTCAG +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +23524 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Dog +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +AGCATGAC +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +2442 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Mouse +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +GACTG +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +234 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Cat +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +AAATGCA +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +2342 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +Can be read using the command: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= +\end_layout + +\begin_layout Standard +Which will result in four records, one for each row, where the keys V0, + V1, V2 are the default keys for the organism, sequence, and count, respectively. + It is possible to select a subset of colums to read by using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +cols switch which takes a comma separated list of columns numbers (first + column is designated 0) as argument. 
+ So to read in only the sequence and the count so that the count comes before + the sequence do: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --cols=2,1 +\end_layout + +\begin_layout Standard +It is also possible to name the columns with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --cols=2,1 --keys=COUNT,SEQ +\end_layout + +\begin_layout Subsection +How to read BED input? +\end_layout + +\begin_layout Standard +The BED (Browser Extensible Data +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://genome.ucsc.edu/FAQ/FAQformat" + +\end_inset + + +\end_layout + +\end_inset + +) format is a tabular format for data pertaining to one of the Eukaryotic + genomes in the UCSC genome brower +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://genome.ucsc.edu/" + +\end_inset + + +\end_layout + +\end_inset + +. + The BED format consists of up to 12 columns, where the first three are + mandatory CHR, CHR_BEG, and CHR_END. + The mandatory columns and any of the optional columns can all be read in + easily with the +\series bold +read_bed +\series default + biopiece. +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= +\end_layout + +\begin_layout Standard +It is also possible to read the BED file with +\series bold +read_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-read-table" + +\end_inset + +), however, that will be more cumbersome because you need to specify the + keys: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --keys=CHR,CHR_BEG,CHR_END ... +\end_layout + +\begin_layout Subsection +How to read PSL input? 
+\end_layout + +\begin_layout Standard +The PSL format is the output from BLAT and contains 21 mandatory fields + that can be read with +\series bold +read_psl +\series default +: +\end_layout + +\begin_layout LyX-Code +read_psl --data_in= +\end_layout + +\begin_layout Section +Writing output +\end_layout + +\begin_layout Standard +All result output can be written explicitly to file using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch which all result generating biopieces have. + It is also possible to write the result to file implicitly by directing + 'stdout' to file using '>', however, that requires the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch to prevent a mixture of data stream and results in the file. + The explicit (and safe) way: +\end_layout + +\begin_layout LyX-Code +... + | --result_out= +\end_layout + +\begin_layout Standard +The implicit way: +\end_layout + +\begin_layout LyX-Code +... + | --no_stream > +\end_layout + +\begin_layout Subsection +How to write biopieces output? +\end_layout + +\begin_layout Standard +See +\begin_inset LatexCommand eqref +reference "sub:How-to-write-stream" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +How to write FASTA output? +\begin_inset LatexCommand label +name "sub:How-to-write-fasta" + +\end_inset + + +\end_layout + +\begin_layout Standard +FASTA output can be written with +\series bold +write_fasta +\series default +. +\end_layout + +\begin_layout LyX-Code +... + | write_fasta --result_out= +\end_layout + +\begin_layout Standard +It is also possible to wrap the sequences to a given width using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +wrap switch although wrapping of sequence is generally an evil thing: +\end_layout + +\begin_layout LyX-Code +... 
+ | write_fasta --no_stream --wrap=80 +\end_layout + +\begin_layout Subsection +How to write alignment output? +\begin_inset LatexCommand label +name "sub:How-to-write-alignment" + +\end_inset + + +\end_layout + +\begin_layout Standard +Pretty alignments with ruler +\begin_inset Foot +status collapsed + +\begin_layout Standard +'.' for every 10 residues, ':' for every 50, and '|' for every 100 +\end_layout + +\end_inset + + and consensus sequence +\begin_inset Note Note +status collapsed + +\begin_layout Standard +which reminds me to make that an option. +\end_layout + +\end_inset + + can be created with +\series bold +write_align +\series default +, which also has the optional +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +wrap switch to break the alignment into blocks of a given width: +\end_layout + +\begin_layout LyX-Code +... + | write_align --result_out= --wrap=80 +\end_layout + +\begin_layout Standard +If the number of sequences in the alignment is 2 then a pairwise alignment + will be output otherwise a multiple alignment. + And if the sequence type, determined automagically, is protein, then residues + and symbols (+,\InsetSpace ~ +:,\InsetSpace ~ +.) will be used to show consensus according to the Blosum62 + matrix. +\end_layout + +\begin_layout Subsection +How to write tabular output? +\begin_inset LatexCommand label +name "sub:How-to-write-tab" + +\end_inset + + +\end_layout + +\begin_layout Standard +Outputting the data stream as a table can be done with +\series bold +write_tab +\series default +, which will generate one row per record with the values as columns. + If you supply the optional +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comment switch, then the first row in the table will be a 'comment' line + prefixed with a '#': +\end_layout + +\begin_layout LyX-Code +... 
+ | write_tab --result_out= --comment +\end_layout + +\begin_layout Standard +You can also change the delimiter from the default (tab) to +\emph on +e.g. + +\emph default + ',': +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --delimit=',' +\end_layout + +\begin_layout Standard +If you want the values output in a specific order you have to supply a comma + separated list using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch that will print only those keys in that order: +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --keys=SEQ_NAME,COUNT +\end_layout + +\begin_layout Standard +Alternatively, if you have some keys that you don't want in the tabular + output, use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_keys switch. + So to print all keys except SEQ and SEQ_TYPE do: +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --no_keys=SEQ,SEQ_TYPE +\end_layout + +\begin_layout Standard +Finally, if you have a stream containing a mix of different records types, + +\emph on +e.g. + +\emph default + records with sequences and records with matches, then you can use +\series bold +write_tab +\series default + to output all the records in tabluar format, however, the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comment, +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys, and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_keys switches will only respond to records of the first type encountered. 
+ The reason is that outputting mixed records is probably not what you want + anyway, and you should remove all the unwanted records from the stream + before outputting the table: +\series bold +grab +\series default + is your friend (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to write a BED output? +\begin_inset LatexCommand label +name "sub:How-to-write-BED" + +\end_inset + + +\end_layout + +\begin_layout Standard +Data in BED format can be output if the records contain the mandatory keys + CHR, CHR_BEG, and CHR_END using +\series bold +write_bed +\series default +. + If the optional keys are also present, they will be output as well: +\end_layout + +\begin_layout LyX-Code +write_bed --result_out= +\end_layout + +\begin_layout Subsection +How to write PSL output? +\begin_inset LatexCommand label +name "sub:How-to-write-PSL" + +\end_inset + + +\end_layout + +\begin_layout Standard +Data in PSL format can be output using +\series bold +write_psl: +\end_layout + +\begin_layout LyX-Code +write_psl --result_out= +\end_layout + +\begin_layout Section +Manipulating Records +\end_layout + +\begin_layout Subsection +How to select a few records? +\begin_inset LatexCommand label +name "sub:How-to-select-a-few-records" + +\end_inset + + +\end_layout + +\begin_layout Standard +To quickly get an overview of your data you can limit the data stream to + show a few records. + This also very useful to test the pipeline with a few records if you are + setting up a complex analysis using several biopieces. + That way you can inspect that all goes well before analyzing and waiting + for the full data set. + All of the read_ biopieces have the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch which will take a number as argument and only that number of + records will be read. 
+ So to read in the first 10 FASTA entries from a file: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna --num=10 +\end_layout + +\begin_layout Standard +Another way of doing this is to use +\series bold +head_records +\series default +, which will limit the stream to show the first 10 records (default): +\end_layout + +\begin_layout LyX-Code +... + | head_records +\end_layout + +\begin_layout Standard +Using +\series bold +head_records +\series default + directly after one of the read_ biopieces will be a lot slower than + using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch with the read_ biopieces, however, +\series bold +head_records +\series default + can also be used to limit the output from all the other biopieces. + It is also possible to give +\series bold +head_records +\series default + a number of records to show using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch. + So to display the first 100 records do: +\end_layout + +\begin_layout LyX-Code +... + | head_records --num=100 +\end_layout + +\begin_layout Subsection +How to select random records? +\begin_inset LatexCommand label +name "sub:How-to-select-random-records" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you want to inspect a number of random records from the stream this can + be done with the +\series bold +random_records +\series default + biopiece. + So if you have 1 million records in the stream and you want to select 1000 + random records do: +\end_layout + +\begin_layout LyX-Code +... + | random_records --num=1000 +\end_layout + +\begin_layout Subsection +How to count all records in the data stream? 
+\end_layout + +\begin_layout Standard +To count all the records in the data stream use +\series bold +count_records +\series default +, which adds one record (which is not included in the count) to the data + stream. + So to count the number of sequences in a FASTA file you can do this: +\end_layout + +\begin_layout LyX-Code +cat test.fna | read_fasta | count_records --no_stream +\end_layout + +\begin_layout Standard +Which will write the last record containing the count to 'stdout': +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_records: 630 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout Standard +It is also possible to write the count to file using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch. +\end_layout + +\begin_layout Subsection +How to get the length of record values? +\begin_inset LatexCommand label +name "sub:How-to-get-value_length" + +\end_inset + + +\end_layout + +\begin_layout Standard +Use the +\series bold +length_vals +\series default + biopiece to get the length of each value for a comma separated list of + keys: +\end_layout + +\begin_layout LyX-Code +... + | length_vals --keys=HIT,PATTERN +\end_layout + +\begin_layout Subsection +How to grab specific records? +\begin_inset LatexCommand label +name "sub:How-to-grab" + +\end_inset + + +\end_layout + +\begin_layout Standard +The biopiece +\series bold +grab +\series default + is related to the Unix grep and locates records based on matching keys + and/or values using either a pattern, a Perl regex, or a numerical evaluation. + To easily +\series bold +grab +\series default + all records in the stream that has any mentioning of the pattern 'human' + just pipe the data stream through +\series bold +grab +\series default + like this: +\end_layout + +\begin_layout LyX-Code +... 
+ | grab --pattern=human +\end_layout + +\begin_layout Standard +This will search for the pattern 'human' in all keys and all values. + The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch takes a comma separated list of patterns, so in order to + match multiple patterns do: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human,mouse +\end_layout + +\begin_layout Standard +It is also possible to use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in switch instead of +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern. + +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in is used to read a file with one pattern per line: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern_in=patterns.txt +\end_layout + +\begin_layout Standard +If you want the opposite result --- to find all records that does not match + the patterns, add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +invert switch, which not only works with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch, but also with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +regex and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +eval: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human --invert +\end_layout + +\begin_layout Standard +If you want to search the record keys only, +\emph on +e.g. 
+ +\emph default + to find all records containing the key SEQ you can add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys_only switch. + This will prevent matching of SEQ in any record value; since SEQ is in + fact a not uncommon peptide sequence, you could otherwise get an unwanted record. + Also, this will give an increase in speed since only the keys are searched: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=SEQ --keys_only +\end_layout + +\begin_layout Standard +However, if you are interested in finding the peptide sequence SEQ and not + the SEQ key, just add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +vals_only switch instead: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=SEQ --vals_only +\end_layout + +\begin_layout Standard +Also, if you want to grab for certain key/value pairs you can supply a comma + separated list of keys whose values will then be searched using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch. + This is handy if your records contain large genomic sequences and you don't + want to search the entire sequence for +\emph on +e.g. + +\emph default + the organism name --- it is much faster to tell +\series bold +grab +\series default + which keys to search the value for: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human --keys=SEQ_NAME +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Standard +It is also possible to invoke flexible matching using regex (regular expressions +) instead of simple pattern matching. 
+ In +\series bold +grab +\series default + the regex engine is Perl based and allows use of different types of wild + cards, alternatives, +\emph on +etc +\emph default + +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://perldoc.perl.org/perlreref.html" + +\end_inset + + +\end_layout + +\end_inset + +. + If you want to +\series bold +grab +\series default + records with the sequence ATCG or GCTA you can do this: +\end_layout + +\begin_layout LyX-Code +... + | grab --regex='ATCG|GCTA' +\end_layout + +\begin_layout Standard +Or if you want to find sequences beginning with ATCG: +\end_layout + +\begin_layout LyX-Code +... + | grab --regex='^ATCG' +\end_layout + +\begin_layout Standard +You can also use +\series bold +grab +\series default + to locate records that fulfill a numerical property using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +eval switch which takes an expression in three parts. + The first part is the key that holds the value we want to evaluate, the + second part holds one of the eight operators: +\end_layout + +\begin_layout Enumerate +Greater than: > +\end_layout + +\begin_layout Enumerate +Greater than or equal to: >= +\end_layout + +\begin_layout Enumerate +Less than: < +\end_layout + +\begin_layout Enumerate +Less than or equal to: <= +\end_layout + +\begin_layout Enumerate +Equal to: = +\end_layout + +\begin_layout Enumerate +Not equal to: != +\end_layout + +\begin_layout Enumerate +String wise equal to: eq +\end_layout + +\begin_layout Enumerate +String wise not equal to: ne +\end_layout + +\begin_layout Standard +And finally comes the number used in the evaluation. + So to +\series bold +grab +\series default + all records with a sequence length greater than 30: +\end_layout + +\begin_layout LyX-Code +... 
+ | length_seq | grab --eval='SEQ_LEN > 30' +\end_layout + +\begin_layout Standard +If you want to locate all records containing the pattern 'human' and where + the sequence length is greater than 30, you do this by running the stream + through +\series bold +grab +\series default + twice: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern='human' | length_seq | grab --eval='SEQ_LEN > 30' +\end_layout + +\begin_layout Standard +Finally, it is possible to do fast matching of expressions from a file using + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact switch. + Each of these expressions has to be matched exactly over the entire length, + which is useful if you have a file with accession numbers that you want + to locate in the stream: +\end_layout + +\begin_layout LyX-Code +... + | grab --exact acc_no.txt | ... +\end_layout + +\begin_layout Standard +Using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact is much faster than using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in, because with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact the expressions have to be complete matches, whereas +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in looks for subpatterns. +\end_layout + +\begin_layout Standard +NB! To get the best speed performance, use the most restrictive +\series bold +grab +\series default + first. +\end_layout + +\begin_layout Subsection +How to remove keys from records? +\end_layout + +\begin_layout Standard +To remove one or more specific keys from all records in the data stream + use +\series bold +remove_keys +\series default + like this: +\end_layout + +\begin_layout LyX-Code +... 
+ | remove_keys --keys=SEQ,SEQ_NAME +\end_layout + +\begin_layout Standard +In the above example SEQ and SEQ_NAME will be removed from all records if + they exist in these. + If all keys are removed from a record, then the record will be removed. +\end_layout + +\begin_layout Subsection +How to rename keys in records? +\end_layout + +\begin_layout Standard +Sometimes you want to rename a record key, +\emph on +e.g. + +\emph default + if you have read in a two column table with sequence name and sequence + in each column (see +\begin_inset LatexCommand ref +reference "sub:How-to-read-table" + +\end_inset + +) without specifying the key names, then the sequence name will be called + V0 and the sequence V1 as default in the +\series bold +read_tab +\series default + biopiece. + To rename the V0 and V1 keys we need to run the stream through +\series bold +rename_keys +\series default + twice (one for each key to rename): +\end_layout + +\begin_layout LyX-Code +... + | rename_keys --keys=V0,SEQ_NAME | rename_keys --keys=V1,SEQ +\end_layout + +\begin_layout Standard +The first instance of +\series bold +rename_keys +\series default + replaces all the V0 keys with SEQ_NAME, and the second instance of +\series bold +rename_keys +\series default + replaces all the V1 keys with SEQ. + +\emph on +Et voila +\emph default + the data can now be used in the biopieces that require these keys. +\end_layout + +\begin_layout Section +Manipulating Sequences +\end_layout + +\begin_layout Subsection +How to get sequence lengths? +\end_layout + +\begin_layout Standard +The length for sequences in records can be determined with +\series bold +length_seq +\series default +, which adds the key SEQ_LEN to each record with the sequence length as + the value. + It also generates an extra record that is emitted last with the key TOTAL_SEQ_L +EN showing the total length of all the sequences. 
+\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_seq +\end_layout + +\begin_layout Standard +It is also possible to determine the sequence length using the generic tool + +\series bold +length_vals +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-get-value_length" + +\end_inset + +, which determines the length of the values for a given list of keys: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_vals --keys=SEQ +\end_layout + +\begin_layout Standard +To obtain the total length of all sequences use +\series bold +sum_vals +\series default + like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_vals --keys=SEQ +\end_layout + +\begin_layout LyX-Code +| sum_vals --keys=SEQ_LEN +\end_layout + +\begin_layout Standard +The biopiece +\series bold +analyze_seq +\series default + will also determine the length of each sequence (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-analyze" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to analyze sequence composition? +\begin_inset LatexCommand label +name "sub:How-to-analyze" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you want to find out the sequence type, composition, length, as well + as GC content, indel content and proportions of soft and hard masked sequence, + then use +\series bold +analyze_seq +\series default +. + This handy biopiece will determine all these things per sequence from which + it is easy to get an overview using the +\series bold +write_tab +\series default + biopiece to output a table (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +). 
+ So in order to determine the sequence composition of a FASTA file with + just one entry containing the sequence 'ATCG' we just read the data with + +\series bold +read_fasta +\series default + and run the output through +\series bold +analyze_seq +\series default + which will add the analysis to the record like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq ... +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:D: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +MIX_INDEX: 0.55 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:W: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:G: 16 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SOFT_MASK%: 63.75 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:B: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:V: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +HARD_MASK%: 0.00 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:H: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:S: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:N: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:.: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +GC%: 35.00 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:A: 8 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:Y: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:M: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:T: 44 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ_TYPE: DNA +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:K: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:~: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ: 
TTTCAGTTTGGGACGGAGTAAGGCCTTCCtttttttttttttttttttttttttttttgagaccgagtcttgctc +tgtcg +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ_LEN: 80 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:R: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:C: 12 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:-: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:U: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Standard +Now to make a table of how many As, Ts, Cs, and Gs you can add the following: +\end_layout + +\begin_layout LyX-Code +... + | analyze_seq | write_tab --keys=RES:A,RES:T,RES:C,RES:G +\end_layout + +\begin_layout Standard +Or if you want to see the proportions of hard and soft masked sequence: +\end_layout + +\begin_layout LyX-Code +... + | analyze_seq | write_tab --keys=HARD_MASK%,SOFT_MASK% +\end_layout + +\begin_layout Standard +If you have a stack of sequences in one file and you want to determine the + mean GC content you can do it using the +\series bold +mean_vals +\series default + biopiece: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | mean_vals --keys=GC% +\end_layout + +\begin_layout Standard +Or if you want the total count of Ns you can use +\series bold +sum_vals +\series default + like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | sum_vals --keys=RES:N +\end_layout + +\begin_layout Standard +The MIX_INDEX key is calculated as the count of the most common residue + over the sequence length, and can be used as a cut-off for removing sequence + tags consisting of mostly one nucleotide: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | grab --eval='MIX_INDEX<0.85' +\end_layout + +\begin_layout Subsection +How to extract subsequences? 
+\begin_inset LatexCommand label +name "sub:How-to-extract" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to extract a subsequence from a longer sequence use the biopiece + extract_seq, which will replace the sequence in the record with the subsequence + (this behaviour should probably be modified to be dependent on a +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +replace or a +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_replace switch +\begin_inset Note Note +status collapsed + +\begin_layout Standard +also in split_seq +\end_layout + +\end_inset + +). + So to extract the first 20 residues from all sequences do (first residue + is designated 1): +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=1 --len=20 +\end_layout + +\begin_layout Standard +You can also specify a begin and end coordinate set: +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=20 --end=40 +\end_layout + +\begin_layout Standard +If you want the subsequences from position 20 to the sequence end do: +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=20 +\end_layout + +\begin_layout Standard +If you want to extract subsequences a given distance from the sequence end + you can do this by reversing the sequence with the biopiece +\series bold +reverse_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-reverse-seq" + +\end_inset + +, followed by +\series bold +extract_seq +\series default + to get the subsequence, and then +\series bold +reverse_seq +\series default + again to get the subsequence back in the original orientation: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | reverse_seq +\end_layout + +\begin_layout LyX-Code +| extract_seq --beg=10 --len=10 | reverse_seq +\end_layout + +\begin_layout Subsection +How to get genomic sequence? 
+\begin_inset LatexCommand label +name "sub:How-to-get-genomic-sequence" + +\end_inset + + +\end_layout + +\begin_layout Standard +The biopiece +\series bold +get_genome_seq +\series default + can extract subsequences for a given genome specified with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch explicitly using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +beg and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +end/ +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +len switches: +\end_layout + +\begin_layout LyX-Code +get_genome_seq --genome= --beg=1 --len=100 +\end_layout + +\begin_layout Standard +Alternatively, +\series bold +get_genome_seq +\series default + can be used to append the corresponding sequence to BED, PSL, and BLAST + records: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | get_genome_seq --genome= +\end_layout + +\begin_layout Standard +It is also possible to include flanking sequence using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +flank switch. + So to include 50 nucleotides upstream and 50 nucleotides downstream for + each BED entry do: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | get_genome_seq --genome= --flank=50 +\end_layout + +\begin_layout Subsection +How to upper-case sequences? +\end_layout + +\begin_layout Standard +Sequences can be shifted from lower case to upper case using +\series bold +uppercase_seq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | uppercase_seq +\end_layout + +\begin_layout Subsection +How to reverse sequences? 
+\begin_inset LatexCommand label +name "sub:How-to-reverse-seq" + +\end_inset + + +\end_layout + +\begin_layout Standard +The order of residues in a sequence can be reversed using reverse_seq: +\end_layout + +\begin_layout LyX-Code +... + | reverse_seq +\end_layout + +\begin_layout Standard +Note that in order to reverse/complement a sequence you also need the +\series bold +complement_seq +\series default + biopiece (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-complement" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to complement sequences? +\begin_inset LatexCommand label +name "sub:How-to-complement" + +\end_inset + + +\end_layout + +\begin_layout Standard +DNA and RNA sequences can be complemented with +\series bold +complement_seq +\series default +, which automagically determines the sequence type: +\end_layout + +\begin_layout LyX-Code +... + | complement_seq +\end_layout + +\begin_layout Standard +Note that in order to reverse/complement a sequence you also need the +\series bold +reverse_seq +\series default + biopiece (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-reverse-seq" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to remove indels from sequences? +\end_layout + +\begin_layout Standard +Indels can be removed from sequences with the +\series bold +remove_indels +\series default + biopiece. + This is useful if you have aligned some sequences (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-align" + +\end_inset + +) and extracted (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-extract" + +\end_inset + +) a block of subsequences from the alignment and you want to use these sequences + in a search where you need to remove the indels first. + '-', '~', and '.' are considered indels: +\end_layout + +\begin_layout LyX-Code +... + | remove_indels +\end_layout + +\begin_layout Subsection +How to shuffle sequences? 
+\end_layout + +\begin_layout Standard +All residues in sequences in the stream can be shuffled to random positions + using the +\series bold +shuffle_seq +\series default + biopiece: +\end_layout + +\begin_layout LyX-Code +... + | shuffle_seq +\end_layout + +\begin_layout Subsection +How to split sequences into overlapping subsequences? +\end_layout + +\begin_layout Standard +Sequences can be split into overlapping subsequences with the +\series bold +split_seq +\series default + biopiece. +\end_layout + +\begin_layout LyX-Code +... + | split_seq --word_size=20 --uniq +\end_layout + +\begin_layout Subsection +How to determine the oligo frequency? +\end_layout + +\begin_layout Standard +In order to determine if any oligo usage is over represented in one or more + sequences you can determine the frequency of oligos of a given size with + +\series bold +oligo_freq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 +\end_layout + +\begin_layout Standard +And if you have more than one sequence and want to accumulate the frequencies + you need the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +all switch: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 --all +\end_layout + +\begin_layout Standard +To get a meaningful result you need to write the resulting frequencies as + a table with +\series bold +write_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +), but first it is important to +\series bold +grab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +) the records with the frequencies to avoid full length sequences in the + table: +\end_layout + +\begin_layout LyX-Code +... 
+ | oligo_freq --word_size=4 --all | grab --pattern=OLIGO --keys_only +\end_layout + +\begin_layout LyX-Code +| write_tab --no_stream +\end_layout + +\begin_layout Standard +And the resulting frequency table can be sorted with Unix sort (man sort). +\end_layout + +\begin_layout Subsection +How to search for sequences in genomes? +\end_layout + +\begin_layout Standard +See the following biopiece: +\end_layout + +\begin_layout Itemize + +\series bold +patscan_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +blat_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +blast_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +vmatch_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to search sequences for a pattern? +\begin_inset LatexCommand label +name "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Standard +It is possible to search sequences in the data stream for patterns using + the +\series bold +patscan_seq +\series default + biopiece which utilizes the powerful scan_for_matches engine. + Consult the documentation for scan_for_matches in order to learn how to + define patterns (the documentation is included in Appendix\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sec:scan_for_matches-README" + +\end_inset + +). 
+\end_layout + +\begin_layout Standard +To search all sequences for a simple pattern consisting of the sequence + ATCGATCG allowing for 3 mismatches, 2 insertions and 1 deletion: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | patscan_seq --pattern='ATCGATCG[3,2,1]' +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch takes a comma separated list of patterns, so if you want + to search for more than one pattern do: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern='ATCGATCG[3,2,1],GCTAGCTA[3,2,1]' +\end_layout + +\begin_layout Standard +It is also possible to have a list of different patterns to search for in + a file with one pattern per line. + In order to get +\series bold +patscan_seq +\series default + to read these patterns use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern_in= +\end_layout + +\begin_layout Standard +To also scan the complementary strand in nucleotide sequences ( +\series bold +patscan_seq +\series default + automagically determines the sequence type) you need to add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comp switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern= --comp +\end_layout + +\begin_layout Standard +It is also possible to use +\series bold +patscan_seq +\series default + to output those records that do not contain a certain pattern by using + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +invert switch: +\end_layout + +\begin_layout LyX-Code +... 
+ | patscan_seq --pattern= --invert +\end_layout + +\begin_layout Standard +Finally, +\series bold +patscan_seq +\series default + can also scan for patterns in a given genome sequence, instead of sequences + in the stream, using the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch: +\end_layout + +\begin_layout LyX-Code +patscan_seq --pattern= --genome= +\end_layout + +\begin_layout Subsection +How to use BLAT for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Standard +Sequences in the data stream can be matched against supported genomes using + +\series bold +blat_seq +\series default + which is a biopiece using BLAT as the name might suggest. + Currently only Mouse and Human genomes are available and it is not possible + to use OOC files since there is still a need for a local repository for + genome files. + Otherwise it is just: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | blat_seq --genome= +\end_layout + +\begin_layout Standard +The search results can then be written to file with +\series bold +write_psl +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-PSL" + +\end_inset + +) or +\series bold +write_bed +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-BED" + +\end_inset + +) (although with +\series bold +write_bed +\series default + some information will be lost). 
+ It is also possible to plot chromosome distribution of the search results + using +\series bold +plot_chrdist +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-chrdist" + +\end_inset + +) or the distribution of the match lengths using +\series bold +plot_lendist +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-lendist" + +\end_inset + +) or a karyogram with the hits using +\series bold +plot_karyogram +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-karyogram" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to use BLAST for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Standard +Two biopieces exist for blasting sequences: +\series bold +create_blast_db +\series default + is used to create the BLAST database required for BLAST which is queried + using the biopiece +\series bold +blast_seq +\series default +. + So in order to create a BLAST database from sequences in the data stream + you simple run: +\end_layout + +\begin_layout LyX-Code +... + | create_blast_db --database=my_database --no_stream +\end_layout + +\begin_layout Standard +The type of sequence to use for the database is automagically determined + by +\series bold +create_blast_db +\series default +, but don't have a mixture of peptide and nucleic acids sequences in the + stream. + The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database switch takes a path as argument, but will default to 'blastdb_ if not set. +\end_layout + +\begin_layout Standard +The resulting database can now be queried with sequences in another data + stream using +\series bold +blast_seq +\series default +: +\end_layout + +\begin_layout LyX-Code +... 
+ | blast_seq --database=my_database +\end_layout + +\begin_layout Standard +Again, the sequence type is determined automagically and the appropriate + BLAST program is guessed (see below table), however, the program name can + be overruled with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +program switch. +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Standard +Subject sequence +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Query sequence +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Program guess +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastn +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastp +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastx +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +tblastn +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +Finally, it is also possible to use +\series bold +blast_seq +\series default + for blasting sequences agains a preformatted genome using 
the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch instead of the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database switch: +\end_layout + +\begin_layout LyX-Code +... + | blast_seq --genome= +\end_layout + +\begin_layout Subsection +How to use Vmatch for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Standard +The powerful suffix array software package Vmatch +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.vmatch.de/" + +\end_inset + + +\end_layout + +\end_inset + + can be used for exact mapping of sequences against indexed genomes using + the biopiece +\series bold +vmatch_seq +\series default +, which will e.g. + map 700000 ESTs to the human genome locating all 160 million hits in less than + an hour. + Only nucleotide sequences and sequences longer than 11 nucleotides will + be mapped. + It is recommended that sequences consisting of mostly one nucleotide type + are removed. + This can be done with the +\series bold +analyze_seq +\series default + biopiece +\begin_inset LatexCommand eqref +reference "sub:How-to-analyze" + +\end_inset + +. +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= +\end_layout + +\begin_layout Standard +It is also possible to allow for mismatches using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +hamming_dist switch. + So to allow for 2 mismatches: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=2 +\end_layout + +\begin_layout Standard +Or to allow for 10% mismatching nucleotides: +\end_layout + +\begin_layout LyX-Code +... 
+ | vmatch_seq --genome= --hamming_dist=10p +\end_layout + +\begin_layout Standard +To allow both indels and mismatches use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +edit_dist switch. + So to allow for one mismatch or one indel: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --edit_dist=1 +\end_layout + +\begin_layout Standard +Or to allow for 5% indels or mismatches: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --edit_dist=5p +\end_layout + +\begin_layout Standard +Note that using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +hamming_dist or +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +edit_dist slows down vmatch considerably --- use with care. +\end_layout + +\begin_layout Standard +The resulting SCORE key can be replaced to hold the number of genome matches + of a given sequence (multi-mappers) if the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +count switch is given. +\end_layout + +\begin_layout Subsection +How to find all matches between sequences? +\begin_inset LatexCommand label +name "sub:How-to-find-matches" + +\end_inset + + +\end_layout + +\begin_layout Standard +All matches between two sequences can be determined with the biopiece +\series bold +match_seq +\series default +. 
+ The match finding engine underneath the hood of +\series bold +match_seq +\series default + is the super fast suffix tree program MUMmer +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://mummer.sourceforge.net/" + +\end_inset + + +\end_layout + +\end_inset + +, which will locate all forward and reverse matches between huge sequences + in a matter of minutes (if the repeat count is not too high and if the + word size used is appropriate). + Matching two +\emph on +Helicobacter pylori +\emph default + genomes (1.7Mbp) takes around 10 seconds: +\end_layout + +\begin_layout LyX-Code +... + | match_seq --word_size=20 --direction=both +\end_layout + +\begin_layout Standard +The output from +\series bold +match_seq +\series default + can be used to generate a dot plot with +\series bold +plot_matches +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-generate-dotplot" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to align sequences? +\begin_inset LatexCommand label +name "sub:How-to-align" + +\end_inset + + +\end_layout + +\begin_layout Standard +Sequences in the stream can be aligned with the +\series bold +align_seq +\series default + biopiece that uses Muscle +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.drive5.com/muscle/muscle.html" + +\end_inset + + +\end_layout + +\end_inset + + as alignment engine. + Currently you cannot change any of the Muscle alignment parameters and + +\series bold +align_seq +\series default + will create an alignment based on the defaults (which are really good!): +\end_layout + +\begin_layout LyX-Code +... 
+ | align_seq +\end_layout + +\begin_layout Standard +The aligned output can be written to file in FASTA format using +\series bold +write_fasta +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-fasta" + +\end_inset + +) or in pretty text using +\series bold +write_align +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-alignment" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to create a weight matrix? +\end_layout + +\begin_layout Standard +If you want a weight matrix to show the sequence composition of a stack + of sequences you can use the biopiece create_weight_matrix: +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix +\end_layout + +\begin_layout Standard +The result can be output in percent using the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +percent switch: +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix --percent +\end_layout + +\begin_layout Standard +The weight matrix can be written as tabular output with +\series bold +write_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +) after removing the records containing SEQ with +\series bold +grab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +): +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix | grab --invert --keys=SEQ --keys_only +\end_layout + +\begin_layout LyX-Code +| write_tab --no_stream +\end_layout + +\begin_layout Standard +The V0 column will hold the residue, while the rest of the columns will + hold the frequencies for each sequence position. +\end_layout + +\begin_layout Section +Plotting +\end_layout + +\begin_layout Standard +There exist several biopieces for plotting. 
+ Some of these are based on GNUplot +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.gnuplot.info/" + +\end_inset + + +\end_layout + +\end_inset + +, which is an extremely powerful platform to generate all sorts of plots + and even though GNUplot has quite a steep learning curve, the biopieces + utilizing GNUplot are simple to use. + GNUplot is able to output a lot of different formats (called terminals + in GNUplot), but the biopieces focus on three formats only: +\end_layout + +\begin_layout Enumerate +The 'dumb' terminal is the default for the GNUplot based biopieces and will output + a plot in crude ASCII text (Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dumb-terminal" + +\end_inset + +). + This is quite nice for a quick and dirty plot to get an overview of your + data. +\end_layout + +\begin_layout Enumerate +The 'post' or 'postscript' terminal outputs postscript code which is publication + grade graphics that can be viewed with applications such as Ghostview, + Photoshop, and Preview. +\end_layout + +\begin_layout Enumerate +The 'svg' terminal outputs scalable vector graphics (SVG) which is a vector + based format. + SVG is great because you can edit the resulting plot using Photoshop or + Inkscape +\begin_inset Foot +status collapsed + +\begin_layout Standard +Inkscape is a really handy drawing program that is free and open source. + Available at +\begin_inset LatexCommand htmlurl +target "http://www.inkscape.org" + +\end_inset + + +\end_layout + +\end_inset + + if you want to add additional labels, captions, arrows, and so on and then + save the result in different formats, such as postscript without losing + resolution. +\end_layout + +\begin_layout Standard +The biopieces for plotting that are not based on GNUplot only output SVG + (that may change in the future). 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename lendist_ascii.png + lyxscale 70 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Dumb-terminal" + +\end_inset + +Dumb terminal +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +The output of a length distribution plot in the default 'dumb terminal' + to the terminal window. + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a histogram? +\begin_inset LatexCommand label +name "How-to-plot-histogram" + +\end_inset + + +\end_layout + +\begin_layout Standard +A generic histogram for a given value can be plotted with the biopiece +\series bold +plot_histogram +\series default + (Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Histogram" + +\end_inset + +): +\end_layout + +\begin_layout LyX-Code +... + | plot_histogram --key=TISSUE --no_stream +\end_layout + +\begin_layout Standard +(Figure missing) +\end_layout + +\begin_layout Standard +\noindent +\align left +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename histogram.png + lyxscale 70 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Histogram" + +\end_inset + +Histogram +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a length distribution? +\begin_inset LatexCommand label +name "sub:How-to-plot-lendist" + +\end_inset + + +\end_layout + +\begin_layout Standard +Plotting of length distributions, whether sequence lengths, pattern lengths, + hit lengths, +\emph on +etc. 
+ +\emph default + is a really handy thing and can be done with the biopiece +\series bold +plot_lendist +\series default +. + If you have a file with FASTA entries and want to plot the length distribution + you do it like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_seq +\end_layout + +\begin_layout LyX-Code +| plot_lendist --key=SEQ_LEN --no_stream +\end_layout + +\begin_layout Standard +The result will be written to the default dumb terminal and will look like + Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dumb-terminal" + +\end_inset + +. +\end_layout + +\begin_layout Standard +If you instead want the result in postscript format you can do: +\end_layout + +\begin_layout LyX-Code +... + | plot_lendist --key=SEQ_LEN --terminal=post --result_out=file.ps +\end_layout + +\begin_layout Standard +That will generate the plot and save it to file, but not interrupt the data + stream which can then be used in further analysis. + You can also save the plot implicitly using '>', however, it is then important + to terminate the stream with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch: +\end_layout + +\begin_layout LyX-Code +... + | plot_lendist --key=SEQ_LEN --terminal=post --no_stream > file.ps +\end_layout + +\begin_layout Standard +The resulting plot can be seen in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Length-distribution" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename lendist.ps + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Length-distribution" + +\end_inset + +Length distribution +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Length distribution of 630 piRNA like RNAs. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a chromosome distribution? +\begin_inset LatexCommand label +name "sub:How-to-plot-chrdist" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you have the result of a sequence search against a multi chromosome genome, + it is very practical to be able to plot the distribution of search hits + on the different chromosomes. + This can be done with +\series bold +plot_chrdist +\series default +: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | blat_seq | plot_chrdist --no_stream +\end_layout + +\begin_layout Standard +The above example will result in a crude plot using the 'dumb' terminal, + and if you want to mess around with the results from the BLAT search you + probably want to save the result to file first (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-PSL" + +\end_inset + +). + To plot the chromosome distribution from the saved search result you can + do: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in=file.bed | plot_chrdist --terminal=post --result_out=plot.ps +\end_layout + +\begin_layout Standard +That will result in the output shown in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Chromosome-distribution" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename chrdist.ps + lyxscale 50 + width 12cm + rotateAngle 90 + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Chromosome-distribution" + +\end_inset + +Chromosome distribution +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to generate a dotplot? +\begin_inset LatexCommand label +name "sub:How-to-generate-dotplot" + +\end_inset + + +\end_layout + +\begin_layout Standard +A dotplot is a powerful way to get an overview of the size and location + of sequence insertions, deletions, and duplications between two sequences. + Generating a dotplot with biopieces is a two step process where you initially + find all matches between two sequences using the tool +\series bold +match_seq +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-find-matches" + +\end_inset + +) and plot the resulting matches with +\series bold +plot_matches +\series default +. + Matching and plotting two +\emph on +Helicobacter pylori +\emph default + genomes (1.7Mbp) takes around 10 seconds: +\end_layout + +\begin_layout LyX-Code +... + | match_seq | plot_matches --terminal=post --result_out=plot.ps +\end_layout + +\begin_layout Standard +The resulting dotplot is in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dotplot" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename dotplot.ps + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Dotplot" + +\end_inset + +Dotplot +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Forward matches are displayed in green while reverse matches are displayed + in red. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a sequence logo? +\end_layout + +\begin_layout Standard +Sequence logos can be generated with +\series bold +plot_seqlogo +\series default +. + The sequence type is determined automagically and an entropy scale of 2 + bits and 4 bits is used for nucleotide and peptide sequences, respectively +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand htmlurl +target "http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html" + +\end_inset + + +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout LyX-Code +... + | plot_seqlogo --no_stream --result_out=seqlogo.svg +\end_layout + +\begin_layout Standard +An example of a sequence logo can be seen in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Sequence-logo" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename seqlogo.png + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Sequence-logo" + +\end_inset + +Sequence logo +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a karyogram? +\begin_inset LatexCommand label +name "sub:How-to-plot-karyogram" + +\end_inset + + +\end_layout + +\begin_layout Standard +To plot search hits on genomes use +\series bold +plot_karyogram +\series default +, which will output a nice karyogram in SVG graphics: +\end_layout + +\begin_layout LyX-Code +... + | plot_karyogram --result_out=karyogram.svg +\end_layout + +\begin_layout Standard +The banding data is taken from the UCSC genome browser database and currently + only Human and Mouse are supported. + Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Karyogram" + +\end_inset + + shows the distribution of piRNA like RNAs matched to the Human genome. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename karyogram.png + lyxscale 35 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Karyogram" + +\end_inset + +Karyogram +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Hits from a search of piRNA like RNAs in the Human genome are displayed as + short horizontal bars. 
+\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +Uploading Results +\end_layout + +\begin_layout Subsection +How do I display my results in the UCSC Genome Browser? +\end_layout + +\begin_layout Standard +Results from the list of biopieces below can be uploaded directly to a local + mirror of the UCSC Genome Browser using the biopiece +\series bold +upload_to_ucsc +\series default +: +\end_layout + +\begin_layout Itemize +patscan_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Itemize +blat_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Itemize +blast_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Itemize +vmatch_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Standard +The syntax for uploading data the most simple way requires two mandatory + switches: +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database, which is the UCSC database name (such as hg18, mm9, etc.) and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +table which should be the users initials followed by an underscore and a + short description of the data: +\end_layout + +\begin_layout LyX-Code +... 
+ | upload_to_ucsc --database=hg18 --table=mah_snoRNAs +\end_layout + +\begin_layout Standard +The +\series bold +upload_to_ucsc +\series default + biopiece modifies the users ~/ucsc/my_tracks.ra file automagically (a backup + is created with the name ~/ucsc/my_tracks.ra~) with default values that + can be overridden using the following switches: +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +short_label - Short label for track - Default=database->table +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +long_label - Long label for track - Default=database->table +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +group - Track group name - Default= +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +priority - Track display priority - Default=1 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +color - Track color - Default=147,73,42 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +chunk_size - Chunks for loading - Default=10000000 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +visibility - Track visibility - Default=pack +\end_layout + +\begin_layout Standard +Also, data in BED or PSL format can be uploaded with +\series bold +upload_to_ucsc +\series default + as long as these reference to genomes and chromosomes existing in the UCSC + Genome Browser: +\end_layout + 
+\begin_layout LyX-Code +read_bed --data_in= | upload_to_ucsc ... +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +read_psl --data_in= | upload_to_ucsc ... +\end_layout + +\begin_layout Section +Power Scripting +\end_layout + +\begin_layout Standard +It is possible to do commandline scripting of biopiece records using Perl. + Because a biopiece record essentially is a hash structure, you can pass + records to the +\series bold +bioscript +\series default + command, which is a wrapper around the Perl executable that allows direct + manipulations of the records using the power of Perl. +\end_layout + +\begin_layout Standard +In the below example we replace in all records the value of the CHR key + with a running number: +\end_layout + +\begin_layout LyX-Code +... + | bioscript 'while($r=get_record( +\backslash +*STDIN)){$r->{CHR}=$i++; put_record($r)}' +\end_layout + +\begin_layout Standard +Something more useful would probably be to create custom FASTA headers. + E.g. + if we read in a BED file, lookup the genomic sequence, create a custom + FASTA header with +\series bold +bioscript +\series default + and output FASTA entries: +\end_layout + +\begin_layout LyX-Code +... 
+ | bioscript 'while($r=get_record( +\backslash +*STDIN)){$r->{SEQ_NAME}= // +\end_layout + +\begin_layout LyX-Code +join("_",$r->{CHR},$r->{CHR_BEG},$r->{CHR_END}); put_record($r)}' +\end_layout + +\begin_layout Standard +And the output: +\end_layout + +\begin_layout LyX-Code +>chr2L_21567527_21567550 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_693380_693403 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_13859534_13859557 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_9005090_9005113 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_2106825_2106848 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_14649031_14649054 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout Section +Trouble shooting +\end_layout + +\begin_layout Standard +Shoot the messenger! 
+\end_layout + +\begin_layout Section +\start_of_appendix +Keys +\begin_inset LatexCommand label +name "sec:Keys" + +\end_inset + + +\end_layout + +\begin_layout Standard +HIT +\end_layout + +\begin_layout Standard +HIT_BEG +\end_layout + +\begin_layout Standard +HIT_END +\end_layout + +\begin_layout Standard +HIT_LEN +\end_layout + +\begin_layout Standard +HIT_NAME +\end_layout + +\begin_layout Standard +PATTERN +\end_layout + +\begin_layout Section +Switches +\begin_inset LatexCommand label +name "sec:Switches" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_in +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +data_in +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num +\end_layout + +\begin_layout Section +scan_for_matches README +\begin_inset LatexCommand label +name "sec:scan_for_matches-README" + +\end_inset + + +\end_layout + +\begin_layout LyX-Code + scan_for_matches: +\end_layout + +\begin_layout LyX-Code + A Program to Scan Nucleotide or Protein Sequences for Matching Patterns +\end_layout + +\begin_layout LyX-Code + Ross Overbeek +\end_layout + +\begin_layout LyX-Code + MCS +\end_layout + +\begin_layout LyX-Code + Argonne National Laboratory 
+\end_layout + +\begin_layout LyX-Code + Argonne, IL 60439 +\end_layout + +\begin_layout LyX-Code + USA +\end_layout + +\begin_layout LyX-Code +Scan_for_matches is a utility that we have written to search for +\end_layout + +\begin_layout LyX-Code +patterns in DNA and protein sequences. + I wrote most of the code, +\end_layout + +\begin_layout LyX-Code +although David Joerg and Morgan Price wrote sections of an +\end_layout + +\begin_layout LyX-Code +earlier version. + The whole notion of pattern matching has a rich +\end_layout + +\begin_layout LyX-Code +history, and we borrowed liberally from many sources. + However, it is +\end_layout + +\begin_layout LyX-Code +worth noting that we were strongly influenced by the elegant tools +\end_layout + +\begin_layout LyX-Code +developed and distributed by David Searls. + My intent is to make the +\end_layout + +\begin_layout LyX-Code +existing tool available to anyone in the research community that might +\end_layout + +\begin_layout LyX-Code +find it useful. + I will continue to try to fix bugs and make suggested +\end_layout + +\begin_layout LyX-Code +enhancements, at least until I feel that a superior tool exists. +\end_layout + +\begin_layout LyX-Code +Hence, I would appreciate it if all bug reports and suggestions are +\end_layout + +\begin_layout LyX-Code +directed to me at Overbeek@mcs.anl.gov. + +\end_layout + +\begin_layout LyX-Code +I will try to log all bug fixes and report them to users that send me +\end_layout + +\begin_layout LyX-Code +their email addresses. + I do not require that you give me your name +\end_layout + +\begin_layout LyX-Code +and address. + However, if you do give it to me, I will try to notify +\end_layout + +\begin_layout LyX-Code +you of serious problems as they are discovered. 
+\end_layout + +\begin_layout LyX-Code +Getting Started: +\end_layout + +\begin_layout LyX-Code + The distribution should contain at least the following programs: +\end_layout + +\begin_layout LyX-Code + README - This document +\end_layout + +\begin_layout LyX-Code + ggpunit.c - One of the two source files +\end_layout + +\begin_layout LyX-Code + scan_for_matches.c - The second source file +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + run_tests - A perl script to test things +\end_layout + +\begin_layout LyX-Code + show_hits - A handy perl script +\end_layout + +\begin_layout LyX-Code + test_dna_input - Test sequences for DNA +\end_layout + +\begin_layout LyX-Code + test_dna_patterns - Test patterns for DNA scan +\end_layout + +\begin_layout LyX-Code + test_output - Desired output from test +\end_layout + +\begin_layout LyX-Code + test_prot_input - Test protein sequences +\end_layout + +\begin_layout LyX-Code + test_prot_patterns - Test patterns for proteins +\end_layout + +\begin_layout LyX-Code + testit - a perl script used for test +\end_layout + +\begin_layout LyX-Code + Only the first three files are required. + The others are useful, +\end_layout + +\begin_layout LyX-Code + but only if you have Perl installed on your system. + If you do +\end_layout + +\begin_layout LyX-Code + have Perl, I suggest that you type +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + which perl +\end_layout + +\begin_layout LyX-Code + to find out where it installed. + On my system, I get the following +\end_layout + +\begin_layout LyX-Code + response: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + clone% which perl +\end_layout + +\begin_layout LyX-Code + /usr/local/bin/perl +\end_layout + +\begin_layout LyX-Code + indicating that Perl is installed in /usr/local/bin. 
+ Anyway, once +\end_layout + +\begin_layout LyX-Code + you know where it is installed, edit the first line of files +\end_layout + +\begin_layout LyX-Code + testit +\end_layout + +\begin_layout LyX-Code + show_hits +\end_layout + +\begin_layout LyX-Code + replacing /usr/local/bin/perl with the appropriate location. + I +\end_layout + +\begin_layout LyX-Code + will assume that you can do this, although it is not critical (it +\end_layout + +\begin_layout LyX-Code + is needed only to test the installation and to use the "show_hits" +\end_layout + +\begin_layout LyX-Code + utility). + Perl is not required to actually install and run +\end_layout + +\begin_layout LyX-Code + scan_for_matches. + +\end_layout + +\begin_layout LyX-Code + If you do not have Perl, I suggest you get it and install it (it +\end_layout + +\begin_layout LyX-Code + is a wonderful utility). + Information about Perl and how to get it +\end_layout + +\begin_layout LyX-Code + can be found in the book "Programming Perl" by Larry Wall and +\end_layout + +\begin_layout LyX-Code + Randall L. + Schwartz, published by O'Reilly & Associates, Inc. +\end_layout + +\begin_layout LyX-Code + To get started, you will need to compile the program. + I do this +\end_layout + +\begin_layout LyX-Code + using +\end_layout + +\begin_layout LyX-Code + gcc -O -o scan_for_matches ggpunit.c scan_for_matches.c +\end_layout + +\begin_layout LyX-Code + If you do not use GNU C, use +\end_layout + +\begin_layout LyX-Code + cc -O -DCC -o scan_for_matches ggpunit.c scan_for_matches.c +\end_layout + +\begin_layout LyX-Code + which works on my Sun. 
+ +\end_layout + +\begin_layout LyX-Code + Once you have compiled scan_for_matches, you can verify that it +\end_layout + +\begin_layout LyX-Code + works with +\end_layout + +\begin_layout LyX-Code + clone% run_tests tmp +\end_layout + +\begin_layout LyX-Code + clone% diff tmp test_output +\end_layout + +\begin_layout LyX-Code + You may get a few strange lines of the sort +\end_layout + +\begin_layout LyX-Code + clone% run_tests tmp +\end_layout + +\begin_layout LyX-Code + rm: tmp: No such file or directory +\end_layout + +\begin_layout LyX-Code + clone% diff tmp test_output +\end_layout + +\begin_layout LyX-Code + These should cause no concern. + However, if the "diff" shows that +\end_layout + +\begin_layout LyX-Code + tmp and test_output are different, contact me (you have a +\end_layout + +\begin_layout LyX-Code + problem). + +\end_layout + +\begin_layout LyX-Code + You should now be able to use scan_for_matches by following the +\end_layout + +\begin_layout LyX-Code + instructions given below (which is all the normal user should have +\end_layout + +\begin_layout LyX-Code + to understand, once things are installed properly). +\end_layout + +\begin_layout LyX-Code + ============================================================== +\end_layout + +\begin_layout LyX-Code +How to run scan_for_matches: +\end_layout + +\begin_layout LyX-Code + To run the program, you need to create two files +\end_layout + +\begin_layout LyX-Code + 1. + the first file contains the pattern you wish to scan for; I'll +\end_layout + +\begin_layout LyX-Code + call this file pat_file in what follows (but any name is ok) +\end_layout + +\begin_layout LyX-Code + 2. + the second file contains a set of sequences to scan. + These +\end_layout + +\begin_layout LyX-Code + should be in "fasta format". + Just look at the contents of +\end_layout + +\begin_layout LyX-Code + test_dna_input to see examples of this format. 
+ Basically, +\end_layout + +\begin_layout LyX-Code + each sequence begins with a line of the form +\end_layout + +\begin_layout LyX-Code + >sequence_id +\end_layout + +\begin_layout LyX-Code + and is followed by one or more lines containing the sequence. +\end_layout + +\begin_layout LyX-Code + Once these files have been created, you just use +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file < input_file +\end_layout + +\begin_layout LyX-Code + to scan all of the input sequences for the given pattern. + As an +\end_layout + +\begin_layout LyX-Code + example, suppose that pat_file contains a single line of the form +\end_layout + +\begin_layout LyX-Code + p1=4...7 3...8 ~p1 +\end_layout + +\begin_layout LyX-Code + Then, +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + should produce two "hits". + When I run this on my machine, I get +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + >tst1:[6,27] +\end_layout + +\begin_layout LyX-Code + cguaacc ggttaacc gguuacg +\end_layout + +\begin_layout LyX-Code + >tst2:[6,27] +\end_layout + +\begin_layout LyX-Code + CGUAACC GGTTAACC GGUUACG +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code +Simple Patterns Built by Matching Ranges and Reverse Complements +\end_layout + +\begin_layout LyX-Code + Let me first explain this simple pattern: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + p1=4...7 3...8 ~p1 +\end_layout + +\begin_layout LyX-Code + The pattern consists of three "pattern units" separated by spaces. +\end_layout + +\begin_layout LyX-Code + The first pattern unit is +\end_layout + +\begin_layout LyX-Code + p1=4...7 +\end_layout + +\begin_layout LyX-Code + which means "match 4 to 7 characters and call them p1". 
+ The +\end_layout + +\begin_layout LyX-Code + second pattern unit is +\end_layout + +\begin_layout LyX-Code + 3...8 +\end_layout + +\begin_layout LyX-Code + which means "then match 3 to 8 characters". + The last pattern unit +\end_layout + +\begin_layout LyX-Code + is +\end_layout + +\begin_layout LyX-Code + ~p1 +\end_layout + +\begin_layout LyX-Code + which means "match the reverse complement of p1". + The first +\end_layout + +\begin_layout LyX-Code + reported hit is shown as +\end_layout + +\begin_layout LyX-Code + >tst1:[6,27] +\end_layout + +\begin_layout LyX-Code + cguaacc ggttaacc gguuacg +\end_layout + +\begin_layout LyX-Code + which states that characters 6 through 27 of sequence tst1 were +\end_layout + +\begin_layout LyX-Code + matched. + "cguaacc" matched the first pattern unit, "ggttaacc" the +\end_layout + +\begin_layout LyX-Code + second, and "gguuacg" the third. + This is an example of a common +\end_layout + +\begin_layout LyX-Code + type of pattern used to search for sections of DNA or RNA that +\end_layout + +\begin_layout LyX-Code + would fold into a hairpin loop. +\end_layout + +\begin_layout LyX-Code +Searching Both Strands +\end_layout + +\begin_layout LyX-Code + Now for a short aside: scan_for_matches only searched the +\end_layout + +\begin_layout LyX-Code + sequences in the input file; it did not search the opposite +\end_layout + +\begin_layout LyX-Code + strand. + With a pattern of the sort we just used, there is no +\end_layout + +\begin_layout LyX-Code + need to search the opposite strand. + However, it is normally the +\end_layout + +\begin_layout LyX-Code + case that you will wish to search both the sequence and the +\end_layout + +\begin_layout LyX-Code + opposite strand (i.e., the reverse complement of the sequence). +\end_layout + +\begin_layout LyX-Code + To do that, you would just use the "-c" command line. 
+ For example, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + Hits on the opposite strand will show a beginning location greater +\end_layout + +\begin_layout LyX-Code + than the end location of the match. +\end_layout + +\begin_layout LyX-Code +Defining Pairing Rules and Allowing Mismatches, Insertions, and Deletions +\end_layout + +\begin_layout LyX-Code + Let us stop now and ask "What additional features would one need to +\end_layout + +\begin_layout LyX-Code + really find the kinds of loop structures that characterize tRNAs, +\end_layout + +\begin_layout LyX-Code + rRNAs, and so forth?" I can immediately think of two: +\end_layout + +\begin_layout LyX-Code + a) you will need to be able to allow non-standard pairings +\end_layout + +\begin_layout LyX-Code + (those other than G-C and A-U), and +\end_layout + +\begin_layout LyX-Code + b) you will need to be able to tolerate some number of +\end_layout + +\begin_layout LyX-Code + mismatches and bulges. +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + Let me first show you how to handle non-standard "rules for +\end_layout + +\begin_layout LyX-Code + pairing in reverse complements". 
+ Consider the following pattern, +\end_layout + +\begin_layout LyX-Code + which I show as two lines (you may use as many lines as you like in +\end_layout + +\begin_layout LyX-Code + forming a pattern, although you can only break a pattern at points +\end_layout + +\begin_layout LyX-Code + where space would be legal): +\end_layout + +\begin_layout LyX-Code + r1={au,ua,gc,cg,gu,ug,ga,ag} +\end_layout + +\begin_layout LyX-Code + p1=2...3 0...4 p2=2...5 1...5 r1~p2 0...4 ~p1 +\end_layout + +\begin_layout LyX-Code + The first "pattern unit" does not actually match anything; rather, +\end_layout + +\begin_layout LyX-Code + it defines a "pairing rule" in which standard pairings are +\end_layout + +\begin_layout LyX-Code + allowed, as well as G-A and A-G (in case you wondered, Us and Ts +\end_layout + +\begin_layout LyX-Code + and upper and lower case can be used interchangeably; for example +\end_layout + +\begin_layout LyX-Code + r1={AT,UA,gc,cg} could be used to define the "standard rule" for +\end_layout + +\begin_layout LyX-Code + pairings). + The second line consists of six pattern units which +\end_layout + +\begin_layout LyX-Code + may be interpreted as follows: +\end_layout + +\begin_layout LyX-Code + p1=2...3 match 2 or 3 characters (call it p1) +\end_layout + +\begin_layout LyX-Code + 0...4 match 0 to 4 characters +\end_layout + +\begin_layout LyX-Code + p2=2...5 match 2 to 5 characters (call it p2) +\end_layout + +\begin_layout LyX-Code + 1...5 match 1 to 5 characters +\end_layout + +\begin_layout LyX-Code + r1~p2 match the reverse complement of p2, +\end_layout + +\begin_layout LyX-Code + allowing G-A and A-G pairs +\end_layout + +\begin_layout LyX-Code + 0...4 match 0 to 4 characters +\end_layout + +\begin_layout LyX-Code + ~p1 match the reverse complement of p1 +\end_layout + +\begin_layout LyX-Code + allowing only G-C, C-G, A-T, and T-A pairs +\end_layout + +\begin_layout LyX-Code + Thus, r1~p2 means "match the reverse complement of p2 using rule r1". 
+\end_layout + +\begin_layout LyX-Code + Now let us consider the issue of tolerating mismatches and bulges. +\end_layout + +\begin_layout LyX-Code + You may add a "qualifier" to the pattern unit that gives the +\end_layout + +\begin_layout LyX-Code + tolerable number of "mismatches, deletions, and insertions". +\end_layout + +\begin_layout LyX-Code + Thus, +\end_layout + +\begin_layout LyX-Code + p1=10...10 3...8 ~p1[1,2,1] +\end_layout + +\begin_layout LyX-Code + means that the third pattern unit must match 10 characters, +\end_layout + +\begin_layout LyX-Code + allowing one "mismatch" (a pairing other than G-C, C-G, A-T, or +\end_layout + +\begin_layout LyX-Code + T-A), two deletions (a deletion is a character that occurs in p1, +\end_layout + +\begin_layout LyX-Code + but has been "deleted" from the string matched by ~p1), and one +\end_layout + +\begin_layout LyX-Code + insertion (an "insertion" is a character that occurs in the string +\end_layout + +\begin_layout LyX-Code + matched by ~p1, but for which no corresponding character +\end_layout + +\begin_layout LyX-Code + occurs in p1). + In this case, the pattern would match +\end_layout + +\begin_layout LyX-Code + ACGTACGTAC GGGGGGGG GCGTTACCT +\end_layout + +\begin_layout LyX-Code + which is, you must admit, a fairly weak loop. + It is common to +\end_layout + +\begin_layout LyX-Code + allow mismatches, but you will find yourself using insertions and +\end_layout + +\begin_layout LyX-Code + deletions much more rarely. + In any event, you should note that +\end_layout + +\begin_layout LyX-Code + allowing mismatches, insertions, and deletions does force the +\end_layout + +\begin_layout LyX-Code + program to try many additional possible pairings, so it does slow +\end_layout + +\begin_layout LyX-Code + things down a bit. 
+\end_layout + +\begin_layout LyX-Code +How Patterns Are Matched +\end_layout + +\begin_layout LyX-Code + Now is as good a time as any to discuss the basic flow of control +\end_layout + +\begin_layout LyX-Code + when matching patterns. + Recall that a "pattern" is a sequence of +\end_layout + +\begin_layout LyX-Code + "pattern units". + Suppose that the pattern units were +\end_layout + +\begin_layout LyX-Code + u1 u2 u3 u4 ... + un +\end_layout + +\begin_layout LyX-Code + The scan of a sequence S begins by setting the current position +\end_layout + +\begin_layout LyX-Code + to 1. + Then, an attempt is made to match u1 starting at the +\end_layout + +\begin_layout LyX-Code + current position. + Each attempt to match a pattern unit can +\end_layout + +\begin_layout LyX-Code + succeed or fail. + If it succeeds, then an attempt is made to match +\end_layout + +\begin_layout LyX-Code + the next unit. + If it fails, then an attempt is made to find an +\end_layout + +\begin_layout LyX-Code + alternative match for the immediately preceding pattern unit. + If +\end_layout + +\begin_layout LyX-Code + this succeeds, then we proceed forward again to the next unit. + If +\end_layout + +\begin_layout LyX-Code + it fails we go back to the preceding unit. + This process is called +\end_layout + +\begin_layout LyX-Code + "backtracking". + If there are no previous units, then the current +\end_layout + +\begin_layout LyX-Code + position is incremented by one, and everything starts again. + This +\end_layout + +\begin_layout LyX-Code + proceeds until either the current position goes past the end of +\end_layout + +\begin_layout LyX-Code + the sequence or all of the pattern units succeed. + On success, +\end_layout + +\begin_layout LyX-Code + scan_for_matches reports the "hit", the current position is set +\end_layout + +\begin_layout LyX-Code + just past the hit, and an attempt is made to find another hit. 
+\end_layout + +\begin_layout LyX-Code + If you wish to limit the scan to simply finding a maximum of, say, +\end_layout + +\begin_layout LyX-Code + 10 hits, you can use the -n option (-n 10 would set the limit to +\end_layout + +\begin_layout LyX-Code + 10 reported hits). + For example, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c -n 1 pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + would search for just the first hit (and would stop searching the +\end_layout + +\begin_layout LyX-Code + current sequences or any that follow in the input file). +\end_layout + +\begin_layout LyX-Code +Searching for repeats: +\end_layout + +\begin_layout LyX-Code + In the last section, I discussed almost all of the details +\end_layout + +\begin_layout LyX-Code + required to allow you to look for repeats. + Consider the following +\end_layout + +\begin_layout LyX-Code + set of patterns: +\end_layout + +\begin_layout LyX-Code + p1=6...6 3...8 p1 (find exact 6 character repeat separated +\end_layout + +\begin_layout LyX-Code + by 3 to 8 characters) +\end_layout + +\begin_layout LyX-Code + p1=6...6 3...8 p1[1,0,0] (allow one mismatch) +\end_layout + +\begin_layout LyX-Code + p1=3...3 p1[1,0,0] p1[1,0,0] p1[1,0,0] +\end_layout + +\begin_layout LyX-Code + (match 12 characters that are the remains +\end_layout + +\begin_layout LyX-Code + of a 3-character sequence occurring 4 times) +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + p1=4...8 0...3 p2=6...8 p1 0...3 p2 +\end_layout + +\begin_layout LyX-Code + (This would match things like +\end_layout + +\begin_layout LyX-Code + ATCT G TCTTT ATCT TG TCTTT +\end_layout + +\begin_layout LyX-Code + ) +\end_layout + +\begin_layout LyX-Code +Searching for particular sequences: +\end_layout + +\begin_layout LyX-Code + Occasionally, one wishes to match a specific, known sequence. 
+\end_layout + +\begin_layout LyX-Code + In such a case, you can just give the sequence (along with an +\end_layout + +\begin_layout LyX-Code + optional statement of the allowable mismatches, insertions, and +\end_layout + +\begin_layout LyX-Code + deletions). + Thus, +\end_layout + +\begin_layout LyX-Code + p1=6...8 GAGA ~p1 (match a hairpin with GAGA as the loop) +\end_layout + +\begin_layout LyX-Code + RRRRYYYY (match 4 purines followed by 4 pyrimidines) +\end_layout + +\begin_layout LyX-Code + TATAA[1,0,0] (match TATAA, allowing 1 mismatch) +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +Matches against a "weight matrix": +\end_layout + +\begin_layout LyX-Code + I will conclude my examples of the types of pattern units +\end_layout + +\begin_layout LyX-Code + available for matching against nucleotide sequences by discussing a +\end_layout + +\begin_layout LyX-Code + crude implementation of matching using a "weight matrix". + While I +\end_layout + +\begin_layout LyX-Code + am less than overwhelmed with the syntax that I chose, I think that +\end_layout + +\begin_layout LyX-Code + the reader should be aware that I was thinking of generating +\end_layout + +\begin_layout LyX-Code + patterns containing such pattern units automatically from +\end_layout + +\begin_layout LyX-Code + alignments (and did not really plan on typing such things in by +\end_layout + +\begin_layout LyX-Code + hand very often). + Anyway, suppose that you wanted to match a +\end_layout + +\begin_layout LyX-Code + sequence of eight characters. + The "consensus" of these eight +\end_layout + +\begin_layout LyX-Code + characters is GRCACCGS, but the actual "frequencies of occurrence" +\end_layout + +\begin_layout LyX-Code + are given in the matrix below. + Thus, the first character is an A +\end_layout + +\begin_layout LyX-Code + 16% of the time and a G 84% of the time.
+ The second is an A 57% of +\end_layout + +\begin_layout LyX-Code + the time, a C 10% of the time, a G 29% of the time, and a T 4% of +\end_layout + +\begin_layout LyX-Code + the time. + +\end_layout + +\begin_layout LyX-Code + C1 C2 C3 C4 C5 C6 C7 C8 +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + A 16 57 0 95 0 18 0 0 +\end_layout + +\begin_layout LyX-Code + C 0 10 80 0 100 60 0 50 +\end_layout + +\begin_layout LyX-Code + G 84 29 0 0 0 20 100 50 +\end_layout + +\begin_layout LyX-Code + T 0 4 20 5 0 2 0 0 +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + One could use the following pattern unit to search for inexact +\end_layout + +\begin_layout LyX-Code + matches related to such a "weight matrix": +\end_layout + +\begin_layout LyX-Code + {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), +\end_layout + +\begin_layout LyX-Code + (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450 +\end_layout + +\begin_layout LyX-Code + This pattern unit will attempt to match exactly eight characters. +\end_layout + +\begin_layout LyX-Code + For each character in the sequence, the entry in the corresponding +\end_layout + +\begin_layout LyX-Code + tuple is added to an accumulated sum. + If the sum is greater than +\end_layout + +\begin_layout LyX-Code + 450, the match succeeds; else it fails. +\end_layout + +\begin_layout LyX-Code + Recently, this feature was upgraded to allow ranges. + Thus, +\end_layout + +\begin_layout LyX-Code + 600 > {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), +\end_layout + +\begin_layout LyX-Code + (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450 +\end_layout + +\begin_layout LyX-Code + will work, as well. +\end_layout + +\begin_layout LyX-Code +Allowing Alternatives: +\end_layout + +\begin_layout LyX-Code + Very occasionally, you may wish to allow alternative pattern units +\end_layout + +\begin_layout LyX-Code + (i.e., "match either A or B"). 
+ You can do this using something +\end_layout + +\begin_layout LyX-Code + like +\end_layout + +\begin_layout LyX-Code + ( GAGA | GCGCA) +\end_layout + +\begin_layout LyX-Code + which says "match either GAGA or GCGCA". + You may take +\end_layout + +\begin_layout LyX-Code + alternatives of a list of pattern units, for example +\end_layout + +\begin_layout LyX-Code + (p1=3...3 3...8 ~p1 | p1=5...5 4...4 ~p1 GGG) +\end_layout + +\begin_layout LyX-Code + would match one of two sequences of pattern units. + There is one +\end_layout + +\begin_layout LyX-Code + clumsy aspect of the syntax: to match a list of alternatives, you +\end_layout + +\begin_layout LyX-Code + need to fully parenthesize the request. + Thus, +\end_layout + +\begin_layout LyX-Code + (GAGA | (GCGCA | TTCGA)) +\end_layout + +\begin_layout LyX-Code + would be needed to try the three alternatives. +\end_layout + +\begin_layout LyX-Code +One Minor Extension +\end_layout + +\begin_layout LyX-Code + Sometimes a pattern will contain a sequence of distinct ranges, +\end_layout + +\begin_layout LyX-Code + and you might wish to limit the sum of the lengths of the matched +\end_layout + +\begin_layout LyX-Code + subsequences. + For example, suppose that you basically wanted to +\end_layout + +\begin_layout LyX-Code + match something like +\end_layout + +\begin_layout LyX-Code + ARRYYTT p1=0...5 GCA[1,0,0] p2=1...6 ~p1 4...8 ~p2 p3=4...10 CCT +\end_layout + +\begin_layout LyX-Code + but that the sum of the lengths of p1, p2, and p3 must not exceed +\end_layout + +\begin_layout LyX-Code + eight characters. + To do this, you could add +\end_layout + +\begin_layout LyX-Code + length(p1+p2+p3) < 9 +\end_layout + +\begin_layout LyX-Code + as the last pattern unit. + It will just succeed or fail (but does +\end_layout + +\begin_layout LyX-Code + not actually match any characters in the sequence).
+\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +Matching Protein Sequences +\end_layout + +\begin_layout LyX-Code + Suppose that the input file contains protein sequences. + In this +\end_layout + +\begin_layout LyX-Code + case, you must invoke scan_for_matches with the "-p" option. + You +\end_layout + +\begin_layout LyX-Code + cannot use aspects of the language that relate directly to +\end_layout + +\begin_layout LyX-Code + nucleotide sequences (e.g., the -c command line option or pattern +\end_layout + +\begin_layout LyX-Code + constructs referring to the reverse complement of a previously +\end_layout + +\begin_layout LyX-Code + matched unit). + +\end_layout + +\begin_layout LyX-Code + You also have two additional constructs that allow you to match +\end_layout + +\begin_layout LyX-Code + either "one of a set of amino acids" or "any amino acid other than +\end_layout + +\begin_layout LyX-Code + those in a given set". + For example, +\end_layout + +\begin_layout LyX-Code + p1=0...4 any(HQD) 1...3 notany(HK) p1 +\end_layout + +\begin_layout LyX-Code + would successfully match a string like +\end_layout + +\begin_layout LyX-Code + YWV D AA C YWV +\end_layout + +\begin_layout LyX-Code +Using the show_hits Utility +\end_layout + +\begin_layout LyX-Code + When viewing a large set of complex matches, you might find it +\end_layout + +\begin_layout LyX-Code + convenient to post-process the scan_for_matches output to get a +\end_layout + +\begin_layout LyX-Code + more readable version. + We provide a simple post-processor called +\end_layout + +\begin_layout LyX-Code + "show_hits".
+ To see its effect, just pipe the output of a +\end_layout + +\begin_layout LyX-Code + scan_for_matches into show_hits: +\end_layout + +\begin_layout LyX-Code + Normal Output: +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp +\end_layout + +\begin_layout LyX-Code + >tst1:[1,28] +\end_layout + +\begin_layout LyX-Code + gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + >tst1:[28,1] +\end_layout + +\begin_layout LyX-Code + gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + >tst2:[2,31] +\end_layout + +\begin_layout LyX-Code + CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + >tst2:[31,2] +\end_layout + +\begin_layout LyX-Code + CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + >tst3:[3,32] +\end_layout + +\begin_layout LyX-Code + gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + >tst3:[32,3] +\end_layout + +\begin_layout LyX-Code + gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + Piped Through show_hits: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp | show_hits +\end_layout + +\begin_layout LyX-Code + tst1:[1,28]: gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[28,1]: gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst2:[2,31]: CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + tst2:[31,2]: CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + tst3:[3,32]: gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst3:[32,3]: gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code + Optionally, you can specify which of the "fields" in the matches +\end_layout + +\begin_layout LyX-Code + you wish to sort on, and show_hits will sort them. 
+ The field +\end_layout + +\begin_layout LyX-Code + numbers start with 0. + So, you might get something like +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp | show_hits 2 1 +\end_layout + +\begin_layout LyX-Code + tst2:[2,31]: CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + tst2:[31,2]: CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + tst3:[32,3]: gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[1,28]: gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[28,1]: gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst3:[3,32]: gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code + In this case, the hits have been sorted on fields 2 and 1 (that is, +\end_layout + +\begin_layout LyX-Code + the third and second matched subfields). +\end_layout + +\begin_layout LyX-Code + show_hits is just one possible little post-processor, and you +\end_layout + +\begin_layout LyX-Code + might well wish to write a customized one for yourself. +\end_layout + +\begin_layout LyX-Code +Reducing the Cost of a Search +\end_layout + +\begin_layout LyX-Code + The scan_for_matches utility uses a fairly simple search, and may +\end_layout + +\begin_layout LyX-Code + consume large amounts of CPU time for complex patterns. + Someday, +\end_layout + +\begin_layout LyX-Code + I may decide to optimize the code. + However, until then, let me +\end_layout + +\begin_layout LyX-Code + mention one useful technique. + +\end_layout + +\begin_layout LyX-Code + When you have a complex pattern that includes a number of varying +\end_layout + +\begin_layout LyX-Code + ranges, imprecise matches, and so forth, it is useful to +\end_layout + +\begin_layout LyX-Code + "pipeline" matches. 
+ That is, form a simpler pattern that can be +\end_layout + +\begin_layout LyX-Code + used to scan through a large database extracting sections that +\end_layout + +\begin_layout LyX-Code + might be matched by the more complex pattern. + Let me illustrate +\end_layout + +\begin_layout LyX-Code + with a short example. + Suppose that you really wished to match the +\end_layout + +\begin_layout LyX-Code + pattern +\end_layout + +\begin_layout LyX-Code + p1=3...5 0...8 ~p1[1,1,0] p2=6...7 3...6 AGC 3...5 RYGC ~p2[1,0,0] +\end_layout + +\begin_layout LyX-Code + In this case, the pattern units AGC 3...5 RYGC can be used to rapidly +\end_layout + +\begin_layout LyX-Code + constrain the overall search. + You can preprocess the overall +\end_layout + +\begin_layout LyX-Code + database using the pattern: +\end_layout + +\begin_layout LyX-Code + 31...31 AGC 3...5 RYGC 7...7 +\end_layout + +\begin_layout LyX-Code + Put the complex pattern in pat_file1 and the simpler pattern in +\end_layout + +\begin_layout LyX-Code + pat_file2. + Then use, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c pat_file2 < nucleotide_database | +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file1 +\end_layout + +\begin_layout LyX-Code + The output will show things like +\end_layout + +\begin_layout LyX-Code + >seqid:[232,280][2,47] +\end_layout + +\begin_layout LyX-Code + matches pieces +\end_layout + +\begin_layout LyX-Code + Then, the actual section of the sequence that was matched can be +\end_layout + +\begin_layout LyX-Code + easily computed as [233,278] (remember, the positions start from +\end_layout + +\begin_layout LyX-Code + 1, not 0). 
+\end_layout + +\begin_layout LyX-Code + Let me finally add, you should do a few short experiments to see +\end_layout + +\begin_layout LyX-Code + whether or not such pipelining actually improves performance -- it +\end_layout + +\begin_layout LyX-Code + is not always obvious where the time is going, and I have +\end_layout + +\begin_layout LyX-Code + sometimes found that the added complexity of pipelining actually +\end_layout + +\begin_layout LyX-Code + slowed things up. + It gets its best improvements when there are +\end_layout + +\begin_layout LyX-Code + exact matches of more than just a few characters that can be +\end_layout + +\begin_layout LyX-Code + rapidly used to eliminate large sections of the database. +\end_layout + +\begin_layout LyX-Code +============= +\end_layout + +\begin_layout LyX-Code +Additions: +\end_layout + +\begin_layout LyX-Code +Feb 9, 1995: the pattern units ^ and $ now work as in normal regular +\end_layout + +\begin_layout LyX-Code + expressions. + That is +\end_layout + +\begin_layout LyX-Code + TTF $ +\end_layout + +\begin_layout LyX-Code + matches only TTF at the end of the string and +\end_layout + +\begin_layout LyX-Code + ^ TTF +\end_layout + +\begin_layout LyX-Code + matches only an initial TTF +\end_layout + +\begin_layout LyX-Code + The pattern unit +\end_layout + +\begin_layout LyX-Code + : +\end_layout + +\begin_layout Standard +\begin_inset Box Frameless +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 0 +width "100col%" +special "none" +height "1in" +height_special "totalheight" +status open + +\begin_layout LyX-Code + +\size scriptsize +Program name: read_fasta +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Author: Martin Asser Hansen - Copyright (C) - All rights reserved +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Contact: mail@maasha.dk +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Date: August 2007 +\end_layout + +\begin_layout LyX-Code + +\size 
scriptsize +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/ +gpl.html) +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Description: Read FASTA entries. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Usage: read_fasta [options] -i +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Options: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-i | --data_in=] - Comma separated list of files + to read. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-n | --num=] - Limit number of records to read. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-I | --stream_in=] - Read input stream from file + - Default=STDIN +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + [-O | --stream_out=] - Write output stream to file + - Default=STDOUT +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +Examples: +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test.fna - Read FASTA entries from file. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test1.fna,test2.fna - Read FASTA entries from files. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i '*.fna' - Read FASTA entries from files. +\end_layout + +\begin_layout LyX-Code + +\size scriptsize + read_fasta -i test.fna -n 10 - Read first 10 FASTA entries from + file. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +The Data Stream +\end_layout + +\begin_layout Subsection +How to read the data stream from file? +\begin_inset LatexCommand label +name "sub:How-to-read-stream" + +\end_inset + + +\end_layout + +\begin_layout Standard +You want to read a data stream that you previously have saved to file in + biotools format. + This can be done implicitly or explicitly.
+ The implicit way uses the 'stdout' stream of the Unix terminal: +\end_layout + +\begin_layout LyX-Code +cat <file> | <biotool> +\end_layout + +\begin_layout Standard +cat is the Unix command that reads a file and outputs the result to 'stdout' + --- which in this case is piped to any biotool represented by the <biotool>. + It is also possible to read the data stream using '<' to direct the 'stdout' + stream into the biotool like this: +\end_layout + +\begin_layout LyX-Code +<biotool> < <file> +\end_layout + +\begin_layout Standard +However, that will not work if you pipe more biotools together. + Then it is much safer to read the stream from a file explicitly like this: +\end_layout + +\begin_layout LyX-Code +<biotool> --stream_in=<file> +\end_layout + +\begin_layout Standard +Here the filename is explicitly given to the biotool with + the switch +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_in. + This switch works with all biotools. + It is also possible to read in data from multiple sources by repeating + the explicit read step: +\end_layout + +\begin_layout LyX-Code +<biotool> --stream_in=<file1> | <biotool> --stream_in=<file2> +\end_layout + +\begin_layout Subsection +How to write the data stream to file? +\begin_inset LatexCommand label +name "sub:How-to-write-stream" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to save the output stream from a biotool to file, so you can read + in the stream again at a later time, you can do one of two things: +\end_layout + +\begin_layout LyX-Code +<biotool> > <file> +\end_layout + +\begin_layout Standard +All the biotools write the data stream to 'stdout' by default which can + be written to a file by redirecting 'stdout' to file using '>' , however, + if one of the biotools for writing other formats is used then both + the biotools records as well as the result output will go to 'stdout' in + a mixture causing havoc!
To avoid this you must use the switch +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out that explicitly tells the biotool to write the output stream to + file: +\end_layout + +\begin_layout LyX-Code + --stream_out= +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out switch works with all biotools. +\end_layout + +\begin_layout Subsection +How to terminate the data stream? +\end_layout + +\begin_layout Standard +The data stream never stops unless the user wants to save the stream or + supplies the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch that will terminate the stream: +\end_layout + +\begin_layout LyX-Code + --no_stream +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch only works with those biotools where it makes sense that + the user might want to terminate the data stream, +\emph on +i.e +\emph default +. + after an analysis step where the user wants to output the result, but not + the data stream. +\end_layout + +\begin_layout Subsection +How to write my results to file? +\begin_inset LatexCommand label +name "sub:How-to-write-result" + +\end_inset + + +\end_layout + +\begin_layout Standard +Saving the result of an analysis to file can be done implicitly or explicitly.
+ The implicit way: +\end_layout + +\begin_layout LyX-Code + --no_stream > +\end_layout + +\begin_layout Standard +If you use '>' to redirect 'stdout' to file then it is important to use + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch to avoid writing a mix of biotools records and result to + the same file causing havoc. + The safe way is to use the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch which explicitly tells the biotool to write the result + to a given file: +\end_layout + +\begin_layout LyX-Code + --result_out= +\end_layout + +\begin_layout Standard +Using the above method will not terminate the stream, so it is possible + to pipe that into another biotool generating different results: +\end_layout + +\begin_layout LyX-Code + --result_out= | --result_out= +\end_layout + +\begin_layout Standard +And still the data stream will continue unless terminated with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream: +\end_layout + +\begin_layout LyX-Code + --result_out= --no_stream +\end_layout + +\begin_layout Standard +Or written to file implicitly or explicitly +\begin_inset LatexCommand eqref +reference "sub:How-to-write-result" + +\end_inset + +. + The explicit way: +\end_layout + +\begin_layout LyX-Code + --result_out= --stream_out= +\end_layout + +\begin_layout Subsection +How to read data from multiple sources? +\end_layout + +\begin_layout Standard +To read multiple data sources, with the same type or different type of data + do: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= +\end_layout + +\begin_layout Standard +where type is the data type a specific biotool reads. +\end_layout + +\begin_layout Section +Reading input +\end_layout + +\begin_layout Subsection +How to read biotools input?
+\end_layout + +\begin_layout Standard +See +\begin_inset LatexCommand eqref +reference "sub:How-to-read-stream" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +How to read in data? +\end_layout + +\begin_layout Standard +Data in different formats can be read with the appropriate biotool for that + format. + The biotools are typically named 'read_' such as +\series bold +read_fasta +\series default +, +\series bold +read_bed +\series default +, +\series bold +read_tab +\series default +, etc., and all behave in a similar manner. + Data can be read by supplying the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +data_in switch and a file name to the file containing the data: +\end_layout + +\begin_layout LyX-Code + --data_in= +\end_layout + +\begin_layout Standard +It is also possible to read in a saved biotools stream (see +\begin_inset LatexCommand ref +reference "sub:How-to-read-stream" + +\end_inset + +) as well as reading data in one go: +\end_layout + +\begin_layout LyX-Code + --stream_in= --data_in= +\end_layout + +\begin_layout Standard +If you want to read data from several files you can do this: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= +\end_layout + +\begin_layout Standard +If you have several data files you can read in all explicitly with a comma + separated list: +\end_layout + +\begin_layout LyX-Code + --data_in=file1,file2,file3 +\end_layout + +\begin_layout Standard +And it is also possible to use file globbing +\begin_inset Foot +status open + +\begin_layout Standard +using the short option will only work if you quote the argument -i '*.fna' +\end_layout + +\end_inset + +: +\end_layout + +\begin_layout LyX-Code + --data_in=*.fna +\end_layout + +\begin_layout Standard +Or in a combination: +\end_layout + +\begin_layout LyX-Code + --data_in=file1,/dir/*.fna +\end_layout + +\begin_layout Standard +Finally, it is possible to read in data in different
formats using the appropria +te biotool for each format: +\end_layout + +\begin_layout LyX-Code + --data_in= | --data_in= ... +\end_layout + +\begin_layout Subsection +How to read FASTA input? +\end_layout + +\begin_layout Standard +Sequences in FASTA format can be read explicitly using +\series bold +read_fasta +\series default +: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= +\end_layout + +\begin_layout Subsection +How to read alignment input? +\end_layout + +\begin_layout Standard +If your alignment is FASTA formatted then you can use +\series bold +read_align +\series default +. + It is also possible to use +\series bold +read_fasta +\series default + since the data is FASTA formatted, however, with +\series bold +read_fasta +\series default + the key ALIGN will be omitted. + The ALIGN key is used to determine which sequences belong to what alignment + which is required for +\series bold +write_align +\series default +. +\end_layout + +\begin_layout LyX-Code +read_align --data_in= +\end_layout + +\begin_layout Subsection +How to read tabular input? +\begin_inset LatexCommand label +name "sub:How-to-read-table" + +\end_inset + + +\end_layout + +\begin_layout Standard +Tabular input can be read with +\series bold +read_tab +\series default + which will read in all rows and chosen columns (separated by a given delimiter) + from a table in text format.
+\end_layout + +\begin_layout Standard +The table below: +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Standard +Human +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +ATACGTCAG +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +23524 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Dog +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +AGCATGAC +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +2442 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Mouse +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +GACTG +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +234 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Cat +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +AAATGCA +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +2342 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +Can be read using the command: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= +\end_layout + +\begin_layout Standard +Which will result in four records, one for each row, where the keys V0, + V1, V2 are the default keys for the organism, sequence, and count, respectively. + It is possible to select a subset of colums to read by using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +cols switch which takes a comma separated list of columns numbers (first + column is designated 0) as argument. 
+ So to read in only the sequence and the count so that the count comes before + the sequence do: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --cols=2,1 +\end_layout + +\begin_layout Standard +It is also possible to name the columns with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --cols=2,1 --keys=COUNT,SEQ +\end_layout + +\begin_layout Subsection +How to read BED input? +\end_layout + +\begin_layout Standard +The BED (Browser Extensible Data +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://genome.ucsc.edu/FAQ/FAQformat" + +\end_inset + + +\end_layout + +\end_inset + +) format is a tabular format for data pertaining to one of the Eukaryotic + genomes in the UCSC genome browser +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://genome.ucsc.edu/" + +\end_inset + + +\end_layout + +\end_inset + +. + The BED format consists of up to 12 columns, where the first three are + mandatory CHR, CHR_BEG, and CHR_END. + The mandatory columns and any of the optional columns can all be read in + easily with the +\series bold +read_bed +\series default + biotool. +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= +\end_layout + +\begin_layout Standard +It is also possible to read the BED file with +\series bold +read_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-read-table" + +\end_inset + +), however, that will be more cumbersome because you need to specify the + keys: +\end_layout + +\begin_layout LyX-Code +read_tab --data_in= --keys=CHR,CHR_BEG,CHR_END ... +\end_layout + +\begin_layout Subsection +How to read PSL input?
+\end_layout + +\begin_layout Standard +The PSL format is the output from BLAT and contains 21 mandatory fields + that can be read with +\series bold +read_psl +\series default +: +\end_layout + +\begin_layout LyX-Code +read_psl --data_in= +\end_layout + +\begin_layout Section +Writing output +\end_layout + +\begin_layout Standard +All result output can be written explicitly to file using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch which all result generating biotools have. + It is also possible to write the result to file implicitly by directing + 'stdout' to file using '>', however, that requires the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch to prevent a mixture of data stream and results in the file. + The explicit (and safe) way: +\end_layout + +\begin_layout LyX-Code +... + | --result_out= +\end_layout + +\begin_layout Standard +The implicit way: +\end_layout + +\begin_layout LyX-Code +... + | --no_stream > +\end_layout + +\begin_layout Subsection +How to write biotools output? +\end_layout + +\begin_layout Standard +See +\begin_inset LatexCommand eqref +reference "sub:How-to-write-stream" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +How to write FASTA output? +\begin_inset LatexCommand label +name "sub:How-to-write-fasta" + +\end_inset + + +\end_layout + +\begin_layout Standard +FASTA output can be written with +\series bold +write_fasta +\series default +. +\end_layout + +\begin_layout LyX-Code +... + | write_fasta --result_out= +\end_layout + +\begin_layout Standard +It is also possible to wrap the sequences to a given width using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +wrap switch although wrapping of sequence is generally an evil thing: +\end_layout + +\begin_layout LyX-Code +...
+ | write_fasta --no_stream --wrap=80 +\end_layout + +\begin_layout Subsection +How to write alignment output? +\begin_inset LatexCommand label +name "sub:How-to-write-alignment" + +\end_inset + + +\end_layout + +\begin_layout Standard +Pretty alignments with ruler +\begin_inset Foot +status collapsed + +\begin_layout Standard +'.' for every 10 residues, ':' for every 50, and '|' for every 100 +\end_layout + +\end_inset + + and consensus sequence +\begin_inset Note Note +status collapsed + +\begin_layout Standard +which reminds me to make that an option. +\end_layout + +\end_inset + + can be created with +\series bold +write_align +\series default +, which also has the optional +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +wrap switch to break the alignment into blocks of a given width: +\end_layout + +\begin_layout LyX-Code +... + | write_align --result_out= --wrap=80 +\end_layout + +\begin_layout Standard +If the number of sequences in the alignment is 2 then a pairwise alignment + will be output otherwise a multiple alignment. + And if the sequence type, determined automagically, is protein, then residues + and symbols (+,\InsetSpace ~ +:,\InsetSpace ~ +.) will be used to show consensus according to the Blosum62 + matrix. +\end_layout + +\begin_layout Subsection +How to write tabular output? +\begin_inset LatexCommand label +name "sub:How-to-write-tab" + +\end_inset + + +\end_layout + +\begin_layout Standard +Outputting the data stream as a table can be done with +\series bold +write_tab +\series default +, which will generate one row per record with the values as columns. + If you supply the optional +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comment switch, then the first row in the table will be a 'comment' line + prefixed with a '#': +\end_layout + +\begin_layout LyX-Code +...
+ | write_tab --result_out= --comment +\end_layout + +\begin_layout Standard +You can also change the delimiter from the default (tab) to +\emph on +e.g. + +\emph default + ',': +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --delimit=',' +\end_layout + +\begin_layout Standard +If you want the values output in a specific order you have to supply a comma + separated list using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch that will print only those keys in that order: +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --keys=SEQ_NAME,COUNT +\end_layout + +\begin_layout Standard +Alternatively, if you have some keys that you don't want in the tabular + output, use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_keys switch. + So to print all keys except SEQ and SEQ_TYPE do: +\end_layout + +\begin_layout LyX-Code +... + | write_tab --result_out= --no_keys=SEQ,SEQ_TYPE +\end_layout + +\begin_layout Standard +Finally, if you have a stream containing a mix of different record types, + +\emph on +e.g. + +\emph default + records with sequences and records with matches, then you can use +\series bold +write_tab +\series default + to output all the records in tabular format, however, the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comment, +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys, and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_keys switches will only respond to records of the first type encountered.
+ The reason is that outputting mixed records is probably not what you want + anyway, and you should remove all the unwanted records from the stream + before outputting the table: +\series bold +grab +\series default + is your friend (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to write a BED output? +\begin_inset LatexCommand label +name "sub:How-to-write-BED" + +\end_inset + + +\end_layout + +\begin_layout Standard +Data in BED format can be output if the records contain the mandatory keys + CHR, CHR_BEG, and CHR_END using +\series bold +write_bed +\series default +. + If the optional keys are also present, they will be output as well: +\end_layout + +\begin_layout LyX-Code +write_bed --result_out= +\end_layout + +\begin_layout Subsection +How to write PSL output? +\begin_inset LatexCommand label +name "sub:How-to-write-PSL" + +\end_inset + + +\end_layout + +\begin_layout Standard +Data in PSL format can be output using +\series bold +write_psl: +\end_layout + +\begin_layout LyX-Code +write_psl --result_out= +\end_layout + +\begin_layout Section +Manipulating Records +\end_layout + +\begin_layout Subsection +How to select a few records? +\begin_inset LatexCommand label +name "sub:How-to-select-a-few-records" + +\end_inset + + +\end_layout + +\begin_layout Standard +To quickly get an overview of your data you can limit the data stream to + show a few records. + This is also very useful to test the pipeline with a few records if you are + setting up a complex analysis using several biotools. + That way you can inspect that all goes well before analyzing and waiting + for the full data set. + All of the read_ biotools have the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch which will take a number as argument and only that number of + records will be read.
+ So to read in the first 10 FASTA entries from a file: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna --num=10 +\end_layout + +\begin_layout Standard +Another way of doing this is to use +\series bold +head_records +\series default +, which will limit the stream to show the first 10 records (default): +\end_layout + +\begin_layout LyX-Code +... + | head_records +\end_layout + +\begin_layout Standard +Using +\series bold +head_records +\series default + directly after one of the read_ biotools will be a lot slower than + using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch with the read_ biotools, however, +\series bold +head_records +\series default + can also be used to limit the output from all the other biotools. + It is also possible to give +\series bold +head_records +\series default + a number of records to show using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num switch. + So to display the first 100 records do: +\end_layout + +\begin_layout LyX-Code +... + | head_records --num=100 +\end_layout + +\begin_layout Subsection +How to select random records? +\begin_inset LatexCommand label +name "sub:How-to-select-random-records" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you want to inspect a number of random records from the stream this can + be done with the +\series bold +random_records +\series default + biotool. + So if you have 1 million records in the stream and you want to select 1000 + random records do: +\end_layout + +\begin_layout LyX-Code +... + | random_records --num=1000 +\end_layout + +\begin_layout Subsection +How to count all records in the data stream?
+\end_layout + +\begin_layout Standard +To count all the records in the data stream use +\series bold +count_records +\series default +, which adds one record (which is not included in the count) to the data + stream. + So to count the number of sequences in a FASTA file you can do this: +\end_layout + +\begin_layout LyX-Code +cat test.fna | read_fasta | count_records --no_stream +\end_layout + +\begin_layout Standard +Which will write the last record containing the count to 'stdout': +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +count_records: 630 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout Standard +It is also possible to write the count to file using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out switch. +\end_layout + +\begin_layout Subsection +How to get the length of record values? +\begin_inset LatexCommand label +name "sub:How-to-get-value_length" + +\end_inset + + +\end_layout + +\begin_layout Standard +Use the +\series bold +length_vals +\series default + biotool to get the length of each value for a comma separated list of keys: +\end_layout + +\begin_layout LyX-Code +... + | length_vals --keys=HIT,PATTERN +\end_layout + +\begin_layout Subsection +How to grab specific records? +\begin_inset LatexCommand label +name "sub:How-to-grab" + +\end_inset + + +\end_layout + +\begin_layout Standard +The biotool +\series bold +grab +\series default + is related to the Unix grep and locates records based on matching keys + and/or values using either a pattern, a Perl regex, or a numerical evaluation. + To easily +\series bold +grab +\series default + all records in the stream that has any mentioning of the pattern 'human' + just pipe the data stream through +\series bold +grab +\series default + like this: +\end_layout + +\begin_layout LyX-Code +... 
+ | grab --pattern=human +\end_layout + +\begin_layout Standard +This will search for the pattern 'human' in all keys and all values. + The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch takes a comma separated list of patterns, so in order to + match multiple patterns do: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human,mouse +\end_layout + +\begin_layout Standard +It is also possible to use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in switch instead of +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern. + +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in is used to read a file with one pattern per line: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern_in=patterns.txt +\end_layout + +\begin_layout Standard +If you want the opposite result --- to find all records that does not match + the patterns, add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +invert switch, which not only works with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch, but also with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +regex and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +eval: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human --invert +\end_layout + +\begin_layout Standard +If you want to search the record keys only, +\emph on +e.g. 
+ +\emph default + to find all records containing the key SEQ you can add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys_only switch. + This will prevent matching of SEQ in any record value, and since SEQ + is a not uncommon peptide sequence you could otherwise get an unwanted record. + Also, this will give an increase in speed since only the keys are searched: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=SEQ --keys_only +\end_layout + +\begin_layout Standard +However, if you are interested in finding the peptide sequence SEQ and not + the SEQ key, just add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +vals_only switch instead: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=SEQ --vals_only +\end_layout + +\begin_layout Standard +Also, if you want to grab for certain key/value pairs you can supply a comma + separated list of keys whose values will then be searched using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +keys switch. + This is handy if your records contain large genomic sequences and you don't + want to search the entire sequence for +\emph on +e.g. + +\emph default + the organism name --- it is much faster to tell +\series bold +grab +\series default + which keys to search the value for: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern=human --keys=SEQ_NAME +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Standard +It is also possible to invoke flexible matching using regex (regular expressions +) instead of simple pattern matching.
+ In +\series bold +grab +\series default + the regex engine is Perl based and allows use of different types of wild + cards, alternatives, +\emph on +etc +\emph default + +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://perldoc.perl.org/perlreref.html" + +\end_inset + + +\end_layout + +\end_inset + +. + If you want to +\series bold +grab +\series default + records with the sequence ATCG or GCTA you can do this: +\end_layout + +\begin_layout LyX-Code +... + | grab --regex='ATCG|GCTA' +\end_layout + +\begin_layout Standard +Or if you want to find sequences beginning with ATCG: +\end_layout + +\begin_layout LyX-Code +... + | grab --regex='^ATCG' +\end_layout + +\begin_layout Standard +You can also use +\series bold +grab +\series default + to locate records that fulfill a numerical property using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +eval switch which takes an expression in three parts. + The first part is the key that holds the value we want to evaluate, the + second part holds one of the eight operators: +\end_layout + +\begin_layout Enumerate +Greater than: > +\end_layout + +\begin_layout Enumerate +Greater than or equal to: >= +\end_layout + +\begin_layout Enumerate +Less than: < +\end_layout + +\begin_layout Enumerate +Less than or equal to: <= +\end_layout + +\begin_layout Enumerate +Equal to: = +\end_layout + +\begin_layout Enumerate +Not equal to: != +\end_layout + +\begin_layout Enumerate +String wise equal to: eq +\end_layout + +\begin_layout Enumerate +String wise not equal to: ne +\end_layout + +\begin_layout Standard +And finally comes the number used in the evaluation. + So to +\series bold +grab +\series default + all records with a sequence length greater than 30: +\end_layout + +\begin_layout LyX-Code +...
+ length_seq | grab --eval='SEQ_LEN > 30' +\end_layout + +\begin_layout Standard +If you want to locate all records containing the pattern 'human' and where + the sequence length is greater than 30, you do this by running the stream + through +\series bold +grab +\series default + twice: +\end_layout + +\begin_layout LyX-Code +... + | grab --pattern='human' | length_seq | grab --eval='SEQ_LEN > 30' +\end_layout + +\begin_layout Standard +Finally, it is possible to do fast matching of expressions from a file using + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact switch. + Each of these expressions has to be matched exactly over the entire length, + which is useful if you have a file with accession numbers, that you want + to locate in the stream: +\end_layout + +\begin_layout LyX-Code +... + | grab --exact acc_no.txt | ... +\end_layout + +\begin_layout Standard +Using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact is much faster than using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in, because with +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +exact the expressions have to be complete matches, whereas +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in looks for subpatterns. +\end_layout + +\begin_layout Standard +NB! To get the best speed performance, use the most restrictive +\series bold +grab +\series default + first. +\end_layout + +\begin_layout Subsection +How to remove keys from records? +\end_layout + +\begin_layout Standard +To remove one or more specific keys from all records in the data stream + use +\series bold +remove_keys +\series default + like this: +\end_layout + +\begin_layout LyX-Code +...
+ | remove_keys --keys=SEQ,SEQ_NAME +\end_layout + +\begin_layout Standard +In the above example SEQ and SEQ_NAME will be removed from all records if + they exist in these. + If all keys are removed from a record, then the record will be removed. +\end_layout + +\begin_layout Subsection +How to rename keys in records? +\end_layout + +\begin_layout Standard +Sometimes you want to rename a record key, +\emph on +e.g. + +\emph default + if you have read in a two column table with sequence name and sequence + in each column (see +\begin_inset LatexCommand ref +reference "sub:How-to-read-table" + +\end_inset + +) without specifying the key names, then the sequence name will be called + V0 and the sequence V1 as default in the +\series bold +read_tab +\series default + biotool. + To rename the V0 and V1 keys we need to run the stream through +\series bold +rename_keys +\series default + twice (once for each key to rename): +\end_layout + +\begin_layout LyX-Code +... + | rename_keys --keys=V0,SEQ_NAME | rename_keys --keys=V1,SEQ +\end_layout + +\begin_layout Standard +The first instance of +\series bold +rename_keys +\series default + replaces all the V0 keys with SEQ_NAME, and the second instance of +\series bold +rename_keys +\series default + replaces all the V1 keys with SEQ. + +\emph on +Et voilà +\emph default + the data can now be used in the biotools that require these keys. +\end_layout + +\begin_layout Section +Manipulating Sequences +\end_layout + +\begin_layout Subsection +How to get sequence lengths? +\end_layout + +\begin_layout Standard +The length for sequences in records can be determined with +\series bold +length_seq +\series default +, which adds the key SEQ_LEN to each record with the sequence length as + the value. + It also generates an extra record that is emitted last with the key TOTAL_SEQ_L +EN showing the total length of all the sequences.
+\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_seq +\end_layout + +\begin_layout Standard +It is also possible to determine the sequence length using the generic tool + +\series bold +length_vals +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-get-value_length" + +\end_inset + +, which determines the length of the values for a given list of keys: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_vals --keys=SEQ +\end_layout + +\begin_layout Standard +To obtain the total length of all sequences use +\series bold +sum_vals +\series default + like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_vals --keys=SEQ +\end_layout + +\begin_layout LyX-Code +| sum_vals --keys=SEQ_LEN +\end_layout + +\begin_layout Standard +The biotool +\series bold +analyze_seq +\series default + will also determine the length of each sequence (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-analyze" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to analyze sequence composition? +\begin_inset LatexCommand label +name "sub:How-to-analyze" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you want to find out the sequence type, composition, length, as well + as GC content, indel content and proportions of soft and hard masked sequence, + then use +\series bold +analyze_seq +\series default +. + This handy biotool will determine all these things per sequence from which + it is easy to get an overview using the +\series bold +write_tab +\series default + biotool to output a table (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +). 
+ So in order to determine the sequence composition of a FASTA file with + just one entry containing the sequence 'ATCG' we just read the data with + +\series bold +read_fasta +\series default + and run the output through +\series bold +analyze_seq +\series default + which will add the analysis to the record like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq ... +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:D: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +MIX_INDEX: 0.55 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:W: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:G: 16 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SOFT_MASK%: 63.75 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:B: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:V: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +HARD_MASK%: 0.00 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:H: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:S: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:N: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:.: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +GC%: 35.00 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:A: 8 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:Y: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:M: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:T: 44 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ_TYPE: DNA +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:K: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:~: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ: 
TTTCAGTTTGGGACGGAGTAAGGCCTTCCtttttttttttttttttttttttttttttgagaccgagtcttgctc +tgtcg +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +SEQ_LEN: 80 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:R: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:C: 12 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:-: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +RES:U: 0 +\end_layout + +\begin_layout LyX-Code + +\size scriptsize +--- +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Standard +Now to make a table of how many As, Ts, Cs, and Gs you can add the following: +\end_layout + +\begin_layout LyX-Code +... + | analyze_seq | write_tab --keys=RES:A,RES:T,RES:C,RES:G +\end_layout + +\begin_layout Standard +Or if you want to see the proportions of hard and soft masked sequence: +\end_layout + +\begin_layout LyX-Code +... + | analyze_seq | write_tab --keys=HARD_MASK%,SOFT_MASK% +\end_layout + +\begin_layout Standard +If you have a stack of sequences in one file and you want to determine the + mean GC content you can do it using the +\series bold +mean_vals +\series default + biotool: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | mean_vals --keys=GC% +\end_layout + +\begin_layout Standard +Or if you want the total count of Ns you can use +\series bold +sum_vals +\series default + like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | sum_vals --keys=RES:N +\end_layout + +\begin_layout Standard +The MIX_INDEX key is calculated as the count of the most common residue + over the sequence length, and can be used as a cut-off for removing sequence + tags consisting of mostly one nucleotide: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | analyze_seq | grab --eval='MIX_INDEX<0.85' +\end_layout + +\begin_layout Subsection +How to extract subsequences?
+\begin_inset LatexCommand label +name "sub:How-to-extract" + +\end_inset + + +\end_layout + +\begin_layout Standard +In order to extract a subsequence from a longer sequence use the biotool + extract_seq, which will replace the sequence in the record with the subsequence + (this behaviour should probably be modified to be dependent on a +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +replace or a +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_replace switch +\begin_inset Note Note +status collapsed + +\begin_layout Standard +also in split_seq +\end_layout + +\end_inset + +). + So to extract the first 20 residues from all sequences do (first residue + is designated 1): +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=1 --len=20 +\end_layout + +\begin_layout Standard +You can also specify a begin and end coordinate set: +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=20 --end=40 +\end_layout + +\begin_layout Standard +If you want the subsequences from position 20 to the sequence end do: +\end_layout + +\begin_layout LyX-Code +... + | extract_seq --beg=20 +\end_layout + +\begin_layout Standard +If you want to extract subsequences a given distance from the sequence end + you can do this by reversing the sequence with the biotool +\series bold +reverse_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-reverse-seq" + +\end_inset + +, followed by +\series bold +extract_seq +\series default + to get the subsequence, and then +\series bold +reverse_seq +\series default + again to get the subsequence back in the original orientation: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in=test.fna | reverse_seq +\end_layout + +\begin_layout LyX-Code +| extract_seq --beg=10 --len=10 | reverse_seq +\end_layout + +\begin_layout Subsection +How to get genomic sequence?
+\begin_inset LatexCommand label +name "sub:How-to-get-genomic-sequence" + +\end_inset + + +\end_layout + +\begin_layout Standard +The biotool +\series bold +get_genome_seq +\series default + can extract subsequences for a given genome specified with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch explicitly using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +beg and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +end/ +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +len switches: +\end_layout + +\begin_layout LyX-Code +get_genome_seq --genome= --beg=1 --len=100 +\end_layout + +\begin_layout Standard +Alternatively, +\series bold +get_genome_seq +\series default + can be used to append the corresponding sequence to BED, PSL, and BLAST + records: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | get_genome_seq --genome= +\end_layout + +\begin_layout Standard +It is also possible to include flanking sequence using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +flank switch. + So to include 50 nucleotides upstream and 50 nucleotides downstream for + each BED entry do: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in= | get_genome_seq --genome= --flank=50 +\end_layout + +\begin_layout Subsection +How to upper-case sequences? +\end_layout + +\begin_layout Standard +Sequences can be shifted from lower case to upper case using +\series bold +uppercase_seq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | uppercase_seq +\end_layout + +\begin_layout Subsection +How to reverse sequences?
+\begin_inset LatexCommand label +name "sub:How-to-reverse-seq" + +\end_inset + + +\end_layout + +\begin_layout Standard +The order of residues in a sequence can be reversed using reverse_seq: +\end_layout + +\begin_layout LyX-Code +... + | reverse_seq +\end_layout + +\begin_layout Standard +Note that in order to reverse/complement a sequence you also need the +\series bold +complement_seq +\series default + biotool (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-complement" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to complement sequences? +\begin_inset LatexCommand label +name "sub:How-to-complement" + +\end_inset + + +\end_layout + +\begin_layout Standard +DNA and RNA sequences can be complemented with +\series bold +complement_seq +\series default +, which automagically determines the sequence type: +\end_layout + +\begin_layout LyX-Code +... + | complement_seq +\end_layout + +\begin_layout Standard +Note that in order to reverse/complement a sequence you also need the +\series bold +reverse_seq +\series default + biotool (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-reverse-seq" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to remove indels from sequences? +\end_layout + +\begin_layout Standard +Indels can be removed from sequences with the +\series bold +remove_indels +\series default + biotool. + This is useful if you have aligned some sequences (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-align" + +\end_inset + +) and extracted (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-extract" + +\end_inset + +) a block of subsequences from the alignment and you want to use these sequences + in a search where you need to remove the indels first. + '-', '~', and '.' are considered indels: +\end_layout + +\begin_layout LyX-Code +... + | remove_indels +\end_layout + +\begin_layout Subsection +How to shuffle sequences?
+\end_layout + +\begin_layout Standard +All residues in sequences in the stream can be shuffled to random positions + using the +\series bold +shuffle_seq +\series default + biotool: +\end_layout + +\begin_layout LyX-Code +... + | shuffle_seq +\end_layout + +\begin_layout Subsection +How to split sequences into overlapping subsequences? +\end_layout + +\begin_layout Standard +Sequences can be split into overlapping subsequences with the +\series bold +split_seq +\series default + biotool. +\end_layout + +\begin_layout LyX-Code +... + | split_seq --word_size=20 --uniq +\end_layout + +\begin_layout Subsection +How to determine the oligo frequency? +\end_layout + +\begin_layout Standard +In order to determine if any oligo usage is over represented in one or more + sequences you can determine the frequency of oligos of a given size with + +\series bold +oligo_freq +\series default +: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 +\end_layout + +\begin_layout Standard +And if you have more than one sequence and want to accumulate the frequencies + you need the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +all switch: +\end_layout + +\begin_layout LyX-Code +... + | oligo_freq --word_size=4 --all +\end_layout + +\begin_layout Standard +To get a meaningful result you need to write the resulting frequencies as + a table with +\series bold +write_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +), but first it is important to +\series bold +grab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +) the records with the frequencies to avoid full length sequences in the + table: +\end_layout + +\begin_layout LyX-Code +...
+ | oligo_freq --word_size=4 --all | grab --pattern=OLIGO --keys_only +\end_layout + +\begin_layout LyX-Code +| write_tab --no_stream +\end_layout + +\begin_layout Standard +And the resulting frequency table can be sorted with Unix sort (man sort). +\end_layout + +\begin_layout Subsection +How to search for sequences in genomes? +\end_layout + +\begin_layout Standard +See the following biotool: +\end_layout + +\begin_layout Itemize + +\series bold +patscan_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +blat_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +blast_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Itemize + +\series bold +vmatch_seq +\series default + +\begin_inset LatexCommand eqref +reference "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to search sequences for a pattern? +\begin_inset LatexCommand label +name "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Standard +It is possible to search sequences in the data stream for patterns using + the +\series bold +patscan_seq +\series default + biotool which utilizes the powerful scan_for_matches engine. + Consult the documentation for scan_for_matches in order to learn how to + define patterns (the documentation is included in Appendix\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sec:scan_for_matches-README" + +\end_inset + +). 
+\end_layout + +\begin_layout Standard +To search all sequences for a simple pattern consisting of the sequence + ATCGATCG allowing for 3 mismatches, 2 insertions and 1 deletion: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | patscan_seq --pattern='ATCGATCG[3,2,1]' +\end_layout + +\begin_layout Standard +The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern switch takes a comma seperated list of patterns, so if you want + to search for more that one pattern do: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern='ATCGATCG[3,2,1],GCTAGCTA[3,2,1]' +\end_layout + +\begin_layout Standard +It is also possible to have a list of different patterns to search for in + a file with one pattern per line. + In order to get +\series bold +patscan_seq +\series default + to read these patterns use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +pattern_in switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern_in= +\end_layout + +\begin_layout Standard +To also scan the complementary strand in nucleotide sequences ( +\series bold +patscan_seq +\series default + automagically determines the sequence type) you need to add the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +comp switch: +\end_layout + +\begin_layout LyX-Code +... + | patscan_seq --pattern= --comp +\end_layout + +\begin_layout Standard +It is also possible to use +\series bold +patscan_seq +\series default + to output those records that does not contain a certain pattern by using + the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +invert switch: +\end_layout + +\begin_layout LyX-Code +... 
+ | patscan_seq --pattern= --invert +\end_layout + +\begin_layout Standard +Finally, +\series bold +patscan_seq +\series default + can also scan for patterns in a given genome sequence, instead of sequences + in the stream, using the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch: +\end_layout + +\begin_layout LyX-Code +patscan --pattern= --genome= +\end_layout + +\begin_layout Subsection +How to use BLAT for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Standard +Sequences in the data stream can be matched against supported genomes using + +\series bold +blat_seq +\series default + which is a biotool using BLAT as the name might suggest. + Currently only Mouse and Human genomes are available and it is not possible + to use OOC files since there is still a need for a local repository for + genome files. + Otherwise it is just: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | blat_seq --genome= +\end_layout + +\begin_layout Standard +The search results can then be written to file with +\series bold +write_psl +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-PSL" + +\end_inset + +) or +\series bold +write_bed +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-BED" + +\end_inset + +) allthough with +\series bold +write_bed +\series default + some information will be lost). 
+ It is also possible to plot chromosome distribution of the search results + using +\series bold +plot_chrdist +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-chrdist" + +\end_inset + +) or the distribution of the match lengths using +\series bold +plot_lendist +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-lendist" + +\end_inset + +) or a karyogram with the hits using +\series bold +plot_karyogram +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-plot-karyogram" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to use BLAST for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Standard +Two biotools exist for blasting sequences: +\series bold +create_blast_db +\series default + is used to create the BLAST database required for BLAST which is queried + using the biotool +\series bold +blast_seq +\series default +. + So in order to create a BLAST database from sequences in the data stream + you simple run: +\end_layout + +\begin_layout LyX-Code +... + | create_blast_db --database=my_database --no_stream +\end_layout + +\begin_layout Standard +The type of sequence to use for the database is automagically determined + by +\series bold +create_blast_db +\series default +, but don't have a mixture of peptide and nucleic acids sequences in the + stream. + The +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database switch takes a path as argument, but will default to 'blastdb_ if not set. +\end_layout + +\begin_layout Standard +The resulting database can now be queried with sequences in another data + stream using +\series bold +blast_seq +\series default +: +\end_layout + +\begin_layout LyX-Code +... 
+ | blast_seq --database=my_database +\end_layout + +\begin_layout Standard +Again, the sequence type is determined automagically and the appropriate + BLAST program is guessed (see below table), however, the program name can + be overruled with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +program switch. +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Standard +Subject sequence +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Query sequence +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Program guess +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastn +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastp +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +blastx +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Standard +Nucleotide +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +Protein +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Standard +tblastn +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +Finally, it is also possible to use +\series bold +blast_seq +\series default + for blasting sequences agains a preformatted genome using 
the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +genome switch instead of the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database switch: +\end_layout + +\begin_layout LyX-Code +... + | blast_seq --genome= +\end_layout + +\begin_layout Subsection +How to use Vmatch for sequence search? +\begin_inset LatexCommand label +name "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Standard +The powerful suffix array software package Vmatch +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.vmatch.de/" + +\end_inset + + +\end_layout + +\end_inset + + can be used for exact mapping of sequences against indexed genomes using + the biotool +\series bold +vmatch_seq +\series default +, which will e.g. + map 700000 ESTs to the human genome locating all 160 mio hits in less than + an hour. + Only nucleotide sequences and sequences longer than 11 nucleotides will + be mapped. + It is recommended that sequences consisting of mostly one nucleotide type + are removed. + This can be done with the +\series bold +analyze_seq +\series default + biotool +\begin_inset LatexCommand eqref +reference "sub:How-to-analyze" + +\end_inset + +. +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= +\end_layout + +\begin_layout Standard +It is also possible to allow for mismatches using the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +hamming_dist switch. + So to allow for 2 mismatches: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=2 +\end_layout + +\begin_layout Standard +Or to allow for 10% mismathing nucleotides: +\end_layout + +\begin_layout LyX-Code +... 
+ | vmatch_seq --genome= --hamming_dist=10p +\end_layout + +\begin_layout Standard +To allow both indels and mismatches use the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +edit_dist switch. + So to allow for one mismatch or one indel: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=1 +\end_layout + +\begin_layout Standard +Or to allow for 5% indels or mismatches: +\end_layout + +\begin_layout LyX-Code +... + | vmatch_seq --genome= --hamming_dist=5p +\end_layout + +\begin_layout Standard +Note that using +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +hamming_dist or +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +edit_dist greatly slows down vmatch considerably --- use with care. +\end_layout + +\begin_layout Standard +The resulting SCORE key can be replaced to hold the number of genome matches + of a given sequence (multi-mappers) is the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +count switch is given. +\end_layout + +\begin_layout Subsection +How to find all matches between sequences? +\begin_inset LatexCommand label +name "sub:How-to-find-matches" + +\end_inset + + +\end_layout + +\begin_layout Standard +All matches between two sequences can be determined with the biotool +\series bold +match_seq +\series default +. 
+ The match finding engine underneath the hood of +\series bold +match_seq +\series default + is the super fast suffix tree program MUMmer +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://mummer.sourceforge.net/" + +\end_inset + + +\end_layout + +\end_inset + +, which will locate all forward and reverse matches between huge sequences + in a matter of minutes (if the repeat count is not too high and if the + word size used is appropriate). + Matching two +\emph on +Helicobacter pylori +\emph default + genomes (1.7Mbp) takes around 10 seconds: +\end_layout + +\begin_layout LyX-Code +... + | match_seq --word_size=20 --direction=both +\end_layout + +\begin_layout Standard +The output from +\series bold +match_seq +\series default + can be used to generate a dot plot with +\series bold +plot_matches +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-generate-dotplot" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to align sequences? +\begin_inset LatexCommand label +name "sub:How-to-align" + +\end_inset + + +\end_layout + +\begin_layout Standard +Sequences in the stream can be aligned with the +\series bold +align_seq +\series default + biotool that uses Muscle +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.drive5.com/muscle/muscle.html" + +\end_inset + + +\end_layout + +\end_inset + + as aligment engine. + Currently you cannot change any of the Muscle alignment parameters and + +\series bold +align_seq +\series default + will create an alignment based on the defaults (which are really good!): +\end_layout + +\begin_layout LyX-Code +... 
+ | align_seq +\end_layout + +\begin_layout Standard +The aligned output can be written to file in FASTA format using +\series bold +write_fasta +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-fasta" + +\end_inset + +) or in pretty text using +\series bold +write_align +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-alignment" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +How to create a weight matrix? +\end_layout + +\begin_layout Standard +If you want a weight matrix to show the sequence composition of a stack + of sequences you can use the biotool create_weight_matrix: +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix +\end_layout + +\begin_layout Standard +The result can be output in percent using the +\begin_inset ERT +status open + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +percent switch: +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix --percent +\end_layout + +\begin_layout Standard +The weight matrix can be written as tabular output with +\series bold +write_tab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-tab" + +\end_inset + +) after removeing the records containing SEQ with +\series bold +grab +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-grab" + +\end_inset + +): +\end_layout + +\begin_layout LyX-Code +... + | create_weight_matrix | grab --invert --keys=SEQ --keys_only +\end_layout + +\begin_layout LyX-Code +| write_tab --no_stream +\end_layout + +\begin_layout Standard +The V0 column will hold the residue, while the rest of the columns will + hold the frequencies for each sequence position. +\end_layout + +\begin_layout Section +Plotting +\end_layout + +\begin_layout Standard +There exists several biotools for plotting. 
+ Some of these are based on GNUplot +\begin_inset Foot +status open + +\begin_layout Standard +\begin_inset LatexCommand url +target "http://www.gnuplot.info/" + +\end_inset + + +\end_layout + +\end_inset + +, which is an extremely powerful platform to generate all sorts of plots + and even though GNUplot has quite a steep learning curve, the biotools + utilizing GNUplot are simple to use. + GNUplot is able to output a lot of different formats (called terminals + in GNUplot), but the biotools focusses on three formats only: +\end_layout + +\begin_layout Enumerate +The 'dumb' terminal is default to the GNUplot based biotools and will output + a plot in crude ASCII text (Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dumb-terminal" + +\end_inset + +). + This is quite nice for a quick and dirty plot to get an overview of your + data . +\end_layout + +\begin_layout Enumerate +The 'post' or 'postscript' terminal output postscript code which is publication + grade graphics that can be viewed with applications such as Ghostview, + Photoshop, and Preview. +\end_layout + +\begin_layout Enumerate +The 'svg' terminal output's scalable vector graphics (SVG) which is a vector + based format. + SVG is great because you can edit the resulting plot using Photoshop or + Inkscape +\begin_inset Foot +status collapsed + +\begin_layout Standard +Inkscape is a really handy drawing program that is free and open source. + Availble at +\begin_inset LatexCommand htmlurl +target "http://www.inkscape.org" + +\end_inset + + +\end_layout + +\end_inset + + if you want to add additional labels, captions, arrows, and so on and then + save the result in different formats, such as postscript without loosing + resolution. +\end_layout + +\begin_layout Standard +The biotools for plotting that are not based on GNUplot only output SVG + (that may change in the future). 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename lendist_ascii.png + lyxscale 70 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Dumb-terminal" + +\end_inset + +Dumb terminal +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +The output of a length distribution plot in the default 'dumb terminal' + to the terminal window. + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a histogram? +\begin_inset LatexCommand label +name "How-to-plot-histogram" + +\end_inset + + +\end_layout + +\begin_layout Standard +A generic histogram for a given value can be plotted with the biotool +\series bold +plot_histogram +\series default + (Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Histogram" + +\end_inset + +): +\end_layout + +\begin_layout LyX-Code +... + | plot_histogram --key=TISSUE --no_stream +\end_layout + +\begin_layout Standard +(Figure missing) +\end_layout + +\begin_layout Standard +\noindent +\align left +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename histogram.png + lyxscale 70 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Histogram" + +\end_inset + +Histogram +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a length distribution? +\begin_inset LatexCommand label +name "sub:How-to-plot-lendist" + +\end_inset + + +\end_layout + +\begin_layout Standard +Plotting of length distributions, weather sequence lengths, patterns lengths, + hit lengths, +\emph on +etc. 
+ +\emph default + is a really handy thing and can be done with the the biotool +\series bold +plot_lendist +\series default +. + If you have a file with FASTA entries and want to plot the length distribution + you do it like this: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | length_seq +\end_layout + +\begin_layout LyX-Code +| plot_lendist --key=SEQ_LEN --no_stream +\end_layout + +\begin_layout Standard +The result will be written to the default dumb terminal and will look like + Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dumb-terminal" + +\end_inset + +. +\end_layout + +\begin_layout Standard +If you instead want the result in postscript format you can do: +\end_layout + +\begin_layout LyX-Code +... + | plot_lendist --key=SEQ_LEN --terminal=post --result_out=file.ps +\end_layout + +\begin_layout Standard +That will generate the plot and save it to file, but not interrupt the data + stream which can then be used in further analysis. + You can also save the plot implicetly using '>', however, it is then important + to terminate the stream with the +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream switch: +\end_layout + +\begin_layout LyX-Code +... + | plot_lendist --key=SEQ_LEN --terminal=post --no_stream > file.ps +\end_layout + +\begin_layout Standard +The resulting plot can be seen in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Length-distribution" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename lendist.ps + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Length-distribution" + +\end_inset + +Length distribution +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Length distribution of 630 piRNA like RNAs. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a chromosome distribution? +\begin_inset LatexCommand label +name "sub:How-to-plot-chrdist" + +\end_inset + + +\end_layout + +\begin_layout Standard +If you have the result of a sequence search against a multi chromosome genome, + it is very practical to be able to plot the distribution of search hits + on the different chromosomes. + This can be done with +\series bold +plot_chrdist +\series default +: +\end_layout + +\begin_layout LyX-Code +read_fasta --data_in= | blat_genome | plot_chrdist --no_stream +\end_layout + +\begin_layout Standard +The above example will result in a crude plot using the 'dumb' terminal, + and if you want to mess around with the results from the BLAT search you + probably want to save the result to file first (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-write-PSL" + +\end_inset + +). + To plot the chromosome distribution from the saved search result you can + do: +\end_layout + +\begin_layout LyX-Code +read_bed --data_in=file.bed | plot_chrdist --terminal=post --result_out=plot.ps +\end_layout + +\begin_layout Standard +That will result in the output show in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Chromosome-distribution" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename chrdist.ps + lyxscale 50 + width 12cm + rotateAngle 90 + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Chromosome-distribution" + +\end_inset + +Chromosome distribution +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to generate a dotplot? +\begin_inset LatexCommand label +name "sub:How-to-generate-dotplot" + +\end_inset + + +\end_layout + +\begin_layout Standard +A dotplot is a powerful way to get an overview of the size and location + of sequence insertions, deletions, and duplications between two sequences. + Generating a dotplot with biotools is a two step process where you initially + find all matches between two sequences using the tool +\series bold +match_seq +\series default + (see\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "sub:How-to-find-matches" + +\end_inset + +) and plot the resulting matches with +\series bold +plot_matches +\series default +. + Matching and plotting two +\emph on +Helicobacter pylori +\emph default + genomes (1.7Mbp) takes around 10 seconds: +\end_layout + +\begin_layout LyX-Code +... + | match_seq | plot_matches --terminal=post --result_out=plot.ps +\end_layout + +\begin_layout Standard +The resulting dotplot is in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Dotplot" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename dotplot.ps + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Dotplot" + +\end_inset + +Dotplot +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Forward matches are displayed in green while reverse matches are displayed + in red. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a sequence logo? +\end_layout + +\begin_layout Standard +Sequence logos can be generate with +\series bold +plot_seqlogo +\series default +. + The sequnce type is determined automagically and an entropy scale of 2 + bits and 4 bits is used for nucleotide and peptide sequences, respectively +\begin_inset Foot +status collapsed + +\begin_layout Standard +\begin_inset LatexCommand htmlurl +target "http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html" + +\end_inset + + +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout LyX-Code +... + | plot_seqlogo --no_stream --result_out=seqlogo.svg +\end_layout + +\begin_layout Standard +An example of a sequence logo can be seen in Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Sequence-logo" + +\end_inset + +. 
+\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename seqlogo.png + lyxscale 50 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Sequence-logo" + +\end_inset + +Sequence logo +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +How to plot a karyogram? +\begin_inset LatexCommand label +name "sub:How-to-plot-karyogram" + +\end_inset + + +\end_layout + +\begin_layout Standard +To plot search hits on genomes use +\series bold +plot_karyogram +\series default +, which will output a nice karyogram in SVG graphics: +\end_layout + +\begin_layout LyX-Code +... + | plot_karyogram --result_out=karyogram.svg +\end_layout + +\begin_layout Standard +The banding data is taken from the UCSC genome browser database and currently + only Human and Mouse is supported. + Fig.\InsetSpace ~ + +\begin_inset LatexCommand ref +reference "fig:Karyogram" + +\end_inset + + shows the distribution of piRNA like RNAs matched to the Human genome. +\end_layout + +\begin_layout Standard +\begin_inset Float figure +wide false +sideways false +status open + +\begin_layout Standard +\noindent +\align center +\begin_inset Graphics + filename karyogram.png + lyxscale 35 + width 12cm + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Caption + +\begin_layout Standard +\begin_inset LatexCommand label +name "fig:Karyogram" + +\end_inset + +Karyogram +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Quote +Hits from a search of piRNA like RNAs in the Human genome is displayed as + short horizontal bars. 
+\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +Uploading Results +\end_layout + +\begin_layout Subsection +How do I display my results in the UCSC Genome Browser? +\end_layout + +\begin_layout Standard +Results from the list of biotools below can be uploaded directly to a local + mirror of the UCSC Genome Browser using the biotool +\series bold +upload_to_ucsc +\series default +: +\end_layout + +\begin_layout Itemize +patscan_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-patscan" + +\end_inset + + +\end_layout + +\begin_layout Itemize +blat_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAT" + +\end_inset + + +\end_layout + +\begin_layout Itemize +blast_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-BLAST" + +\end_inset + + +\end_layout + +\begin_layout Itemize +vmatch_seq +\begin_inset LatexCommand eqref +reference "sub:How-to-use-Vmatch" + +\end_inset + + +\end_layout + +\begin_layout Standard +The syntax for uploading data the most simple way requires two mandatory + switches: +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +database, which is the UCSC database name (such as hg18, mm9, etc.) and +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +table which should be the users initials followed by an underscore and a + short description of the data: +\end_layout + +\begin_layout LyX-Code +... 
+ | upload_to_ucsc --database=hg18 --table=mah_snoRNAs +\end_layout + +\begin_layout Standard +The +\series bold +upload_to_ucsc +\series default + biotool modifies the users ~/ucsc/my_tracks.ra file automagically (a backup + is created with the name ~/ucsc/my_tracks.ra~) with default values that + can be overridden using the following switches: +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +short_label - Short label for track - Default=database->table +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +long_label - Long label for track - Default=database->table +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +group - Track group name - Default= +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +priority - Track display priority - Default=1 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +color - Track color - Default=147,73,42 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +chunk_size - Chunks for loading - Default=10000000 +\end_layout + +\begin_layout Itemize +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +visibility - Track visibility - Default=pack +\end_layout + +\begin_layout Standard +Also, data in BED or PSL format can be uploaded with +\series bold +upload_to_ucsc +\series default + as long as these reference to genomes and chromosomes existing in the UCSC + Genome Browser: +\end_layout + 
+\begin_layout LyX-Code +read_bed --data_in= | upload_to_ucsc ... +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +read_psl --data_in= | upload_to_ucsc ... +\end_layout + +\begin_layout Section +Power Scripting +\end_layout + +\begin_layout Standard +It is possible to do commandline scripting of biotool records using Perl. + Because a biotool record essentially is a hash structure, you can pass + records to +\series bold +bioscript +\series default + command, which is a wrapper around the Perl executable that allows direct + manipulations of the records using the power of Perl. +\end_layout + +\begin_layout Standard +In the below example we replace in all records the value to the CHR key + with a forthrunning number: +\end_layout + +\begin_layout LyX-Code +... + | bioscript 'while($r=get_record( +\backslash +*STDIN)){$r->{CHR}=$i++; put_record($r)}' +\end_layout + +\begin_layout Standard +Something more useful would probably be to create custom FASTA headers. + E.g. + if we read in a BED file, lookup the genomic sequence, create a custom + FASTA header with +\series bold +bioscript +\series default + and output FASTA entries: +\end_layout + +\begin_layout LyX-Code +... 
+ | bioscript 'while($r=get_record( +\backslash +*STDIN)){$r->{SEQ_NAME}= // +\end_layout + +\begin_layout LyX-Code +join("_",$r->{CHR},$r->{CHR_BEG},$r->{CHR_END}); put_record($r)}' +\end_layout + +\begin_layout Standard +And the output: +\end_layout + +\begin_layout LyX-Code +>chr2L_21567527_21567550 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_693380_693403 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_13859534_13859557 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_9005090_9005113 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_2106825_2106848 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout LyX-Code +>chr2L_14649031_14649054 +\end_layout + +\begin_layout LyX-Code +taccaaacggatgcctcagacatc +\end_layout + +\begin_layout Section +Trouble shooting +\end_layout + +\begin_layout Standard +Shoot the messenger! 
+\end_layout + +\begin_layout Section +\start_of_appendix +Keys +\begin_inset LatexCommand label +name "sec:Keys" + +\end_inset + + +\end_layout + +\begin_layout Standard +HIT +\end_layout + +\begin_layout Standard +HIT_BEG +\end_layout + +\begin_layout Standard +HIT_END +\end_layout + +\begin_layout Standard +HIT_LEN +\end_layout + +\begin_layout Standard +HIT_NAME +\end_layout + +\begin_layout Standard +PATTERN +\end_layout + +\begin_layout Section +Switches +\begin_inset LatexCommand label +name "sec:Switches" + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_in +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +stream_out +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +no_stream +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +data_in +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +result_out +\end_layout + +\begin_layout Standard +\begin_inset ERT +status collapsed + +\begin_layout Standard + +- +\backslash +/- +\end_layout + +\end_inset + +num +\end_layout + +\begin_layout Section +scan_for_matches README +\begin_inset LatexCommand label +name "sec:scan_for_matches-README" + +\end_inset + + +\end_layout + +\begin_layout LyX-Code + scan_for_matches: +\end_layout + +\begin_layout LyX-Code + A Program to Scan Nucleotide or Protein Sequences for Matching Patterns +\end_layout + +\begin_layout LyX-Code + Ross Overbeek +\end_layout + +\begin_layout LyX-Code + MCS +\end_layout + +\begin_layout LyX-Code + Argonne National Laboratory 
+\end_layout + +\begin_layout LyX-Code + Argonne, IL 60439 +\end_layout + +\begin_layout LyX-Code + USA +\end_layout + +\begin_layout LyX-Code +Scan_for_matches is a utility that we have written to search for +\end_layout + +\begin_layout LyX-Code +patterns in DNA and protein sequences. + I wrote most of the code, +\end_layout + +\begin_layout LyX-Code +although David Joerg and Morgan Price wrote sections of an +\end_layout + +\begin_layout LyX-Code +earlier version. + The whole notion of pattern matching has a rich +\end_layout + +\begin_layout LyX-Code +history, and we borrowed liberally from many sources. + However, it is +\end_layout + +\begin_layout LyX-Code +worth noting that we were strongly influenced by the elegant tools +\end_layout + +\begin_layout LyX-Code +developed and distributed by David Searls. + My intent is to make the +\end_layout + +\begin_layout LyX-Code +existing tool available to anyone in the research community that might +\end_layout + +\begin_layout LyX-Code +find it useful. + I will continue to try to fix bugs and make suggested +\end_layout + +\begin_layout LyX-Code +enhancements, at least until I feel that a superior tool exists. +\end_layout + +\begin_layout LyX-Code +Hence, I would appreciate it if all bug reports and suggestions are +\end_layout + +\begin_layout LyX-Code +directed to me at Overbeek@mcs.anl.gov. + +\end_layout + +\begin_layout LyX-Code +I will try to log all bug fixes and report them to users that send me +\end_layout + +\begin_layout LyX-Code +their email addresses. + I do not require that you give me your name +\end_layout + +\begin_layout LyX-Code +and address. + However, if you do give it to me, I will try to notify +\end_layout + +\begin_layout LyX-Code +you of serious problems as they are discovered. 
+\end_layout + +\begin_layout LyX-Code +Getting Started: +\end_layout + +\begin_layout LyX-Code + The distribution should contain at least the following programs: +\end_layout + +\begin_layout LyX-Code + README - This document +\end_layout + +\begin_layout LyX-Code + ggpunit.c - One of the two source files +\end_layout + +\begin_layout LyX-Code + scan_for_matches.c - The second source file +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + run_tests - A perl script to test things +\end_layout + +\begin_layout LyX-Code + show_hits - A handy perl script +\end_layout + +\begin_layout LyX-Code + test_dna_input - Test sequences for DNA +\end_layout + +\begin_layout LyX-Code + test_dna_patterns - Test patterns for DNA scan +\end_layout + +\begin_layout LyX-Code + test_output - Desired output from test +\end_layout + +\begin_layout LyX-Code + test_prot_input - Test protein sequences +\end_layout + +\begin_layout LyX-Code + test_prot_patterns - Test patterns for proteins +\end_layout + +\begin_layout LyX-Code + testit - a perl script used for test +\end_layout + +\begin_layout LyX-Code + Only the first three files are required. + The others are useful, +\end_layout + +\begin_layout LyX-Code + but only if you have Perl installed on your system. + If you do +\end_layout + +\begin_layout LyX-Code + have Perl, I suggest that you type +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + which perl +\end_layout + +\begin_layout LyX-Code + to find out where it installed. + On my system, I get the following +\end_layout + +\begin_layout LyX-Code + response: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + clone% which perl +\end_layout + +\begin_layout LyX-Code + /usr/local/bin/perl +\end_layout + +\begin_layout LyX-Code + indicating that Perl is installed in /usr/local/bin. 
+ Anyway, once +\end_layout + +\begin_layout LyX-Code + you know where it is installed, edit the first line of files +\end_layout + +\begin_layout LyX-Code + testit +\end_layout + +\begin_layout LyX-Code + show_hits +\end_layout + +\begin_layout LyX-Code + replacing /usr/local/bin/perl with the appropriate location. + I +\end_layout + +\begin_layout LyX-Code + will assume that you can do this, although it is not critical (it +\end_layout + +\begin_layout LyX-Code + is needed only to test the installation and to use the "show_hits" +\end_layout + +\begin_layout LyX-Code + utility). + Perl is not required to actually install and run +\end_layout + +\begin_layout LyX-Code + scan_for_matches. + +\end_layout + +\begin_layout LyX-Code + If you do not have Perl, I suggest you get it and install it (it +\end_layout + +\begin_layout LyX-Code + is a wonderful utility). + Information about Perl and how to get it +\end_layout + +\begin_layout LyX-Code + can be found in the book "Programming Perl" by Larry Wall and +\end_layout + +\begin_layout LyX-Code + Randall L. + Schwartz, published by O'Reilly & Associates, Inc. +\end_layout + +\begin_layout LyX-Code + To get started, you will need to compile the program. + I do this +\end_layout + +\begin_layout LyX-Code + using +\end_layout + +\begin_layout LyX-Code + gcc -O -o scan_for_matches ggpunit.c scan_for_matches.c +\end_layout + +\begin_layout LyX-Code + If you do not use GNU C, use +\end_layout + +\begin_layout LyX-Code + cc -O -DCC -o scan_for_matches ggpunit.c scan_for_matches.c +\end_layout + +\begin_layout LyX-Code + which works on my Sun. 
+ +\end_layout + +\begin_layout LyX-Code + Once you have compiled scan_for_matches, you can verify that it +\end_layout + +\begin_layout LyX-Code + works with +\end_layout + +\begin_layout LyX-Code + clone% run_tests tmp +\end_layout + +\begin_layout LyX-Code + clone% diff tmp test_output +\end_layout + +\begin_layout LyX-Code + You may get a few strange lines of the sort +\end_layout + +\begin_layout LyX-Code + clone% run_tests tmp +\end_layout + +\begin_layout LyX-Code + rm: tmp: No such file or directory +\end_layout + +\begin_layout LyX-Code + clone% diff tmp test_output +\end_layout + +\begin_layout LyX-Code + These should cause no concern. + However, if the "diff" shows that +\end_layout + +\begin_layout LyX-Code + tmp and test_output are different, contact me (you have a +\end_layout + +\begin_layout LyX-Code + problem). + +\end_layout + +\begin_layout LyX-Code + You should now be able to use scan_for_matches by following the +\end_layout + +\begin_layout LyX-Code + instructions given below (which is all the normal user should have +\end_layout + +\begin_layout LyX-Code + to understand, once things are installed properly). +\end_layout + +\begin_layout LyX-Code + ============================================================== +\end_layout + +\begin_layout LyX-Code +How to run scan_for_matches: +\end_layout + +\begin_layout LyX-Code + To run the program, you need to create two files +\end_layout + +\begin_layout LyX-Code + 1. + the first file contains the pattern you wish to scan for; I'll +\end_layout + +\begin_layout LyX-Code + call this file pat_file in what follows (but any name is ok) +\end_layout + +\begin_layout LyX-Code + 2. + the second file contains a set of sequences to scan. + These +\end_layout + +\begin_layout LyX-Code + should be in "fasta format". + Just look at the contents of +\end_layout + +\begin_layout LyX-Code + test_dna_input to see examples of this format. 
+ Basically, +\end_layout + +\begin_layout LyX-Code + each sequence begins with a line of the form +\end_layout + +\begin_layout LyX-Code + >sequence_id +\end_layout + +\begin_layout LyX-Code + and is followed by one or more lines containing the sequence. +\end_layout + +\begin_layout LyX-Code + Once these files have been created, you just use +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file < input_file +\end_layout + +\begin_layout LyX-Code + to scan all of the input sequences for the given pattern. + As an +\end_layout + +\begin_layout LyX-Code + example, suppose that pat_file contains a single line of the form +\end_layout + +\begin_layout LyX-Code + p1=4...7 3...8 ~p1 +\end_layout + +\begin_layout LyX-Code + Then, +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + should produce two "hits". + When I run this on my machine, I get +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + >tst1:[6,27] +\end_layout + +\begin_layout LyX-Code + cguaacc ggttaacc gguuacg +\end_layout + +\begin_layout LyX-Code + >tst2:[6,27] +\end_layout + +\begin_layout LyX-Code + CGUAACC GGTTAACC GGUUACG +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code +Simple Patterns Built by Matching Ranges and Reverse Complements +\end_layout + +\begin_layout LyX-Code + Let me first explain this simple pattern: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + p1=4...7 3...8 ~p1 +\end_layout + +\begin_layout LyX-Code + The pattern consists of three "pattern units" separated by spaces. +\end_layout + +\begin_layout LyX-Code + The first pattern unit is +\end_layout + +\begin_layout LyX-Code + p1=4...7 +\end_layout + +\begin_layout LyX-Code + which means "match 4 to 7 characters and call them p1". 
+ The +\end_layout + +\begin_layout LyX-Code + second pattern unit is +\end_layout + +\begin_layout LyX-Code + 3...8 +\end_layout + +\begin_layout LyX-Code + which means "then match 3 to 8 characters". + The last pattern unit +\end_layout + +\begin_layout LyX-Code + is +\end_layout + +\begin_layout LyX-Code + ~p1 +\end_layout + +\begin_layout LyX-Code + which means "match the reverse complement of p1". + The first +\end_layout + +\begin_layout LyX-Code + reported hit is shown as +\end_layout + +\begin_layout LyX-Code + >tst1:[6,27] +\end_layout + +\begin_layout LyX-Code + cguaacc ggttaacc gguuacg +\end_layout + +\begin_layout LyX-Code + which states that characters 6 through 27 of sequence tst1 were +\end_layout + +\begin_layout LyX-Code + matched. + "cguaacc" matched the first pattern unit, "ggttaacc" the +\end_layout + +\begin_layout LyX-Code + second, and "gguuacg" the third. + This is an example of a common +\end_layout + +\begin_layout LyX-Code + type of pattern used to search for sections of DNA or RNA that +\end_layout + +\begin_layout LyX-Code + would fold into a hairpin loop. +\end_layout + +\begin_layout LyX-Code +Searching Both Strands +\end_layout + +\begin_layout LyX-Code + Now for a short aside: scan_for_matches only searched the +\end_layout + +\begin_layout LyX-Code + sequences in the input file; it did not search the opposite +\end_layout + +\begin_layout LyX-Code + strand. + With a pattern of the sort we just used, there is no +\end_layout + +\begin_layout LyX-Code + need to search the opposite strand. + However, it is normally the +\end_layout + +\begin_layout LyX-Code + case that you will wish to search both the sequence and the +\end_layout + +\begin_layout LyX-Code + opposite strand (i.e., the reverse complement of the sequence). +\end_layout + +\begin_layout LyX-Code + To do that, you would just use the "-c" command line option. 
+ For example, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + Hits on the opposite strand will show a beginning location greater +\end_layout + +\begin_layout LyX-Code + than the end location of the match. +\end_layout + +\begin_layout LyX-Code +Defining Pairing Rules and Allowing Mismatches, Insertions, and Deletions +\end_layout + +\begin_layout LyX-Code + Let us stop now and ask "What additional features would one need to +\end_layout + +\begin_layout LyX-Code + really find the kinds of loop structures that characterize tRNAs, +\end_layout + +\begin_layout LyX-Code + rRNAs, and so forth?" I can immediately think of two: +\end_layout + +\begin_layout LyX-Code + a) you will need to be able to allow non-standard pairings +\end_layout + +\begin_layout LyX-Code + (those other than G-C and A-U), and +\end_layout + +\begin_layout LyX-Code + b) you will need to be able to tolerate some number of +\end_layout + +\begin_layout LyX-Code + mismatches and bulges. +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + Let me first show you how to handle non-standard "rules for +\end_layout + +\begin_layout LyX-Code + pairing in reverse complements". 
+ Consider the following pattern, +\end_layout + +\begin_layout LyX-Code + which I show as two lines (you may use as many lines as you like in +\end_layout + +\begin_layout LyX-Code + forming a pattern, although you can only break a pattern at points +\end_layout + +\begin_layout LyX-Code + where space would be legal): +\end_layout + +\begin_layout LyX-Code + r1={au,ua,gc,cg,gu,ug,ga,ag} +\end_layout + +\begin_layout LyX-Code + p1=2...3 0...4 p2=2...5 1...5 r1~p2 0...4 ~p1 +\end_layout + +\begin_layout LyX-Code + The first "pattern unit" does not actually match anything; rather, +\end_layout + +\begin_layout LyX-Code + it defines a "pairing rule" in which standard pairings are +\end_layout + +\begin_layout LyX-Code + allowed, as well as G-A and A-G (in case you wondered, Us and Ts +\end_layout + +\begin_layout LyX-Code + and upper and lower case can be used interchangeably; for example +\end_layout + +\begin_layout LyX-Code + r1={AT,UA,gc,cg} could be used to define the "standard rule" for +\end_layout + +\begin_layout LyX-Code + pairings). + The second line consists of seven pattern units which +\end_layout + +\begin_layout LyX-Code + may be interpreted as follows: +\end_layout + +\begin_layout LyX-Code + p1=2...3 match 2 or 3 characters (call it p1) +\end_layout + +\begin_layout LyX-Code + 0...4 match 0 to 4 characters +\end_layout + +\begin_layout LyX-Code + p2=2...5 match 2 to 5 characters (call it p2) +\end_layout + +\begin_layout LyX-Code + 1...5 match 1 to 5 characters +\end_layout + +\begin_layout LyX-Code + r1~p2 match the reverse complement of p2, +\end_layout + +\begin_layout LyX-Code + allowing G-A and A-G pairs +\end_layout + +\begin_layout LyX-Code + 0...4 match 0 to 4 characters +\end_layout + +\begin_layout LyX-Code + ~p1 match the reverse complement of p1 +\end_layout + +\begin_layout LyX-Code + allowing only G-C, C-G, A-T, and T-A pairs +\end_layout + +\begin_layout LyX-Code + Thus, r1~p2 means "match the reverse complement of p2 using rule r1". 
+\end_layout + +\begin_layout LyX-Code + Now let us consider the issue of tolerating mismatches and bulges. +\end_layout + +\begin_layout LyX-Code + You may add a "qualifier" to the pattern unit that gives the +\end_layout + +\begin_layout LyX-Code + tolerable number of "mismatches, deletions, and insertions". +\end_layout + +\begin_layout LyX-Code + Thus, +\end_layout + +\begin_layout LyX-Code + p1=10...10 3...8 ~p1[1,2,1] +\end_layout + +\begin_layout LyX-Code + means that the third pattern unit must match 10 characters, +\end_layout + +\begin_layout LyX-Code + allowing one "mismatch" (a pairing other than G-C, C-G, A-T, or +\end_layout + +\begin_layout LyX-Code + T-A), two deletions (a deletion is a character that occurs in p1, +\end_layout + +\begin_layout LyX-Code + but has been "deleted" from the string matched by ~p1), and one +\end_layout + +\begin_layout LyX-Code + insertion (an "insertion" is a character that occurs in the string +\end_layout + +\begin_layout LyX-Code + matched by ~p1, but for which no corresponding character +\end_layout + +\begin_layout LyX-Code + occurs in p1). + In this case, the pattern would match +\end_layout + +\begin_layout LyX-Code + ACGTACGTAC GGGGGGGG GCGTTACCT +\end_layout + +\begin_layout LyX-Code + which is, you must admit, a fairly weak loop. + It is common to +\end_layout + +\begin_layout LyX-Code + allow mismatches, but you will find yourself using insertions and +\end_layout + +\begin_layout LyX-Code + deletions much more rarely. + In any event, you should note that +\end_layout + +\begin_layout LyX-Code + allowing mismatches, insertions, and deletions does force the +\end_layout + +\begin_layout LyX-Code + program to try many additional possible pairings, so it does slow +\end_layout + +\begin_layout LyX-Code + things down a bit. 
+\end_layout + +\begin_layout LyX-Code +How Patterns Are Matched +\end_layout + +\begin_layout LyX-Code + Now is as good a time as any to discuss the basic flow of control +\end_layout + +\begin_layout LyX-Code + when matching patterns. + Recall that a "pattern" is a sequence of +\end_layout + +\begin_layout LyX-Code + "pattern units". + Suppose that the pattern units were +\end_layout + +\begin_layout LyX-Code + u1 u2 u3 u4 ... + un +\end_layout + +\begin_layout LyX-Code + The scan of a sequence S begins by setting the current position +\end_layout + +\begin_layout LyX-Code + to 1. + Then, an attempt is made to match u1 starting at the +\end_layout + +\begin_layout LyX-Code + current position. + Each attempt to match a pattern unit can +\end_layout + +\begin_layout LyX-Code + succeed or fail. + If it succeeds, then an attempt is made to match +\end_layout + +\begin_layout LyX-Code + the next unit. + If it fails, then an attempt is made to find an +\end_layout + +\begin_layout LyX-Code + alternative match for the immediately preceding pattern unit. + If +\end_layout + +\begin_layout LyX-Code + this succeeds, then we proceed forward again to the next unit. + If +\end_layout + +\begin_layout LyX-Code + it fails we go back to the preceding unit. + This process is called +\end_layout + +\begin_layout LyX-Code + "backtracking". + If there are no previous units, then the current +\end_layout + +\begin_layout LyX-Code + position is incremented by one, and everything starts again. + This +\end_layout + +\begin_layout LyX-Code + proceeds until either the current position goes past the end of +\end_layout + +\begin_layout LyX-Code + the sequence or all of the pattern units succeed. + On success, +\end_layout + +\begin_layout LyX-Code + scan_for_matches reports the "hit", the current position is set +\end_layout + +\begin_layout LyX-Code + just past the hit, and an attempt is made to find another hit. 
+\end_layout + +\begin_layout LyX-Code + If you wish to limit the scan to simply finding a maximum of, say, +\end_layout + +\begin_layout LyX-Code + 10 hits, you can use the -n option (-n 10 would set the limit to +\end_layout + +\begin_layout LyX-Code + 10 reported hits). + For example, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c -n 1 pat_file < test_dna_input +\end_layout + +\begin_layout LyX-Code + would search for just the first hit (and would stop searching the +\end_layout + +\begin_layout LyX-Code + current sequences or any that follow in the input file). +\end_layout + +\begin_layout LyX-Code +Searching for repeats: +\end_layout + +\begin_layout LyX-Code + In the last section, I discussed almost all of the details +\end_layout + +\begin_layout LyX-Code + required to allow you to look for repeats. + Consider the following +\end_layout + +\begin_layout LyX-Code + set of patterns: +\end_layout + +\begin_layout LyX-Code + p1=6...6 3...8 p1 (find exact 6 character repeat separated +\end_layout + +\begin_layout LyX-Code + by 3 to 8 characters) +\end_layout + +\begin_layout LyX-Code + p1=6...6 3...8 p1[1,0,0] (allow one mismatch) +\end_layout + +\begin_layout LyX-Code + p1=3...3 p1[1,0,0] p1[1,0,0] p1[1,0,0] +\end_layout + +\begin_layout LyX-Code + (match 12 characters that are the remains +\end_layout + +\begin_layout LyX-Code + of a 3-character sequence occurring 4 times) +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + p1=4...8 0...3 p2=6...8 p1 0...3 p2 +\end_layout + +\begin_layout LyX-Code + (This would match things like +\end_layout + +\begin_layout LyX-Code + ATCT G TCTTT ATCT TG TCTTT +\end_layout + +\begin_layout LyX-Code + ) +\end_layout + +\begin_layout LyX-Code +Searching for particular sequences: +\end_layout + +\begin_layout LyX-Code + Occasionally, one wishes to match a specific, known sequence. 
+\end_layout + +\begin_layout LyX-Code + In such a case, you can just give the sequence (along with an +\end_layout + +\begin_layout LyX-Code + optional statement of the allowable mismatches, insertions, and +\end_layout + +\begin_layout LyX-Code + deletions). + Thus, +\end_layout + +\begin_layout LyX-Code + p1=6...8 GAGA ~p1 (match a hairpin with GAGA as the loop) +\end_layout + +\begin_layout LyX-Code + RRRRYYYY (match 4 purines followed by 4 pyrimidines) +\end_layout + +\begin_layout LyX-Code + TATAA[1,0,0] (match TATAA, allowing 1 mismatch) +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +Matches against a "weight matrix": +\end_layout + +\begin_layout LyX-Code + I will conclude my examples of the types of pattern units +\end_layout + +\begin_layout LyX-Code + available for matching against nucleotide sequences by discussing a +\end_layout + +\begin_layout LyX-Code + crude implementation of matching using a "weight matrix". + While I +\end_layout + +\begin_layout LyX-Code + am less than overwhelmed with the syntax that I chose, I think that +\end_layout + +\begin_layout LyX-Code + the reader should be aware that I was thinking of generating +\end_layout + +\begin_layout LyX-Code + patterns containing such pattern units automatically from +\end_layout + +\begin_layout LyX-Code + alignments (and did not really plan on typing such things in by +\end_layout + +\begin_layout LyX-Code + hand very often). + Anyway, suppose that you wanted to match a +\end_layout + +\begin_layout LyX-Code + sequence of eight characters. + The "consensus" of these eight +\end_layout + +\begin_layout LyX-Code + characters is GRCACCGS, but the actual "frequencies of occurrence" +\end_layout + +\begin_layout LyX-Code + are given in the matrix below. + Thus, the first character is an A +\end_layout + +\begin_layout LyX-Code + 16% of the time and a G 84% of the time. 
+ The second is an A 57% of +\end_layout + +\begin_layout LyX-Code + the time, a C 10% of the time, a G 29% of the time, and a T 4% of +\end_layout + +\begin_layout LyX-Code + the time. + +\end_layout + +\begin_layout LyX-Code + C1 C2 C3 C4 C5 C6 C7 C8 +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + A 16 57 0 95 0 18 0 0 +\end_layout + +\begin_layout LyX-Code + C 0 10 80 0 100 60 0 50 +\end_layout + +\begin_layout LyX-Code + G 84 29 0 0 0 20 100 50 +\end_layout + +\begin_layout LyX-Code + T 0 4 20 5 0 2 0 0 +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + One could use the following pattern unit to search for inexact +\end_layout + +\begin_layout LyX-Code + matches related to such a "weight matrix": +\end_layout + +\begin_layout LyX-Code + {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), +\end_layout + +\begin_layout LyX-Code + (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450 +\end_layout + +\begin_layout LyX-Code + This pattern unit will attempt to match exactly eight characters. +\end_layout + +\begin_layout LyX-Code + For each character in the sequence, the entry in the corresponding +\end_layout + +\begin_layout LyX-Code + tuple is added to an accumulated sum. + If the sum is greater than +\end_layout + +\begin_layout LyX-Code + 450, the match succeeds; else it fails. +\end_layout + +\begin_layout LyX-Code + Recently, this feature was upgraded to allow ranges. + Thus, +\end_layout + +\begin_layout LyX-Code + 600 > {(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), +\end_layout + +\begin_layout LyX-Code + (0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)} > 450 +\end_layout + +\begin_layout LyX-Code + will work, as well. +\end_layout + +\begin_layout LyX-Code +Allowing Alternatives: +\end_layout + +\begin_layout LyX-Code + Very occasionally, you may wish to allow alternative pattern units +\end_layout + +\begin_layout LyX-Code + (i.e., "match either A or B"). 
+ You can do this using something +\end_layout + +\begin_layout LyX-Code + like +\end_layout + +\begin_layout LyX-Code + ( GAGA | GCGCA) +\end_layout + +\begin_layout LyX-Code + which says "match either GAGA or GCGCA". + You may take +\end_layout + +\begin_layout LyX-Code + alternatives of a list of pattern units, for example +\end_layout + +\begin_layout LyX-Code + (p1=3...3 3...8 ~p1 | p1=5...5 4...4 ~p1 GGG) +\end_layout + +\begin_layout LyX-Code + would match one of two sequences of pattern units. + There is one +\end_layout + +\begin_layout LyX-Code + clumsy aspect of the syntax: to match a list of alternatives, you +\end_layout + +\begin_layout LyX-Code + need to fully parenthesize the request. + Thus, +\end_layout + +\begin_layout LyX-Code + (GAGA | (GCGCA | TTCGA)) +\end_layout + +\begin_layout LyX-Code + would be needed to try the three alternatives. +\end_layout + +\begin_layout LyX-Code +One Minor Extension +\end_layout + +\begin_layout LyX-Code + Sometimes a pattern will contain a sequence of distinct ranges, +\end_layout + +\begin_layout LyX-Code + and you might wish to limit the sum of the lengths of the matched +\end_layout + +\begin_layout LyX-Code + subsequences. + For example, suppose that you basically wanted to +\end_layout + +\begin_layout LyX-Code + match something like +\end_layout + +\begin_layout LyX-Code + ARRYYTT p1=0...5 GCA[1,0,0] p2=1...6 ~p1 4...8 ~p2 p3=4...10 CCT +\end_layout + +\begin_layout LyX-Code + but that the sum of the lengths of p1, p2, and p3 must not exceed +\end_layout + +\begin_layout LyX-Code + eight characters. + To do this, you could add +\end_layout + +\begin_layout LyX-Code + length(p1+p2+p3) < 9 +\end_layout + +\begin_layout LyX-Code + as the last pattern unit. + It will just succeed or fail (but does +\end_layout + +\begin_layout LyX-Code + not actually match any characters in the sequence). 
+\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code +Matching Protein Sequences +\end_layout + +\begin_layout LyX-Code + Suppose that the input file contains protein sequences. + In this +\end_layout + +\begin_layout LyX-Code + case, you must invoke scan_for_matches with the "-p" option. + You +\end_layout + +\begin_layout LyX-Code + cannot use aspects of the language that relate directly to +\end_layout + +\begin_layout LyX-Code + nucleotide sequences (e.g., the -c command line option or pattern +\end_layout + +\begin_layout LyX-Code + constructs referring to the reverse complement of a previously +\end_layout + +\begin_layout LyX-Code + matched unit). + +\end_layout + +\begin_layout LyX-Code + You also have two additional constructs that allow you to match +\end_layout + +\begin_layout LyX-Code + either "one of a set of amino acids" or "any amino acid other than +\end_layout + +\begin_layout LyX-Code + those in a given set". + For example, +\end_layout + +\begin_layout LyX-Code + p1=0...4 any(HQD) 1...3 notany(HK) p1 +\end_layout + +\begin_layout LyX-Code + would successfully match a string like +\end_layout + +\begin_layout LyX-Code + YWV D AA C YWV +\end_layout + +\begin_layout LyX-Code +Using the show_hits Utility +\end_layout + +\begin_layout LyX-Code + When viewing a large set of complex matches, you might find it +\end_layout + +\begin_layout LyX-Code + convenient to post-process the scan_for_matches output to get a +\end_layout + +\begin_layout LyX-Code + more readable version. + We provide a simple post-processor called +\end_layout + +\begin_layout LyX-Code + "show_hits". 
+ To see its effect, just pipe the output of a +\end_layout + +\begin_layout LyX-Code + scan_for_matches into show_hits: +\end_layout + +\begin_layout LyX-Code + Normal Output: +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp +\end_layout + +\begin_layout LyX-Code + >tst1:[1,28] +\end_layout + +\begin_layout LyX-Code + gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + >tst1:[28,1] +\end_layout + +\begin_layout LyX-Code + gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + >tst2:[2,31] +\end_layout + +\begin_layout LyX-Code + CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + >tst2:[31,2] +\end_layout + +\begin_layout LyX-Code + CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + >tst3:[3,32] +\end_layout + +\begin_layout LyX-Code + gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + >tst3:[32,3] +\end_layout + +\begin_layout LyX-Code + gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + Piped Through show_hits: +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp | show_hits +\end_layout + +\begin_layout LyX-Code + tst1:[1,28]: gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[28,1]: gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst2:[2,31]: CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + tst2:[31,2]: CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + tst3:[3,32]: gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst3:[32,3]: gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code + Optionally, you can specify which of the "fields" in the matches +\end_layout + +\begin_layout LyX-Code + you wish to sort on, and show_hits will sort them. 
+ The field +\end_layout + +\begin_layout LyX-Code + numbers start with 0. + So, you might get something like +\end_layout + +\begin_layout LyX-Code + clone% scan_for_matches -c pat_file < tmp | show_hits 2 1 +\end_layout + +\begin_layout LyX-Code + tst2:[2,31]: CGTACGUAAC C GGTTAACC GGUUACGTACG +\end_layout + +\begin_layout LyX-Code + tst2:[31,2]: CGTACGTAAC C GGTTAACC GGTTACGTACG +\end_layout + +\begin_layout LyX-Code + tst3:[32,3]: gtacgtaacc g aagttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[1,28]: gtacguaacc ggttaac cgguuacgtac +\end_layout + +\begin_layout LyX-Code + tst1:[28,1]: gtacgtaacc ggttaac cggttacgtac +\end_layout + +\begin_layout LyX-Code + tst3:[3,32]: gtacguaacc g gttaactt cgguuacgtac +\end_layout + +\begin_layout LyX-Code + clone% +\end_layout + +\begin_layout LyX-Code + In this case, the hits have been sorted on fields 2 and 1 (that is, +\end_layout + +\begin_layout LyX-Code + the third and second matched subfields). +\end_layout + +\begin_layout LyX-Code + show_hits is just one possible little post-processor, and you +\end_layout + +\begin_layout LyX-Code + might well wish to write a customized one for yourself. +\end_layout + +\begin_layout LyX-Code +Reducing the Cost of a Search +\end_layout + +\begin_layout LyX-Code + The scan_for_matches utility uses a fairly simple search, and may +\end_layout + +\begin_layout LyX-Code + consume large amounts of CPU time for complex patterns. + Someday, +\end_layout + +\begin_layout LyX-Code + I may decide to optimize the code. + However, until then, let me +\end_layout + +\begin_layout LyX-Code + mention one useful technique. + +\end_layout + +\begin_layout LyX-Code + When you have a complex pattern that includes a number of varying +\end_layout + +\begin_layout LyX-Code + ranges, imprecise matches, and so forth, it is useful to +\end_layout + +\begin_layout LyX-Code + "pipeline" matches. 
+ That is, form a simpler pattern that can be +\end_layout + +\begin_layout LyX-Code + used to scan through a large database extracting sections that +\end_layout + +\begin_layout LyX-Code + might be matched by the more complex pattern. + Let me illustrate +\end_layout + +\begin_layout LyX-Code + with a short example. + Suppose that you really wished to match the +\end_layout + +\begin_layout LyX-Code + pattern +\end_layout + +\begin_layout LyX-Code + p1=3...5 0...8 ~p1[1,1,0] p2=6...7 3...6 AGC 3...5 RYGC ~p2[1,0,0] +\end_layout + +\begin_layout LyX-Code + In this case, the pattern units AGC 3...5 RYGC can be used to rapidly +\end_layout + +\begin_layout LyX-Code + constrain the overall search. + You can preprocess the overall +\end_layout + +\begin_layout LyX-Code + database using the pattern: +\end_layout + +\begin_layout LyX-Code + 31...31 AGC 3...5 RYGC 7...7 +\end_layout + +\begin_layout LyX-Code + Put the complex pattern in pat_file1 and the simpler pattern in +\end_layout + +\begin_layout LyX-Code + pat_file2. + Then use, +\end_layout + +\begin_layout LyX-Code + scan_for_matches -c pat_file2 < nucleotide_database | +\end_layout + +\begin_layout LyX-Code + scan_for_matches pat_file1 +\end_layout + +\begin_layout LyX-Code + The output will show things like +\end_layout + +\begin_layout LyX-Code + >seqid:[232,280][2,47] +\end_layout + +\begin_layout LyX-Code + matches pieces +\end_layout + +\begin_layout LyX-Code + Then, the actual section of the sequence that was matched can be +\end_layout + +\begin_layout LyX-Code + easily computed as [233,278] (remember, the positions start from +\end_layout + +\begin_layout LyX-Code + 1, not 0). 
+\end_layout + +\begin_layout LyX-Code + Let me finally add, you should do a few short experiments to see +\end_layout + +\begin_layout LyX-Code + whether or not such pipelining actually improves performance -- it +\end_layout + +\begin_layout LyX-Code + is not always obvious where the time is going, and I have +\end_layout + +\begin_layout LyX-Code + sometimes found that the added complexity of pipelining actually +\end_layout + +\begin_layout LyX-Code + slowed things up. + It gets its best improvements when there are +\end_layout + +\begin_layout LyX-Code + exact matches of more than just a few characters that can be +\end_layout + +\begin_layout LyX-Code + rapidly used to eliminate large sections of the database. +\end_layout + +\begin_layout LyX-Code +============= +\end_layout + +\begin_layout LyX-Code +Additions: +\end_layout + +\begin_layout LyX-Code +Feb 9, 1995: the pattern units ^ and $ now work as in normal regular +\end_layout + +\begin_layout LyX-Code + expressions. + That is +\end_layout + +\begin_layout LyX-Code + TTF $ +\end_layout + +\begin_layout LyX-Code + matches only TTF at the end of the string and +\end_layout + +\begin_layout LyX-Code + ^ TTF +\end_layout + +\begin_layout LyX-Code + matches only an initial TTF +\end_layout + +\begin_layout LyX-Code + The pattern unit +\end_layout + +\begin_layout LyX-Code + +\end{lyxcode} +cat is the Unix command that reads a file and output the result to +'stdout' --- which in this case is piped to any biotool represented +by the . It is also possible to read the data stream using +'<' to direct the 'stdout' stream into the biotool like this: + +\begin{lyxcode} +~<~ +\end{lyxcode} +However, that will not work if you pipe more biotools together. Then +it is much safer to read the stream from a file explicitly like this: + +\begin{lyxcode} +~-{}-stream\_in= +\end{lyxcode} +Here the filename is explicetly given to the biotool +with the switch -\/-stream\_in. This switch works with all biotools. 
+It is also possible to read in data from multiple sources by repeating +the explicit read step: + +\begin{lyxcode} +~-{}-stream\_in=~|~~-{}-stream\_in= +\end{lyxcode} + +\subsection{How to write the data stream to file?\label{sub:How-to-write-stream}} + +In order to save the output stream from a biotool to file, so you +can read in the stream again at a later time, you can do one of two +things: + +\begin{lyxcode} +~>~ +\end{lyxcode} +All the biotools write the data stream to 'stdout' by default which +can be written to a file by redirecting 'stdout' to file using '>' +, however, if one of the biotools for writing other formats is used +then both the biotools records as well as the result output will +go to 'stdout' in a mixture causing havoc! To avoid this you must +use the switch -\/-stream\_out that explicitly tells the biotool to +write the output stream to file: + +\begin{lyxcode} +~-{}-stream\_out= +\end{lyxcode} +The -\/-stream\_out switch works with all biotools. + + +\subsection{How to terminate the data stream?} + +The data stream never stops unless the user wants to save the stream +or supplies the -\/-no\_stream switch that will terminate the +stream: + +\begin{lyxcode} +~-{}-no\_stream +\end{lyxcode} +The -\/-no\_stream switch only works with those biotools where it +makes sense that the user might want to terminate the data stream, +\emph{i.e.} after an analysis step where the user wants to output +the result, but not the data stream. + + +\subsection{How to write my results to file?\label{sub:How-to-write-result}} + +Saving the result of an analysis to file can be done implicitly or +explicitly. The implicit way: + +\begin{lyxcode} +~-{}-no\_stream~>~ +\end{lyxcode} +If you use '>' to redirect 'stdout' to file then it is important to +use the -\/-no\_stream switch to avoid writing a mix of biotools +records and result to the same file causing havoc.
The safe way is +to use the -\/-result\_out switch which explicitly tells the biotool +to write the result to a given file: + +\begin{lyxcode} +~-{}-result\_out= +\end{lyxcode} +Using the above method will not terminate the stream, so it is possible +to pipe that into another biotool generating different results: + +\begin{lyxcode} +~-{}-result\_out=~|~~-{}-result\_out= +\end{lyxcode} +And still the data stream will continue unless terminated with -\/-no\_stream: + +\begin{lyxcode} +~-{}-result\_out=~-{}-no\_stream +\end{lyxcode} +Or written to file implicitly or explicitly \eqref{sub:How-to-write-result}. +The explicit way: + +\begin{lyxcode} +~-{}-result\_out=~-{}-stream\_out= +\end{lyxcode} + +\subsection{How to read data from multiple sources?} + +To read multiple data sources, with the same type or different type +of data do: + +\begin{lyxcode} +~-{}-data\_in=~|~~-{}-data\_in= +\end{lyxcode} +where type is the data type a specific biotool reads. + + +\section{Reading input} + + +\subsection{How to read biotools input?} + +See \eqref{sub:How-to-read-stream}. + + +\subsection{How to read in data?} + +Data in different formats can be read with the appropriate biotool +for that format. The biotools are typically named 'read\_' +such as \textbf{read\_fasta}, \textbf{read\_bed}, \textbf{read\_tab}, +etc., and all behave in a similar manner.
Data can be read by supplying +the -\/-data\_in switch and a file name to the file containing the +data: + +\begin{lyxcode} +~-{}-data\_in= +\end{lyxcode} +It is also possible to read in a saved biotools stream (see \ref{sub:How-to-read-stream}) +as well as reading data in one go: + +\begin{lyxcode} +~-{}-stream\_in=~-{}-data\_in= +\end{lyxcode} +If you want to read data from several files you can do this: + +\begin{lyxcode} +~-{}-data\_in=~|~~-{}-data\_in= +\end{lyxcode} +If you have several data files you can read in all explicitly with +a comma separated list: + +\begin{lyxcode} +~-{}-data\_in=file1,file2,file3 +\end{lyxcode} +And it is also possible to use file globbing: + +\begin{lyxcode} +~-{}-data\_in={*}.fna +\end{lyxcode} +Or in a combination: + +\begin{lyxcode} +~-{}-data\_in=file1,/dir/{*}.fna +\end{lyxcode} +Finally, it is possible to read in data in different formats using +the appropriate biotool for each format: + +\begin{lyxcode} +~-{}-data\_in=~|~~-{}-data\_in=~... +\end{lyxcode} + +\subsection{How to read FASTA input?} + +Sequences in FASTA format can be read explicitly using \textbf{read\_fasta}: + +\begin{lyxcode} +read\_fasta~-{}-data\_in= +\end{lyxcode} + +\subsection{How to read alignment input?} + +If your alignment is FASTA formatted then you can use \textbf{read\_align}. +It is also possible to use \textbf{read\_fasta} since the data is +FASTA formatted, however, with \textbf{read\_fasta} the key ALIGN +will be omitted. The ALIGN key is used to determine which sequences +belong to what alignment which is required for \textbf{write\_align}. + +\begin{lyxcode} +read\_align~-{}-data\_in= +\end{lyxcode} + +\subsection{How to read tabular input?\label{sub:How-to-read-table}} + +Tabular input can be read with \textbf{read\_tab} which will read +in all rows and chosen columns (separated by a given delimiter) from +a table in text format.
+ +The table below: + +\noindent \begin{center} +\begin{tabular}{lll} +Human & ATACGTCAG & 23524\tabularnewline +Dog & AGCATGAC & 2442\tabularnewline +Mouse & GACTG & 234\tabularnewline +Cat & AAATGCA & 2342\tabularnewline +\end{tabular} +\par\end{center} + +Can be read using the command: + +\begin{lyxcode} +read\_tab~-{}-data\_in= +\end{lyxcode} +Which will result in four records, one for each row, where the keys +V0, V1, V2 are the default keys for the organism, sequence, and count, +respectively. It is possible to select a subset of columns to read +by using the -\/-cols switch which takes a comma separated list of +column numbers (first column is designated 0) as argument. So to +read in only the sequence and the count so that the count comes before +the sequence do: + +\begin{lyxcode} +read\_tab~-{}-data\_in=~-{}-cols=2,1 +\end{lyxcode} +It is also possible to name the columns with the -\/-keys switch: + +\begin{lyxcode} +read\_tab~-{}-data\_in=~-{}-cols=2,1~-{}-keys=COUNT,SEQ +\end{lyxcode} + +\subsection{How to read BED input?} + +The BED (Browser Extensible Data% +\footnote{\url{http://genome.ucsc.edu/FAQ/FAQformat}% +}) format is a tabular format for data pertaining to one of the Eukaryotic +genomes in the UCSC genome browser% +\footnote{\url{http://genome.ucsc.edu/}% +}. The BED format consists of up to 12 columns, where the first three +are mandatory CHR, CHR\_BEG, and CHR\_END. The mandatory columns and +any of the optional columns can all be read in easily with the \textbf{read\_bed} +biotool. + +\begin{lyxcode} +read\_bed~-{}-data\_in= +\end{lyxcode} +It is also possible to read the BED file with \textbf{read\_tab} (see~\ref{sub:How-to-read-table}), +however, that will be more cumbersome because you need to specify +the keys: + +\begin{lyxcode} +read\_tab~-{}-data\_in=~-{}-keys=CHR,CHR\_BEG,CHR\_END~...
+\end{lyxcode} + +\subsection{How to read PSL input?} + +The PSL format is the output from BLAT and contains 21 mandatory fields +that can be read with \textbf{read\_psl}: + +\begin{lyxcode} +read\_psl~-{}-data\_in= +\end{lyxcode} + +\section{Writing output} + +All result output can be written explicitly to file using the -\/-result\_out +switch which all result generating biotools have. It is also possible +to write the result to file implicitly by directing 'stdout' to file +using '>', however, that requires the -\/-no\_stream switch to prevent +a mixture of data stream and results in the file. The explicit (and +safe) way: + +\begin{lyxcode} +...~|~~-{}-result\_out= +\end{lyxcode} +The implicit way: + +\begin{lyxcode} +...~|~~-{}-no\_stream~>~ +\end{lyxcode} + +\subsection{How to write biotools output?} + +See \eqref{sub:How-to-write-stream}. + + +\subsection{How to write FASTA output?\label{sub:How-to-write-fasta}} + +FASTA output can be written with \textbf{write\_fasta}. + +\begin{lyxcode} +...~|~write\_fasta~-{}-result\_out= +\end{lyxcode} +It is also possible to wrap the sequences to a given width using the +-\/-wrap switch although wrapping of sequence is generally an evil +thing: + +\begin{lyxcode} +...~|~write\_fasta~-{}-no\_stream~-{}-wrap=80 +\end{lyxcode} + +\subsection{How to write alignment output?\label{sub:How-to-write-alignment}} + +Pretty alignments with ruler% +\footnote{'.' for every 10 residues, ':' for every 50, and '|' for every 100% +} and consensus sequence can be created with \textbf{write\_align}, +which also has the optional -\/-wrap switch to break the alignment +into blocks of a given width: + +\begin{lyxcode} +...~|~write\_align~-{}-result\_out=~-{}-wrap=80 +\end{lyxcode} +If the number of sequences in the alignment is 2 then a pairwise alignment +will be output otherwise a multiple alignment. And if the sequence +type, determined automagically, is protein, then residues and symbols +(+,~:,~.)
will be used to show consensus according to the Blosum62 +matrix. + + +\subsection{How to write tabular output?\label{sub:How-to-write-tab}} + +Outputting the data stream as a table can be done with \textbf{write\_tab}, +which will generate one row per record with the values as columns. +If you supply the optional -\/-comment switch, then the first row +in the table will be a 'comment' line prefixed with a '\#': + +\begin{lyxcode} +...~|~write\_tab~-{}-result\_out=~-{}-comment +\end{lyxcode} +You can also change the delimiter from the default (tab) to \emph{e.g.} +',': + +\begin{lyxcode} +...~|~write\_tab~-{}-result\_out=~-{}-delimit=',' +\end{lyxcode} +If you want the values output in a specific order you have to supply +a comma separated list using the -\/-keys switch that will print +only those keys in that order: + +\begin{lyxcode} +...~|~write\_tab~-{}-result\_out=~-{}-keys=SEQ\_NAME,COUNT +\end{lyxcode} +Alternatively, if you have some keys that you don't want in the tabular +output, use the -\/-no\_keys switch. So to print all keys except +SEQ and SEQ\_TYPE do: + +\begin{lyxcode} +...~|~write\_tab~-{}-result\_out=~-{}-no\_keys=SEQ,SEQ\_TYPE +\end{lyxcode} +Finally, if you have a stream containing a mix of different record +types, \emph{e.g.} records with sequences and records with matches, +then you can use \textbf{write\_tab} to output all the records in +tabular format, however, the -\/-comment, -\/-keys, and -\/-no\_keys +switches will only respond to records of the first type encountered. +The reason is that outputting mixed records is probably not what you +want anyway, and you should remove all the unwanted records from the +stream before outputting the table: \textbf{grab} is your friend (see~\ref{sub:How-to-grab}). + + +\subsection{How to write a BED output?\label{sub:How-to-write-BED}} + +Data in BED format can be output if the records contain the mandatory +keys CHR, CHR\_BEG, and CHR\_END using \textbf{write\_bed}.
If the +optional keys are also present, they will be output as well: + +\begin{lyxcode} +write\_bed~-{}-result\_out= +\end{lyxcode} + +\subsection{How to write PSL output?\label{sub:How-to-write-PSL}} + +Data in PSL format can be output using \textbf{write\_psl:} + +\begin{lyxcode} +write\_psl~-{}-result\_out= +\end{lyxcode} + +\section{Manipulating Records} + + +\subsection{How to select a few records?\label{sub:How-to-select-a-few-records}} + +To quickly get an overview of your data you can limit the data stream +to show a few records. This is also very useful to test the pipeline +with a few records if you are setting up a complex analysis using +several biotools. That way you can inspect that all goes well before +analyzing and waiting for the full data set. All of the read\_ +biotools have the -\/-num switch which will take a number as argument +and only that number of records will be read. So to read in the first +10 FASTA entries from a file: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=test.fna~-{}-num=10 +\end{lyxcode} +Another way of doing this is to use \textbf{head\_records}, which will limit +the stream to show the first 10 records (default): + +\begin{lyxcode} +...~|~head\_records +\end{lyxcode} +Using \textbf{head\_records} directly after one of the read\_ +biotools will be a lot slower than using the -\/-num switch with +the read\_ biotools, however, \textbf{head\_records} can also +be used to limit the output from all the other biotools. It is also +possible to give \textbf{head\_records} a number of records to show +using the -\/-num switch. So to display the first 100 records do: + +\begin{lyxcode} +...~|~head\_records~-{}-num=100 +\end{lyxcode} + +\subsection{How to count all records in the data stream?} + +To count all the records in the data stream use \textbf{count\_records}, +which adds one record (which is not included in the count) to the +data stream.
So to count the number of sequences in a FASTA file you +can do this: + +\begin{lyxcode} +cat~test.fna~|~read\_fasta~|~count\_records~-{}-no\_stream +\end{lyxcode} +Which will write the last record containing the count to 'stdout': + +\begin{lyxcode} +-{}-{}- + +count\_records:~630 +\end{lyxcode} +It is also possible to write the count to file using the -\/-result\_out +switch. + + +\subsection{How to grab specific records?\label{sub:How-to-grab}} + +The biotool \textbf{grab} is related to the Unix grep and locates +records based on matching keys and/or values using either a pattern, +a Perl regex, or a numerical evaluation. To easily \textbf{grab} all +records in the stream that have any mention of the pattern 'human' +just pipe the data stream through \textbf{grab} like this: + +\begin{lyxcode} +...~|~grab~-{}-pattern=human +\end{lyxcode} +This will search for the pattern 'human' in all keys and all values. +The -\/-pattern switch takes a comma separated list of patterns, +so in order to match multiple patterns do: + +\begin{lyxcode} +...~|~grab~-{}-pattern=human,mouse +\end{lyxcode} +It is also possible to use the -\/-pattern\_in switch instead of +-\/-pattern. -\/-pattern\_in is used to read a file with one pattern +per line: + +\begin{lyxcode} +...~|~grab~-{}-pattern\_in=patterns.txt +\end{lyxcode} +If you want the opposite result --- to find all records that do +not match the patterns, add the -\/-invert switch, which not only +works with the -\/-pattern switch, but also with -\/-regex and -\/-eval: + +\begin{lyxcode} +...~|~grab~-{}-pattern=human~-{}-invert +\end{lyxcode} +If you want to search the record keys only, \emph{e.g.} to find all +records containing the key SEQ you can add the -\/-keys\_only switch. +This will prevent matching of SEQ in any record value --- since in fact +SEQ is a not uncommon peptide sequence, you could otherwise get an unwanted record.
+Also, this will give an increase in speed since only the keys are +searched: + +\begin{lyxcode} +...~|~grab~-{}-pattern=SEQ~-{}-keys\_only +\end{lyxcode} +However, if you are interested in finding the peptide sequence SEQ +and not the SEQ key, just add the -\/-vals\_only switch instead: + +\begin{lyxcode} +...~|~grab~-{}-pattern=SEQ~-{}-vals\_only +\end{lyxcode} +Also, if you want to grab for certain key/value pairs you can supply +a comma separated list of keys whose values will then be searched using +the -\/-keys switch. This is handy if your records contain large +genomic sequences and you don't want to search the entire sequence +for \emph{e.g.} the organism name --- it is much faster to tell \textbf{grab} +which keys to search the value for: + +\begin{lyxcode} +...~|~grab~-{}-pattern=human~-{}-keys=SEQ\_NAME + + +\end{lyxcode} +It is also possible to invoke flexible matching using regex (regular +expressions) instead of simple pattern matching. In \textbf{grab} +the regex engine is Perl based and allows use of different types of +wild cards, alternatives, \emph{etc}% +\footnote{\url{http://perldoc.perl.org/perlreref.html}% +}. If you want to \textbf{grab} records with the sequence ATCG or +GCTA you can do this: + +\begin{lyxcode} +...~|~grab~-{}-regex='ATCG|GCTA' +\end{lyxcode} +Or if you want to find sequences beginning with ATCG: + +\begin{lyxcode} +...~|~grab~-{}-regex='\textasciicircum{}ATCG' +\end{lyxcode} +You can also use \textbf{grab} to locate records that fulfill a numerical +property using the -\/-eval switch which takes an expression in three +parts. The first part is the key that holds the number we want to +evaluate, the second part holds one of the six operators: + +\begin{enumerate} +\item Greater than: > +\item Greater than or equal to: >= +\item Less than: < +\item Less than or equal to: <= +\item Equal to: = +\item Not equal to: != +\end{enumerate} +And finally comes the number used in the evaluation.
So to \textbf{grab} +all records with a sequence length greater than 30: + +\begin{lyxcode} +...~length\_seq~|~grab~-{}-eval='SEQ\_LEN~>~30' +\end{lyxcode} +If you want to locate all records containing the pattern 'human' and +where the sequence length is greater than 30, you do this by running +the stream through \textbf{grab} twice: + +\begin{lyxcode} +...~|~grab~-{}-pattern='human'~|~length\_seq~|~grab~-{}-eval='SEQ\_LEN~>~30' +\end{lyxcode} +To get the best speed performance, use the most restrictive \textbf{grab} +first. + + +\subsection{How to remove keys from records?} + +To remove one or more specific keys from all records in the data stream +use \textbf{remove\_keys} like this: + +\begin{lyxcode} +...~|~remove\_keys~-{}-keys=SEQ,SEQ\_NAME +\end{lyxcode} +In the above example SEQ and SEQ\_NAME will be removed from all records +if they exist in these. If all keys are removed from a record, then +the record will be removed. + + +\subsection{How to rename keys in records?} + +Sometimes you want to rename a record key, \emph{e.g.} if you have +read in a two column table with sequence name and sequence in each +column (see \ref{sub:How-to-read-table}) without specifying the key +names, then the sequence name will be called V0 and the sequence V1 +as default in the \textbf{read\_tab} biotool. To rename the V0 and +V1 keys we need to run the stream through \textbf{rename\_keys} twice +(one for each key to rename): + +\begin{lyxcode} +...~|~rename\_keys~-{}-keys=V0,SEQ\_NAME~|~rename\_keys~-{}-keys=V1,SEQ +\end{lyxcode} +The first instance of \textbf{rename\_keys} replaces all the V0 keys +with SEQ\_NAME, and the second instance of \textbf{rename\_keys} replaces +all the V1 keys with SEQ. \emph{Et voil\`a} the data can now be used +in the biotools that require these keys.
+ + +\section{Manipulating Sequences} + + +\subsection{How to get sequence lengths?} + +The length for sequences in records can be determined with \textbf{length\_seq}, +which adds the key SEQ\_LEN to each record with the sequence length +as the value. It also generates an extra record that is emitted last +with the key TOTAL\_SEQ\_LEN showing the total length of all the sequences. + +\begin{lyxcode} +read\_fasta~-{}-data\_in=~|~length\_seq +\end{lyxcode} +It is also possible to determine the sequence length using the generic +tool \textbf{length\_vals} (see \#\#\#), which determines the length +of the values for a given list of keys: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=~|~length\_vals~-{}-keys=SEQ +\end{lyxcode} +To obtain the total length of all sequences use \textbf{sum\_vals} +like this: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=~|~length\_vals~-{}-keys=SEQ + +|~sum\_vals~-{}-keys=SEQ\_LEN +\end{lyxcode} +The biotool \textbf{analyze\_seq} will also determine the length of +each sequence (see~\ref{sub:How-to-analyze}). + + +\subsection{How to analyze sequence composition?\label{sub:How-to-analyze}} + +If you want to find out the sequence type, composition, length, as +well as GC content, indel content and proportions of soft and hard +masked sequence, then use \textbf{analyze\_seq}. This handy biotool +will determine all these things per sequence from which it is easy +to get an overview using the \textbf{write\_tab} biotool to output +a table (see~\ref{sub:How-to-write-tab}). So in order to determine +the sequence composition of a FASTA file with just one entry containing +the sequence 'ATCG' we just read the data with \textbf{read\_fasta} +and run the output through \textbf{analyze\_seq} which will add the +analysis to the record like this: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=test.fna~|~analyze\_seq~... 
+ + + +-{}-{}- + +GC\%:~50.00 + +HARD\_MASK\%:~0.00 + +RES:-:~0 + +RES:.:~0 + +RES:A:~1 + +RES:B:~0 + +RES:C:~1 + +RES:D:~0 + +RES:G:~1 + +RES:H:~0 + +RES:K:~0 + +RES:M:~0 + +RES:N:~0 + +RES:R:~0 + +RES:S:~0 + +RES:T:~1 + +RES:U:~0 + +RES:V:~0 + +RES:W:~0 + +RES:Y:~0 + +RES:\textasciitilde{}:~0 + +SEQ:~ATCG + +SEQ\_LEN:~4 + +SEQ\_NAME:~test + +SEQ\_TYPE:~DNA + +SOFT\_MASK\%:~0.00 +\end{lyxcode} +Now to make a table of how many As, Ts, Cs, and Gs you can add the +following: + +\begin{lyxcode} +...~|~analyze\_seq~|~write\_tab~-{}-keys=RES:A,RES:T,RES:C,RES:G +\end{lyxcode} +Or if you want to see the proportions of hard and soft masked sequence: + +\begin{lyxcode} +...~|~analyze\_seq~|~write\_tab~-{}-keys=HARD\_MASK\%,SOFT\_MASK\% +\end{lyxcode} +If you have a stack of sequences in one file and you want to determine +the mean GC content you can do it using the \textbf{mean\_vals} biotool: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=test.fna~|~analyze\_seq~|~mean\_vals~-{}-keys=GC\% +\end{lyxcode} +Or if you want the total count of Ns you can use \textbf{sum\_vals} +like this: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=test.fna~|~analyze\_seq~|~sum\_vals~-{}-keys=RES:N +\end{lyxcode} + +\subsection{How to extract subsequences?\label{sub:How-to-extract}} + +In order to extract a subsequence from a longer sequence use the biotool +extract\_seq, which will replace the sequence in the record with the +subsequence (this behaviour should probably be modified to be dependent +on a -\/-replace or a -\/-no\_replace switch).
So to extract the +first 20 residues from all sequences do (first residue is designated +1): + +\begin{lyxcode} +...~|~extract\_seq~-{}-beg=1~-{}-len=20 +\end{lyxcode} +You can also specify a begin and end coordinate set: + +\begin{lyxcode} +...~|~extract\_seq~-{}-beg=20~-{}-end=40 +\end{lyxcode} +If you want the subsequences from position 20 to the sequence end +do: + +\begin{lyxcode} +...~|~extract\_seq~-{}-beg=20 +\end{lyxcode} +If you want to extract subsequences a given distance from the sequence +end you can do this by reversing the sequence with the biotool \textbf{reverse\_seq} +\eqref{sub:How-to-reverse-seq}, followed by \textbf{extract\_seq} +to get the subsequence, and then \textbf{reverse\_seq} again to get +the subsequence back in the original orientation: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=test.fna~|~reverse\_seq + +|~extract\_seq~-{}-beg=10~-{}-len=10~|~reverse\_seq +\end{lyxcode} + +\subsection{How to get genomic sequence?\label{sub:How-to-get-genomic-sequence}} + +The biotool \textbf{get\_genome\_seq} can extract subsequences for +a given genome specified with the -\/-genome switch explicitly using +the -\/-beg and -\/-end/-\/-len switches: + +\begin{lyxcode} +get\_genome\_seq~-{}-genome=~-{}-beg=1~-{}-len=100 +\end{lyxcode} +Alternatively, \textbf{get\_genome\_seq} can be used to append the +corresponding sequence to BED, PSL, and BLAST records: + +\begin{lyxcode} +read\_bed~-{}-data\_in=~|~get\_genome\_seq~-{}-genome= +\end{lyxcode} + +\subsection{How to upper-case sequences?} + +Sequences can be shifted from lower case to upper case using \textbf{uppercase\_seq}: + +\begin{lyxcode} +...~|~uppercase\_seq +\end{lyxcode} + +\subsection{How to reverse sequences?\label{sub:How-to-reverse-seq}} + +The order of residues in a sequence can be reversed using reverse\_seq: + +\begin{lyxcode} +...~|~reverse\_seq +\end{lyxcode} +Note that in order to reverse/complement a sequence you also need +the \textbf{complement\_seq} biotool
(see~\ref{sub:How-to-complement}). + + +\subsection{How to complement sequences?\label{sub:How-to-complement}} + +DNA and RNA sequences can be complemented with \textbf{complement\_seq}, +which automagically determines the sequence type: + +\begin{lyxcode} +...~|~complement\_seq +\end{lyxcode} +Note that in order to reverse/complement a sequence you also need +the \textbf{reverse\_seq} biotool (see~\ref{sub:How-to-reverse-seq}). + + +\subsection{How to remove indels from sequences?} + +Indels can be removed from sequences with the \textbf{remove\_indels} +biotool. This is useful if you have aligned some sequences (see~\ref{sub:How-to-align}) +and extracted (see~\ref{sub:How-to-extract}) a block of subsequences +from the alignment and you want to use these sequences in a search +where you need to remove the indels first. '-', '\textasciitilde{}', +and '.' are considered indels: + +\begin{lyxcode} +...~|~remove\_indels +\end{lyxcode} + +\subsection{How to split sequences into overlapping subsequences?} + +Sequences can be split into overlapping subsequences with the \textbf{split\_seq} +biotool.
+ +\begin{lyxcode} +...~|~split\_seq~-{}-word\_size=20~-{}-uniq +\end{lyxcode} + +\subsection{How to determine the oligo frequency?} + +In order to determine if any oligo usage is overrepresented in one +or more sequences you can determine the frequency of oligos of a given +size with \textbf{oligo\_freq}: + +\begin{lyxcode} +...~|~oligo\_freq~-{}-word\_size=4 +\end{lyxcode} +And if you have more than one sequence and want to accumulate the +frequencies you need the -\/-all switch: + +\begin{lyxcode} +...~|~oligo\_freq~-{}-word\_size=4~-{}-all +\end{lyxcode} +To get a meaningful result you need to write the resulting frequencies +as a table with \textbf{write\_tab} (see~\ref{sub:How-to-write-tab}), +but first it is important to \textbf{grab} (see~\ref{sub:How-to-grab}) +the records with the frequencies to avoid full length sequences in +the table: + +\begin{lyxcode} +...~|~oligo\_freq~-{}-word\_size=4~-{}-all~|~grab~-{}-pattern=OLIGO~-{}-keys\_only + +|~write\_tab~-{}-no\_stream +\end{lyxcode} +And the resulting frequency table can be sorted with Unix sort (man +sort). + + +\subsection{How to search for sequences in genomes?} + +See the following biotools: + +\begin{itemize} +\item \textbf{patscan\_seq} \eqref{sub:How-to-use-patscan} +\item \textbf{blat\_seq} \eqref{sub:How-to-use-BLAT} +\item \textbf{blast\_seq} \eqref{sub:How-to-use-BLAST} +\item \textbf{vmatch\_seq} \eqref{sub:How-to-use-Vmatch} +\end{itemize} + +\subsection{How to search sequences for a pattern?\label{sub:How-to-use-patscan}} + +It is possible to search sequences in the data stream for patterns +using the \textbf{patscan\_seq} biotool which utilizes the powerful +scan\_for\_matches engine. Consult the documentation for scan\_for\_matches +in order to learn how to define patterns (the documentation is included +in Appendix~\ref{sec:scan_for_matches-README}).
+ +To search all sequences for a simple pattern consisting of the sequence +ATCGATCG allowing for 3 mismatches, 2 insertions and 1 deletion: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=~|~patscan\_seq~-{}-pattern='ATCGATCG{[}3,2,1]' +\end{lyxcode} +The -\/-pattern switch takes a comma separated list of patterns, +so if you want to search for more than one pattern do: + +\begin{lyxcode} +...~|~patscan\_seq~-{}-pattern='ATCGATCG{[}3,2,1],GCTAGCTA{[}3,2,1]' +\end{lyxcode} +It is also possible to have a list of different patterns to search +for in a file with one pattern per line. In order to get \textbf{patscan\_seq} +to read these patterns use the -\/-pattern\_in switch: + +\begin{lyxcode} +...~|~patscan\_seq~-{}-pattern\_in= +\end{lyxcode} +To also scan the complementary strand in nucleotide sequences (\textbf{patscan\_seq} +automagically determines the sequence type) you need to add the -\/-comp +switch: + +\begin{lyxcode} +...~|~patscan\_seq~-{}-pattern=~-{}-comp +\end{lyxcode} +It is also possible to use \textbf{patscan\_seq} to output those records +that do not contain a certain pattern by using the -\/-invert switch: + +\begin{lyxcode} +...~|~patscan\_seq~-{}-pattern=~-{}-invert +\end{lyxcode} +Finally, \textbf{patscan\_seq} can also scan for patterns in a given +genome sequence, instead of sequences in the stream, using the -\/-genome +switch: + +\begin{lyxcode} +patscan\_seq~-{}-pattern=~-{}-genome= +\end{lyxcode} + +\subsection{How to use BLAT for sequence search?\label{sub:How-to-use-BLAT}} + +Sequences in the data stream can be matched against supported genomes +using \textbf{blat\_seq} which is a biotool using BLAT as the name +might suggest. Currently only Mouse and Human genomes are available +and it is not possible to use OOC files since there is still a need +for a local repository for genome files.
Otherwise it is just: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=~|~blat\_seq~-{}-genome= +\end{lyxcode} +The search results can then be written to file with \textbf{write\_psl} +(see~\ref{sub:How-to-write-PSL}) or \textbf{write\_bed} (see~\ref{sub:How-to-write-BED}, +although with \textbf{write\_bed} some information will be lost). +It is also possible to plot the chromosome distribution of the search +results using \textbf{plot\_chrdist} (see~\ref{sub:How-to-plot-chrdist}) +or the distribution of the match lengths using \textbf{plot\_lendist} +(see~\ref{sub:How-to-plot-lendist}) or a karyogram with the hits +using \textbf{plot\_karyogram} (see~\ref{sub:How-to-plot-karyogram}). + + +\subsection{How to use BLAST for sequence search?\label{sub:How-to-use-BLAST}} + +Two biotools exist for blasting sequences: \textbf{create\_blast\_db} +is used to create the BLAST database required for BLAST which is queried +using the biotool \textbf{blast\_seq}. So in order to create a BLAST +database from sequences in the data stream you simply run: + +\begin{lyxcode} +...~|~create\_blast\_db~-{}-database=my\_database~-{}-no\_stream +\end{lyxcode} +The type of sequence to use for the database is automagically determined +by \textbf{create\_blast\_db}, but don't have a mixture of peptide +and nucleic acid sequences in the stream. The -\/-database switch +takes a path as argument, but will default to 'blastdb\_ +if not set. + +The resulting database can now be queried with sequences in another +data stream using \textbf{blast\_seq}: + +\begin{lyxcode} +...~|~blast\_seq~-{}-database=my\_database +\end{lyxcode} +Again, the sequence type is determined automagically and the appropriate +BLAST program is guessed (see the table below), however, the program name +can be overruled with the -\/-program switch.
+ +\noindent \begin{center} +\begin{tabular}{ccc} +Subject sequence & Query sequence & Program guess\tabularnewline +\hline +Nucleotide & Nucleotide & blastn\tabularnewline +Protein & Protein & blastp\tabularnewline +Protein & Nucleotide & blastx\tabularnewline +Nucleotide & Protein & tblastn\tabularnewline +\end{tabular} +\par\end{center} + +Finally, it is also possible to use \textbf{blast\_seq} for blasting +sequences against a preformatted genome using the -\/-genome switch +instead of the -\/-database switch: + +\begin{lyxcode} +...~|~blast\_seq~-{}-genome= +\end{lyxcode} + +\subsection{How to use Vmatch for sequence search?\label{sub:How-to-use-Vmatch}} + +The powerful suffix array software package Vmatch% +\footnote{\url{http://www.vmatch.de/}% +} can be used for exact mapping of sequences against indexed genomes +using the biotool \textbf{vmatch\_seq}, which will e.g. map 700000 +ESTs to the human genome locating all 160 million hits in less than an +hour. + +\begin{lyxcode} +...~|~vmatch\_seq~-{}-genome= +\end{lyxcode} +Only nucleotide sequences and sequences longer than 11 nucleotides +will be mapped. The resulting SCORE key will hold the number of genome +matches of a given sequence (multi-mappers). + + +\subsection{How to find all matches between sequences?\label{sub:How-to-find-matches}} + +All matches between two sequences can be determined with the biotool +\textbf{match\_seq}. The match finding engine under the hood +of \textbf{match\_seq} is the super fast suffix tree program MUMmer% +\footnote{\url{http://mummer.sourceforge.net/}% +}, which will locate all forward and reverse matches between huge sequences +in a matter of minutes (if the repeat count is not too high and if +the word size used is appropriate).
Matching two \emph{Helicobacter +pylori} genomes (1.7Mbp) takes around 10 seconds: + +\begin{lyxcode} +...~|~match\_seq~-{}-word\_size=20~-{}-direction=both +\end{lyxcode} +The output from \textbf{match\_seq} can be used to generate a dot +plot with \textbf{plot\_matches} (see~\ref{sub:How-to-generate-dotplot}). + + +\subsection{How to align sequences?\label{sub:How-to-align}} + +Sequences in the stream can be aligned with the \textbf{align\_seq} +biotool that uses Muscle% +\footnote{\url{http://www.drive5.com/muscle/muscle.html}% +} as the alignment engine. Currently you cannot change any of the Muscle +alignment parameters and \textbf{align\_seq} will create an alignment +based on the defaults (which are really good!): + +\begin{lyxcode} +...~|~align\_seq +\end{lyxcode} +The aligned output can be written to file in FASTA format using \textbf{write\_fasta} +(see~\ref{sub:How-to-write-fasta}) or in pretty text using \textbf{write\_align} +(see~\ref{sub:How-to-write-alignment}). + + +\subsection{How to create a weight matrix?} + +If you want a weight matrix to show the sequence composition of a +stack of sequences you can use the biotool \textbf{create\_weight\_matrix}: + +\begin{lyxcode} +...~|~create\_weight\_matrix +\end{lyxcode} +The result can be output in percent using the -\/-percent switch: + +\begin{lyxcode} +...~|~create\_weight\_matrix~-{}-percent +\end{lyxcode} +The weight matrix can be written as tabular output with \textbf{write\_tab} +(see~\ref{sub:How-to-write-tab}) after removing the records containing +SEQ with \textbf{grab} (see~\ref{sub:How-to-grab}): + +\begin{lyxcode} +...~|~create\_weight\_matrix~|~grab~-{}-invert~-{}-keys=SEQ~-{}-keys\_only + +|~write\_tab~-{}-no\_stream +\end{lyxcode} +The V0 column will hold the residue, while the rest of the columns +will hold the frequencies for each sequence position. + + +\section{Plotting} + +There exist several biotools for plotting.
Some of these are based +on GNUplot% +\footnote{\url{http://www.gnuplot.info/}% +}, which is an extremely powerful platform to generate all sorts of +plots and even though GNUplot has quite a steep learning curve, the +biotools utilizing GNUplot are simple to use. GNUplot is able to output +a lot of different formats (called terminals in GNUplot), but the +biotools focus on three formats only: + +\begin{enumerate} +\item The 'dumb' terminal is the default for the GNUplot based biotools and will +output a plot in crude ASCII text (Fig.~\ref{fig:Dumb-terminal}). +This is quite nice for a quick and dirty plot to get an overview of +your data. +\item The 'post' or 'postscript' terminal outputs postscript code which is +publication grade graphics that can be viewed with applications such +as Ghostview, Photoshop, and Preview. +\item The 'svg' terminal outputs scalable vector graphics (SVG) which is +a vector based format. SVG is great because you can edit the resulting +plot using Photoshop or Inkscape% +\footnote{Inkscape is a really handy drawing program that is free and open source. +Available at \url{http://www.inkscape.org}% +} if you want to add additional labels, captions, arrows, and so on +and then save the result in different formats, such as postscript +without losing resolution. +\end{enumerate} +The biotools for plotting that are not based on GNUplot only output +SVG (that may change in the future). + +% +\begin{figure} +\noindent \begin{centering} +\includegraphics[width=12cm]{lendist_ascii} +\par\end{centering} + +\caption{\label{fig:Dumb-terminal}Dumb terminal} + + +\begin{quote} +The output of a length distribution plot in the default 'dumb terminal' +to the terminal window.
+\end{quote} + +\end{figure} + + + +\subsection{How to plot a histogram?\label{How-to-plot-histogram}} + +A generic histogram for a given value can be plotted with the biotool +\textbf{plot\_histogram} (Fig.~\ref{fig:Histogram}): + +\begin{lyxcode} +...~|~plot\_histogram~-{}-key=TISSUE~-{}-no\_stream +\end{lyxcode} +(Figure missing) + +\noindent \begin{flushleft} +% +\begin{figure} +\noindent \begin{centering} +\includegraphics[width=12cm]{histogram} +\par\end{centering} + +\caption{\label{fig:Histogram}Histogram} + +\end{figure} + +\par\end{flushleft} + + +\subsection{How to plot a length distribution?\label{sub:How-to-plot-lendist}} + +Plotting of length distributions, whether sequence lengths, pattern +lengths, hit lengths, \emph{etc.} is a really handy thing and can +be done with the biotool \textbf{plot\_lendist}. If you have a +file with FASTA entries and want to plot the length distribution you +do it like this: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=~|~length\_seq + +|~plot\_lendist~-{}-key=SEQ\_LEN~-{}-no\_stream +\end{lyxcode} +The result will be written to the default dumb terminal and will look +like Fig.~\ref{fig:Dumb-terminal}. + +If you instead want the result in postscript format you can do: + +\begin{lyxcode} +...~|~plot\_lendist~-{}-key=SEQ\_LEN~-{}-terminal=post~-{}-result\_out=file.ps +\end{lyxcode} +That will generate the plot and save it to file, but not interrupt +the data stream which can then be used in further analysis. You can +also save the plot implicitly using '>', however, it is then important +to terminate the stream with the -\/-no\_stream switch: + +\begin{lyxcode} +...~|~plot\_lendist~-{}-key=SEQ\_LEN~-{}-terminal=post~-{}-no\_stream~>~file.ps +\end{lyxcode} +The resulting plot can be seen in Fig.~\ref{fig:Length-distribution}.
+ +% +\begin{figure} + + +\noindent \begin{centering} +\includegraphics[width=12cm]{lendist} +\par\end{centering} + +\caption{\label{fig:Length-distribution}Length distribution} + + +\begin{quote} +Length distribution of 630 piRNA like RNAs. +\end{quote} + +\end{figure} + + + +\subsection{How to plot a chromosome distribution?\label{sub:How-to-plot-chrdist}} + +If you have the result of a sequence search against a multi-chromosome +genome, it is very practical to be able to plot the distribution of +search hits on the different chromosomes. This can be done with \textbf{plot\_chrdist}: + +\begin{lyxcode} +read\_fasta~-{}-data\_in=~|~blat\_genome~|~plot\_chrdist~-{}-no\_stream +\end{lyxcode} +The above example will result in a crude plot using the 'dumb' terminal, +and if you want to mess around with the results from the BLAT search +you probably want to save the result to file first (see~\ref{sub:How-to-write-PSL}). +To plot the chromosome distribution from the saved search result you +can do: + +\begin{lyxcode} +read\_bed~-{}-data\_in=file.bed~|~plot\_chrdist~-{}-terminal=post~-{}-result\_out=plot.ps +\end{lyxcode} +That will result in the output shown in Fig.~\ref{fig:Chromosome-distribution}. + +% +\begin{figure} + + +\noindent \begin{centering} +\includegraphics[angle=90,width=12cm]{chrdist} +\par\end{centering} + +\caption{\label{fig:Chromosome-distribution}Chromosome distribution} + +\end{figure} + + + +\subsection{How to generate a dotplot?\label{sub:How-to-generate-dotplot}} + +A dotplot is a powerful way to get an overview of the size and location +of sequence insertions, deletions, and duplications between two sequences. +Generating a dotplot with biotools is a two-step process where you +initially find all matches between two sequences using the tool \textbf{match\_seq} +(see~\ref{sub:How-to-find-matches}) and plot the resulting matches +with \textbf{plot\_matches}.
Matching and plotting two \emph{Helicobacter +pylori} genomes (1.7Mbp) takes around 10 seconds: + +\begin{lyxcode} +...~|~match\_seq~|~plot\_matches~-{}-terminal=post~-{}-result\_out=plot.ps +\end{lyxcode} +The resulting dotplot is in Fig.~\ref{fig:Dotplot}. + +% +\begin{figure} +\noindent \begin{centering} +\includegraphics[width=12cm]{dotplot} +\par\end{centering} + +\caption{\label{fig:Dotplot}Dotplot} + + +\begin{quote} +Forward matches are displayed in green while reverse matches are displayed +in red. +\end{quote} + +\end{figure} + + + +\subsection{How to plot a sequence logo?} + +Sequence logos can be generated with \textbf{plot\_seqlogo}. The sequence +type is determined automagically and an entropy scale of 2 bits and +4 bits is used for nucleotide and peptide sequences, respectively% +\footnote{\url{http://www.ccrnp.ncifcrf.gov/~toms/paper/hawaii/latex/node5.html}% +}. + +\begin{lyxcode} +...~|~plot\_seqlogo~-{}-no\_stream~-{}-result\_out=seqlogo.svg +\end{lyxcode} +An example of a sequence logo can be seen in Fig.~\ref{fig:Sequence-logo}. + +% +\begin{figure} +\noindent \begin{centering} +\includegraphics[width=12cm]{seqlogo} +\par\end{centering} + +\caption{\label{fig:Sequence-logo}Sequence logo} + +\end{figure} + + + +\subsection{How to plot a karyogram?\label{sub:How-to-plot-karyogram}} + +To plot search hits on genomes use \textbf{plot\_karyogram}, which +will output a nice karyogram in SVG graphics: + +\begin{lyxcode} +...~|~plot\_karyogram~-{}-result\_out=karyogram.svg +\end{lyxcode} +The banding data is taken from the UCSC genome browser database and +currently only Human and Mouse are supported. Fig.~\ref{fig:Karyogram} +shows the distribution of piRNA like RNAs matched to the Human genome.
+ +% +\begin{figure} +\noindent \begin{centering} +\includegraphics[width=12cm]{karyogram} +\par\end{centering} + +\caption{\label{fig:Karyogram}Karyogram} + + +\begin{quote} +Hits from a search of piRNA like RNAs in the Human genome are displayed +as short horizontal bars. +\end{quote} + +\end{figure} + + + +\section{Uploading Results} + + +\subsection{How do I display my results in the UCSC Genome Browser?} + +Results from the list of biotools below can be uploaded directly to +a local mirror of the UCSC Genome Browser using the biotool \textbf{upload\_to\_ucsc}: + +\begin{itemize} +\item patscan\_seq \eqref{sub:How-to-use-patscan} +\item blat\_seq \eqref{sub:How-to-use-BLAT} +\item blast\_seq \eqref{sub:How-to-use-BLAST} +\item vmatch\_seq \eqref{sub:How-to-use-Vmatch} +\end{itemize} +The simplest way of uploading data requires two mandatory +switches: -\/-database, which is the UCSC database name (such as +hg18, mm9, etc.) and -\/-table, which should be the user's initials +followed by an underscore and a short description of the data: + +\begin{lyxcode} +...~|~upload\_to\_ucsc~-{}-database=hg18~-{}-table=mah\_snoRNAs +\end{lyxcode} +The \textbf{upload\_to\_ucsc} biotool modifies the user's \textasciitilde{}/ucsc/my\_tracks.ra +file automagically (a backup is created with the name \textasciitilde{}/ucsc/my\_tracks.ra\textasciitilde{}) +with default values that can be overridden using the following switches: + +\begin{itemize} +\item -\/-short\_label - Short label for track - Default=database->table +\item -\/-long\_label - Long label for track - Default=database->table +\item -\/-group - Track group name - Default= +\item -\/-priority - Track display priority - Default=1 +\item -\/-color - Track color - Default=147,73,42 +\item -\/-chunk\_size - Chunks for loading - Default=10000000 +\item -\/-visibility - Track visibility - Default=pack +\end{itemize} +Also, data in BED or PSL format can be uploaded with \textbf{upload\_to\_ucsc} +as long as these
reference to genomes and chromosomes existing in +the UCSC Genome Browser: + +\begin{lyxcode} +read\_bed~-{}-data\_in=~|~upload\_to\_ucsc~... + + + +read\_psl~-{}-data\_in=~|~upload\_to\_ucsc~... +\end{lyxcode} + +\section{Trouble shooting} + +Shoot the messenger! + +\appendix + +\section{Keys\label{sec:Keys}} + +HIT + +HIT\_BEG + +HIT\_END + +HIT\_LEN + +HIT\_NAME + +PATTERN + + +\section{Switches\label{sec:Switches}} + +-\/-stream\_in + +-\/-stream\_out + +-\/-no\_stream + +-\/-data\_in + +-\/-result\_out + +-\/-num + + +\section{scan\_for\_matches README\label{sec:scan_for_matches-README}} + +\begin{lyxcode} +~~~~~~~~~~~~~~~~~~~~~~~~~~scan\_for\_matches: + +~~~~A~Program~to~Scan~Nucleotide~or~Protein~Sequences~for~Matching~Patterns + +~~~~~~~~~~~~~~~~~~~~~~~~Ross~Overbeek + +~~~~~~~~~~~~~~~~~~~~~~~~MCS + +~~~~~~~~~~~~~~~~~~~~~~~~Argonne~National~Laboratory + +~~~~~~~~~~~~~~~~~~~~~~~~Argonne,~IL~60439 + +~~~~~~~~~~~~~~~~~~~~~~~~USA + +Scan\_for\_matches~is~a~utility~that~we~have~written~to~search~for + +patterns~in~DNA~and~protein~sequences.~~I~wrote~most~of~the~code, + +although~David~Joerg~and~Morgan~Price~wrote~sections~of~an + +earlier~version.~~The~whole~notion~of~pattern~matching~has~a~rich + +history,~and~we~borrowed~liberally~from~many~sources.~~However,~it~is + +worth~noting~that~we~were~strongly~influenced~by~the~elegant~tools + +developed~and~distributed~by~David~Searls.~~My~intent~is~to~make~the + +existing~tool~available~to~anyone~in~the~research~community~that~might + +find~it~useful.~~I~will~continue~to~try~to~fix~bugs~and~make~suggested + +enhancements,~at~least~until~I~feel~that~a~superior~tool~exists. 
+ +Hence,~I~would~appreciate~it~if~all~bug~reports~and~suggestions~are + +directed~to~me~at~Overbeek@mcs.anl.gov.~~ + +I~will~try~to~log~all~bug~fixes~and~report~them~to~users~that~send~me + +their~email~addresses.~~I~do~not~require~that~you~give~me~your~name + +and~address.~~However,~if~you~do~give~it~to~me,~I~will~try~to~notify + +you~of~serious~problems~as~they~are~discovered. + +Getting~Started: + +~~~~The~distribution~should~contain~at~least~the~following~programs: + +~~~~~~~~~~~~~~~~README~~~~~~~~~~~~~~~~~~-~~~~~This~document + +~~~~~~~~~~~~~~~~ggpunit.c~~~~~~~~~~~~~~~-~~~~~One~of~the~two~source~files + +~~~~~~~~~~~~~~~~scan\_for\_matches.c~~~~~~-~~~~~The~second~source~file + +~~~~~~~~~~~~~~~~ + +~~~~~~~~~~~~~~~~run\_tests~~~~~~~~~~~~~~~-~~~~~A~perl~script~to~test~things + +~~~~~~~~~~~~~~~~show\_hits~~~~~~~~~~~~~~~-~~~~~A~handy~perl~script + +~~~~~~~~~~~~~~~~test\_dna\_input~~~~~~~~~~-~~~~~Test~sequences~for~DNA + +~~~~~~~~~~~~~~~~test\_dna\_patterns~~~~~~~-~~~~~Test~patterns~for~DNA~scan + +~~~~~~~~~~~~~~~~test\_output~~~~~~~~~~~~~-~~~~~Desired~output~from~test + +~~~~~~~~~~~~~~~~test\_prot\_input~~~~~~~~~-~~~~~Test~protein~sequences + +~~~~~~~~~~~~~~~~test\_prot\_patterns~~~~~~-~~~~~Test~patterns~for~proteins + +~~~~~~~~~~~~~~~~testit~~~~~~~~~~~~~~~~~~-~~~~~a~perl~script~used~for~test + +~~~~Only~the~first~three~files~are~required.~~The~others~are~useful, + +~~~~but~only~if~you~have~Perl~installed~on~your~system.~~If~you~do + +~~~~have~Perl,~I~suggest~that~you~type + +~~~~~~~~ + +~~~~~~~~~~~~~~~~which~perl + +~~~~to~find~out~where~it~installed.~~On~my~system,~I~get~the~following + +~~~~response: + +~~~~~~~~ + +~~~~~~~~~~~~~~~~clone\%~which~perl + +~~~~~~~~~~~~~~~~/usr/local/bin/perl + +~~~~indicating~that~Perl~is~installed~in~/usr/local/bin.~~Anyway,~once + +~~~~you~know~where~it~is~installed,~edit~the~first~line~of~files~ + +~~~~~~~~testit + +~~~~~~~~show\_hits + +~~~~replacing~/usr/local/bin/perl~with~the~appropriate~location.~~I + 
+~~~~will~assume~that~you~can~do~this,~although~it~is~not~critical~(it + +~~~~is~needed~only~to~test~the~installation~and~to~use~the~\char`\"{}show\_hits\char`\"{} + +~~~~utility).~~Perl~is~not~required~to~actually~install~and~run + +~~~~scan\_for\_matches.~ + +~~~~If~you~do~not~have~Perl,~I~suggest~you~get~it~and~install~it~(it + +~~~~is~a~wonderful~utility).~~Information~about~Perl~and~how~to~get~it + +~~~~can~be~found~in~the~book~\char`\"{}Programming~Perl\char`\"{}~by~Larry~Wall~and + +~~~~Randall~L.~Schwartz,~published~by~O'Reilly~\&~Associates,~Inc. + +~~~~To~get~started,~you~will~need~to~compile~the~program.~~~I~do~this + +~~~~using~ + +~~~~~~~~gcc~-O~-o~scan\_for\_matches~~ggpunit.c~scan\_for\_matches.c + +~~~~If~you~do~not~use~GNU~C,~use~ + +~~~~~~~~cc~-O~-DCC~-o~scan\_for\_matches~~ggpunit.c~scan\_for\_matches.c + +~~~~which~works~on~my~Sun.~~ + +~~~~Once~you~have~compiled~scan\_for\_matches,~you~can~verify~that~it + +~~~~works~with + +~~~~~~~~clone\%~run\_tests~tmp + +~~~~~~~~clone\%~diff~tmp~test\_output + +~~~~You~may~get~a~few~strange~lines~of~the~sort + +~~~~~~~~clone\%~run\_tests~tmp + +~~~~~~~~rm:~tmp:~No~such~file~or~directory + +~~~~~~~~clone\%~diff~tmp~test\_output + +~~~~These~should~cause~no~concern.~~However,~if~the~\char`\"{}diff\char`\"{}~shows~that + +~~~~tmp~and~test\_output~are~different,~contact~me~(you~have~a + +~~~~problem).~ + +~~~~You~should~now~be~able~to~use~scan\_for\_matches~by~following~the + +~~~~instructions~given~below~(which~is~all~the~normal~user~should~have + +~~~~to~understand,~once~things~are~installed~properly). 
+ +~============================================================== + +How~to~run~scan\_for\_matches: + +~~~~To~run~the~program,~you~type~need~to~create~two~files + +~~~~1.~~the~first~file~contains~the~pattern~you~wish~to~scan~for;~I'll + +~~~~~~~~call~this~file~pat\_file~in~what~follows~(but~any~name~is~ok) + +~~~~2.~~the~second~file~contains~a~set~of~sequences~to~scan.~~These + +~~~~~~~~should~be~in~\char`\"{}fasta~format\char`\"{}.~~Just~look~at~the~contents~of + +~~~~~~~~test\_dna\_input~to~see~examples~of~this~format.~~Basically, + +~~~~~~~~each~sequence~begins~with~a~line~of~the~form + +~~~~~~~~~~~>sequence\_id + +~~~~~~~~and~is~followed~by~one~or~more~lines~containing~the~sequence. + +~~~~Once~these~files~have~been~created,~you~just~use + +~~~~~~~~scan\_for\_matches~pat\_file~<~input\_file + +~~~~to~scan~all~of~the~input~sequences~for~the~given~pattern.~~As~an + +~~~~example,~suppose~that~pat\_file~contains~a~single~line~of~the~form + +~~~~~~~~~~~~~~~~p1=4...7~3...8~\textasciitilde{}p1 + +~~~~Then, + +~~~~~~~~~~~~~~~~scan\_for\_matches~pat\_file~<~test\_dna\_input + +~~~~should~produce~two~\char`\"{}hits\char`\"{}.~~When~I~run~this~on~my~machine,~I~get + +~~~~~~~~clone\%~scan\_for\_matches~pat\_file~<~test\_dna\_input + +~~~~~~~~>tst1:{[}6,27] + +~~~~~~~~cguaacc~ggttaacc~gguuacg~ + +~~~~~~~~>tst2:{[}6,27] + +~~~~~~~~CGUAACC~GGTTAACC~GGUUACG~ + +~~~~~~~~clone\%~ + +Simple~Patterns~Built~by~Matching~Ranges~and~Reverse~Complements + +~~~~Let~me~first~explain~this~simple~pattern: + +~~~~~~~~~~~~~~~~ + +~~~~~~~~~~~~~~~~p1=4...7~3...8~\textasciitilde{}p1 + +~~~~The~pattern~consists~of~three~\char`\"{}pattern~units\char`\"{}~separated~by~spaces. 
+ +~~~~The~first~pattern~unit~is + +~~~~~~~~~~~~~~~~p1=4...7 + +~~~~which~means~\char`\"{}match~4~to~7~characters~and~call~them~p1\char`\"{}.~~The + +~~~~second~pattern~unit~is + +~~~~~~~~~~~~~~~~3...8 + +~~~~which~means~\char`\"{}then~match~3~to~8~characters\char`\"{}.~~The~last~pattern~unit + +~~~~is~ + +~~~~~~~~~~~~~~~~\textasciitilde{}p1 + +~~~~which~means~\char`\"{}match~the~reverse~complement~of~p1\char`\"{}.~~The~first + +~~~~reported~hit~is~shown~as + +~~~~~~~~>tst1:{[}6,27] + +~~~~~~~~cguaacc~ggttaacc~gguuacg~ + +~~~~which~states~that~characters~6~through~27~of~sequence~tst1~were + +~~~~matched.~~\char`\"{}cguaac\char`\"{}~matched~the~first~pattern~unit,~\char`\"{}ggttaacc\char`\"{}~the + +~~~~second,~and~\char`\"{}gguuacg\char`\"{}~the~third.~~This~is~an~example~of~a~common + +~~~~type~of~pattern~used~to~search~for~sections~of~DNA~or~RNA~that + +~~~~would~fold~into~a~hairpin~loop. + +Searching~Both~Strands + +~~~~Now~for~a~short~aside:~scan\_for\_matches~only~searched~the + +~~~~sequences~in~the~input~file;~it~did~not~search~the~opposite + +~~~~strand.~~With~a~pattern~of~the~sort~we~just~used,~there~is~not + +~~~~need~o~search~the~opposite~strand.~~However,~it~is~normally~the + +~~~~case~that~you~will~wish~to~search~both~the~sequence~and~the + +~~~~opposite~strand~(i.e.,~the~reverse~complement~of~the~sequence). + +~~~~To~do~that,~you~would~just~use~the~\char`\"{}-c\char`\"{}~command~line.~~For~example, + +~~~~~~~~scan\_for\_matches~-c~pat\_file~<~test\_dna\_input + +~~~~Hits~on~the~opposite~strand~will~show~a~beginning~location~greater + +~~~~than~te~end~location~of~the~match. 
+ +Defining~Pairing~Rules~and~Allowing~Mismatches,~Insertions,~and~Deletions + +~~~~Let~us~stop~now~and~ask~\char`\"{}What~additional~features~would~one~need~to + +~~~~really~find~the~kinds~of~loop~structures~that~characterize~tRNAs, + +~~~~rRNAs,~and~so~forth?\char`\"{}~~I~can~immediately~think~of~two: + +~~~~~~~~a)~you~will~need~to~be~able~to~allow~non-standard~pairings + +~~~~~~~~~~~(those~other~than~G-C~and~A-U),~and + +~~~~~~~~b)~you~will~need~to~be~able~to~tolerate~some~number~of + +~~~~~~~~~~~mismatches~and~bulges. + +~~~~~~~~ + +~~~~Let~me~first~show~you~how~to~handle~non-standard~\char`\"{}rules~for + +~~~~pairing~in~reverse~complements\char`\"{}.~~Consider~the~following~pattern, + +~~~~which~I~show~as~two~line~(you~may~use~as~many~lines~as~you~like~in + +~~~~forming~a~pattern,~although~you~can~only~break~a~pattern~at~points + +~~~~where~space~would~be~legal): + +~~~~~~~~~~~~r1=\{au,ua,gc,cg,gu,ug,ga,ag\}~ + +~~~~~~~~~~~~p1=2...3~0...4~p2=2...5~1...5~r1\textasciitilde{}p2~0...4~\textasciitilde{}p1~~~~~~~~ + +~~~~The~first~\char`\"{}pattern~unit\char`\"{}~does~not~actually~match~anything;~rather, + +~~~~it~defines~a~\char`\"{}pairing~rule\char`\"{}~in~which~standard~pairings~are + +~~~~allowed,~as~well~as~G-A~and~A-G~(in~case~you~wondered,~Us~and~Ts + +~~~~and~upper~and~lower~case~can~be~used~interchangably;~for~example + +~~~~r1=\{AT,UA,gc,cg\}~could~be~used~to~define~the~\char`\"{}standard~rule\char`\"{}~for + +~~~~pairings).~~The~second~line~consists~of~six~pattern~units~which + +~~~~may~be~interpreted~as~follows: + +~~~~~~~~~~~~p1=2...3~~~~~match~2~or~3~characters~(call~it~p1) + +~~~~~~~~~~~~0...4~~~~~~~~match~0~to~4~characters + +~~~~~~~~~~~~p2=2...5~~~~~match~2~to~5~characters~(call~it~p2) + +~~~~~~~~~~~~1...5~~~~~~~~match~1~to~5~characters + +~~~~~~~~~~~~r1\textasciitilde{}p2~~~~~~~~match~the~reverse~complement~of~p2, + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~allowing~G-A~and~A-G~pairs + +~~~~~~~~~~~~0...4~~~~~~~~match~0~to~4~characters~~~~~~~~ + 
+~~~~~~~~~~~~\textasciitilde{}p1~~~~~~~~~~match~the~reverse~complement~of~p1 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~allowing~only~G-C,~C-G,~A-T,~and~T-A~pairs + +~~~~Thus,~r1\textasciitilde{}p2~means~\char`\"{}match~the~reverse~complement~of~p2~using~rule~r1\char`\"{}. + +~~~~Now~let~us~consider~the~issue~of~tolerating~mismatches~and~bulges. + +~~~~You~may~add~a~\char`\"{}qualifier\char`\"{}~to~the~pattern~unit~that~gives~the + +~~~~tolerable~number~of~\char`\"{}mismatches,~deletions,~and~insertions\char`\"{}. + +~~~~Thus, + +~~~~~~~~~~~~~~~~p1=10...10~3...8~\textasciitilde{}p1{[}1,2,1] + +~~~~means~that~the~third~pattern~unit~must~match~10~characters, + +~~~~allowing~one~\char`\"{}mismatch\char`\"{}~(a~pairing~other~than~G-C,~C-G,~A-T,~or + +~~~~T-A),~two~deletions~(a~deletion~is~a~character~that~occurs~in~p1, + +~~~~but~has~been~\char`\"{}deleted\char`\"{}~from~the~string~matched~by~\textasciitilde{}p1),~and~one + +~~~~insertion~(an~\char`\"{}insertion\char`\"{}~is~a~character~that~occurs~in~the~string + +~~~~matched~by~\textasciitilde{}p1,~but~not~for~which~no~corresponding~character + +~~~~occurs~in~p1).~~In~this~case,~the~pattern~would~match + +~~~~~~~~~~~~~~ACGTACGTAC~GGGGGGGG~GCGTTACCT + +~~~~which~is,~you~must~admit,~a~fairly~weak~loop.~~It~is~common~to + +~~~~allow~mismatches,~but~you~will~find~yourself~using~insertions~and + +~~~~deletions~much~more~rarely.~~In~any~event,~you~should~note~that + +~~~~allowing~mismatches,~insertions,~and~deletions~does~force~the + +~~~~program~to~try~many~additional~possible~pairings,~so~it~does~slow + +~~~~things~down~a~bit. 
+ +How~Patterns~Are~Matched + +~~~~Now~is~as~good~a~time~as~any~to~discuss~the~basic~flow~of~control + +~~~~when~matching~patterns.~~Recall~that~a~\char`\"{}pattern\char`\"{}~is~a~sequence~of + +~~~~\char`\"{}pattern~units\char`\"{}.~~Suppose~that~the~pattern~units~were + +~~~~~~~~u1~u2~u3~u4~...~un + +~~~~The~scan~of~a~sequence~S~begins~by~setting~the~current~position + +~~~~to~1.~~Then,~an~attempt~is~made~to~match~u1~starting~at~the + +~~~~current~position.~~Each~attempt~to~match~a~pattern~unit~can + +~~~~succeed~or~fail.~~If~it~succeeds,~then~an~attempt~is~made~to~match + +~~~~the~next~unit.~~If~it~fails,~then~an~attempt~is~made~to~find~an + +~~~~alternative~match~for~the~immediately~preceding~pattern~unit.~~If + +~~~~this~succeeds,~then~we~proceed~forward~again~to~the~next~unit.~~If + +~~~~it~fails~we~go~back~to~the~preceding~unit.~~This~process~is~called + +~~~~\char`\"{}backtracking\char`\"{}.~~If~there~are~no~previous~units,~then~the~current + +~~~~position~is~incremented~by~one,~and~everything~starts~again.~~This + +~~~~proceeds~until~either~the~current~position~goes~past~the~end~of + +~~~~the~sequence~or~all~of~the~pattern~units~succeed.~~On~success, + +~~~~scan\_for\_matches~reports~the~\char`\"{}hit\char`\"{},~the~current~position~is~set + +~~~~just~past~the~hit,~and~an~attempt~is~made~to~find~another~hit. + +~~~~If~you~wish~to~limit~the~scan~to~simply~finding~a~maximum~of,~say, + +~~~~10~hits,~you~can~use~the~-n~option~(-n~10~would~set~the~limit~to + +~~~~10~reported~hits).~~For~example, + +~~~~~~~~scan\_for\_matches~-c~-n~1~pat\_file~<~test\_dna\_input + +~~~~would~search~for~just~the~first~hit~(and~would~stop~searching~the + +~~~~current~sequences~or~any~that~follow~in~the~input~file). 
+ +Searching~for~repeats: + +~~~~In~the~last~section,~I~discussed~almost~all~of~the~details + +~~~~required~to~allow~you~to~look~for~repeats.~~Consider~the~following + +~~~~set~of~patterns: + +~~~~~~~~p1=6...6~3...8~p1~~~(find~exact~6~character~repeat~separated + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~by~to~8~characters) + +~~~~~~~~p1=6...6~3..8~p1{[}1,0,0]~~~(allow~one~mismatch) + +~~~~~~~~p1=3...3~p1{[}1,0,0]~p1{[}1,0,0]~p1{[}1,0,0]~~ + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~(match~12~characters~that~are~the~remains + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~of~a~3-character~sequence~occurring~4~times) + +~~~~~~~~~~~~~~~~ + +~~~~~~~~p1=4...8~0...3~p2=6...8~p1~0...3~p2 + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~(This~would~match~things~like + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ATCT~G~TCTTT~ATCT~TG~TCTTT + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~) + +Searching~for~particular~sequences: + +~~~~Occasionally,~one~wishes~to~match~a~specific,~known~sequence. + +~~~~In~such~a~case,~you~can~just~give~the~sequence~(along~with~an + +~~~~optional~statement~of~the~allowable~mismatches,~insertions,~and + +~~~~deletions).~~Thus, + +~~~~~~~~p1=6...8~GAGA~\textasciitilde{}p1~~~~(match~a~hairpin~with~GAGA~as~the~loop) + +~~~~~~~~RRRRYYYY~~~~~~~~~~~~~(match~4~purines~followed~by~4~pyrimidines) + +~~~~~~~~TATAA{[}1,0,0]~~~~~~~~~(match~TATAA,~allowing~1~mismatch) + +~~~~~~~~ + +Matches~against~a~\char`\"{}weight~matrix\char`\"{}: + +~~~~I~will~conclude~my~examples~of~the~types~of~pattern~units + +~~~~available~for~matching~against~nucleotide~sequences~by~discussing~a + +~~~~crude~implemetation~of~matching~using~a~\char`\"{}weight~matrix\char`\"{}.~~While~I + +~~~~am~less~than~overwhelmed~with~the~syntax~that~I~chose,~I~think~that + +~~~~the~reader~should~be~aware~that~I~was~thinking~of~generating + +~~~~patterns~containing~such~pattern~units~automatically~from + +~~~~alignments~(and~did~not~really~plan~on~typing~such~things~in~by + +~~~~hand~very~often).~~Anyway,~suppose~that~you~wanted~to~match~a + 
+~~~~sequence~of~eight~characters.~~The~\char`\"{}consensus\char`\"{}~of~these~eight + +~~~~characters~is~GRCACCGS,~but~the~actual~\char`\"{}frequencies~of~occurrence\char`\"{} + +~~~~are~given~in~the~matrix~below.~~Thus,~the~first~character~is~an~A + +~~~~16\%~the~time~and~a~G~84\%~of~the~time.~~The~second~is~an~A~57\%~of + +~~~~the~time,~a~C~10\%~of~the~time,~a~G~29\%~of~the~time,~and~a~T~4\%~of + +~~~~the~time.~~ + +~~~~~~~~~~~~~C1~~~~~C2~~~~C3~~~~C4~~~C5~~~~C6~~~~C7~~~~C8 + +~~~~ + +~~~~~~~A~~~~~16~~~~~57~~~~~0~~~~95~~~~0~~~~18~~~~~0~~~~~0 + +~~~~~~~C~~~~~~0~~~~~10~~~~80~~~~~0~~100~~~~60~~~~~0~~~~50 + +~~~~~~~G~~~~~84~~~~~29~~~~~0~~~~~0~~~~0~~~~20~~~100~~~~50 + +~~~~~~~T~~~~~~0~~~~~~4~~~~20~~~~~5~~~~0~~~~~2~~~~~0~~~~~0~~~ + +~~~~ + +~~~~One~could~use~the~following~pattern~unit~to~search~for~inexact + +~~~~matches~related~to~such~a~\char`\"{}weight~matrix\char`\"{}: + +~~~~~~~~\{(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), + +~~~~~~~~~(0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)\}~>~450 + +~~~~This~pattern~unit~will~attempt~to~match~exactly~eight~characters. + +~~~~For~each~character~in~the~sequence,~the~entry~in~the~corresponding + +~~~~tuple~is~added~to~an~accumulated~sum.~~If~the~sum~is~greater~than + +~~~~450,~the~match~succeeds;~else~it~fails. + +~~~~Recently,~this~feature~was~upgraded~to~allow~ranges.~~Thus, + +~~600~>~~\{(16,0,84,0),(57,10,29,4),(0,80,0,20),(95,0,0,5), + +~~~~~~~~~(0,100,0,0),(18,60,20,2),(0,0,100,0),(0,50,50,0)\}~>~450 + +~~~~will~work,~as~well. 
+ +Allowing~Alternatives: + +~~~~Very~occasionally,~you~may~wish~to~allow~alternative~pattern~units + +~~~~(i.e.,~\char`\"{}match~either~A~or~B\char`\"{}).~~You~can~do~this~using~something + +~~~~like + +~~~~~~~~~~~~~~~~(~GAGA~|~GCGCA) + +~~~~which~says~\char`\"{}match~either~GAGA~or~GCGCA\char`\"{}.~~You~may~take + +~~~~alternatives~of~a~list~of~pattern~units,~for~example + +~~~~~~~~(p1=3...3~3...8~\textasciitilde{}p1~|~p1=5...5~4...4~\textasciitilde{}p1~GGG) + +~~~~would~match~one~of~two~sequences~of~pattern~units.~~There~is~one + +~~~~clumsy~aspect~of~the~syntax:~to~match~a~list~of~alternatives,~you + +~~~~need~to~fully~the~request.~~Thus, + +~~~~~~~~(GAGA~|~(GCGCA~|~TTCGA)) + +~~~~would~be~needed~to~try~the~three~alternatives. + +One~Minor~Extension + +~~~~Sometimes~a~pattern~will~contain~a~sequence~of~distinct~ranges, + +~~~~and~you~might~wish~to~limit~the~sum~of~the~lengths~of~the~matched + +~~~~subsequences.~~~For~example,~suppose~that~you~basically~wanted~to + +~~~~match~something~like + +~~~~ARRYYTT~p1=0...5~GCA{[}1,0,0]~p2=1...6~\textasciitilde{}p1~4...8~\textasciitilde{}p2~p3=4...10~CCT + +~~~~but~that~the~sum~of~the~lengths~of~p1,~p2,~and~p3~must~not~exceed + +~~~~eight~characters.~~To~do~this,~you~could~add~ + +~~~~~~~~length(p1+p2+p3)~<~9 + +~~~~as~the~last~pattern~unit.~~It~will~just~succeed~or~fail~(but~does + +~~~~not~actually~match~any~characters~in~the~sequence). 
+ +~~~~ + +Matching~Protein~Sequences + +~~~~Suppose~that~the~input~file~contains~protein~sequences.~~In~this + +~~~~case,~you~must~invoke~scan\_for\_matches~with~the~\char`\"{}-p\char`\"{}~option.~~You + +~~~~cannot~use~aspects~of~the~language~that~relate~directly~to + +~~~~nucleotide~sequences~(e.g.,~the~-c~command~line~option~or~pattern + +~~~~constructs~referring~to~the~reverse~complement~of~a~previously + +~~~~matched~unit).~~ + +~~~~You~also~have~two~additional~constructs~that~allow~you~to~match + +~~~~either~\char`\"{}one~of~a~set~of~amino~acids\char`\"{}~or~\char`\"{}any~amino~acid~other~than + +~~~~those~in~a~given~set\char`\"{}.~~For~example, + +~~~~~~~~p1=0...4~any(HQD)~1...3~notany(HK)~p1 + +~~~~would~successfully~match~a~string~like + +~~~~~~~~~~~YWV~D~AA~C~YWV + +Using~the~show\_hits~Utility + +~~~~When~viewing~a~large~set~of~complex~matches,~you~might~find~it + +~~~~convenient~to~post-process~the~scan\_for\_matches~output~to~get~a + +~~~~more~readable~version.~~We~provide~a~simple~post-processor~called + +~~~~\char`\"{}show\_hits\char`\"{}.~~To~see~its~effect,~just~pipe~the~output~of~a + +~~~~scan\_for\_matches~into~show\_hits: + +~~~~~Normal~Output: + +~~~~~~~~clone\%~scan\_for\_matches~-c~pat\_file~<~tmp + +~~~~~~~~>tst1:{[}1,28] + +~~~~~~~~gtacguaacc~~ggttaac~cgguuacgtac~ + +~~~~~~~~>tst1:{[}28,1] + +~~~~~~~~gtacgtaacc~~ggttaac~cggttacgtac~ + +~~~~~~~~>tst2:{[}2,31] + +~~~~~~~~CGTACGUAAC~C~GGTTAACC~GGUUACGTACG~ + +~~~~~~~~>tst2:{[}31,2] + +~~~~~~~~CGTACGTAAC~C~GGTTAACC~GGTTACGTACG~ + +~~~~~~~~>tst3:{[}3,32] + +~~~~~~~~gtacguaacc~g~gttaactt~cgguuacgtac~ + +~~~~~~~~>tst3:{[}32,3] + +~~~~~~~~gtacgtaacc~g~aagttaac~cggttacgtac~ + +~~~~~Piped~Through~show\_hits: + +~~~~ + +~~~~~~~~clone\%~scan\_for\_matches~-c~pat\_file~<~tmp~|~show\_hits + +~~~~~~~~tst1:{[}1,28]:~~gtacguaacc~~~ggttaac~~cgguuacgtac + +~~~~~~~~tst1:{[}28,1]:~~gtacgtaacc~~~ggttaac~~cggttacgtac + +~~~~~~~~tst2:{[}2,31]:~~CGTACGUAAC~C~GGTTAACC~GGUUACGTACG + 
+~~~~~~~~tst2:{[}31,2]:~~CGTACGTAAC~C~GGTTAACC~GGTTACGTACG + +~~~~~~~~tst3:{[}3,32]:~~gtacguaacc~g~gttaactt~cgguuacgtac + +~~~~~~~~tst3:{[}32,3]:~~gtacgtaacc~g~aagttaac~cggttacgtac + +~~~~~~~~clone\%~ + +~~~~Optionally,~you~can~specify~which~of~the~\char`\"{}fields\char`\"{}~in~the~matches + +~~~~you~wish~to~sort~on,~and~show\_hits~will~sort~them.~~The~field + +~~~~numbers~start~with~0.~~So,~you~might~get~something~like + +~~~~~~~~clone\%~scan\_for\_matches~-c~pat\_file~<~tmp~|~show\_hits~2~1 + +~~~~~~~~tst2:{[}2,31]:~~CGTACGUAAC~C~GGTTAACC~GGUUACGTACG + +~~~~~~~~tst2:{[}31,2]:~~CGTACGTAAC~C~GGTTAACC~GGTTACGTACG + +~~~~~~~~tst3:{[}32,3]:~~gtacgtaacc~g~aagttaac~cggttacgtac + +~~~~~~~~tst1:{[}1,28]:~~gtacguaacc~~~ggttaac~~cgguuacgtac + +~~~~~~~~tst1:{[}28,1]:~~gtacgtaacc~~~ggttaac~~cggttacgtac + +~~~~~~~~tst3:{[}3,32]:~~gtacguaacc~g~gttaactt~cgguuacgtac + +~~~~~~~~clone\%~ + +~~~~In~this~case,~the~hits~have~been~sorted~on~fields~2~and~1~(that~is, + +~~~~the~third~and~second~matched~subfields). + +~~~~show\_hits~is~just~one~possible~little~post-processor,~and~you + +~~~~might~well~wish~to~write~a~customized~one~for~yourself. 
+ +Reducing~the~Cost~of~a~Search + +~~~~The~scan\_for\_matches~utility~uses~a~fairly~simple~search,~and~may + +~~~~consume~large~amounts~of~CPU~time~for~complex~patterns.~~Someday, + +~~~~I~may~decide~to~optimize~the~code.~~However,~until~then,~let~me + +~~~~mention~one~useful~technique.~~ + +~~~~When~you~have~a~complex~pattern~that~includes~a~number~of~varying + +~~~~ranges,~imprecise~matches,~and~so~forth,~it~is~useful~to + +~~~~\char`\"{}pipeline\char`\"{}~matches.~~That~is,~form~a~simpler~pattern~that~can~be + +~~~~used~to~scan~through~a~large~database~extracting~sections~that + +~~~~might~be~matched~by~the~more~complex~pattern.~~Let~me~illustrate + +~~~~with~a~short~example.~~Suppose~that~you~really~wished~to~match~the + +~~~~pattern~ + +~~~~p1=3...5~0...8~\textasciitilde{}p1{[}1,1,0]~p2=6...7~3...6~AGC~3...5~RYGC~\textasciitilde{}p2{[}1,0,0] + +~~~~In~this~case,~the~pattern~units~AGC~3...5~RYGC~can~be~used~to~rapidly + +~~~~constrain~the~overall~search.~~You~can~preprocess~the~overall + +~~~~database~using~the~pattern: + +~~~~~~~~~~31...31~AGC~3...5~RYGC~7...7 + +~~~~Put~the~complex~pattern~in~pat\_file1~and~the~simpler~pattern~in + +~~~~pat\_file2.~~Then~use, + +~~~~~~~~scan\_for\_matches~-c~pat\_file2~<~nucleotide\_database~| + +~~~~~~~~scan\_for\_matches~pat\_file1 + +~~~~The~output~will~show~things~like + +~~~~>seqid:{[}232,280]{[}2,47] + +~~~~matches~pieces + +~~~~Then,~the~actual~section~of~the~sequence~that~was~matched~can~be + +~~~~easily~computed~as~{[}233,278]~(remember,~the~positions~start~from + +~~~~1,~not~0). 
+ +~~~~Let~me~finally~add,~you~should~do~a~few~short~experiments~to~see + +~~~~whether~or~not~such~pipelining~actually~improves~performance~-{}-~it + +~~~~is~not~always~obvious~where~the~time~is~going,~and~I~have + +~~~~sometimes~found~that~the~added~complexity~of~pipelining~actually + +~~~~slowed~things~up.~~It~gets~its~best~improvements~when~there~are + +~~~~exact~matches~of~more~than~just~a~few~characters~that~can~be + +~~~~rapidly~used~to~eliminate~large~sections~of~the~database. + +============= + +Additions: + +Feb~9,~1995:~~~the~pattern~units~\textasciicircum{}~and~\$~now~work~as~in~normal~regular + +~~~~~~~~~~~~~~~expressions.~~That~is + +~~~~~~~~~~~~~~~~~~~~~~~~TTF~\$ + +~~~~~~~~~~~~~~~matches~only~TTF~at~the~end~of~the~string~and~ + +~~~~~~~~~~~~~~~~~~~~~~~~\textasciicircum{}~TTF~ + +~~~~~~~~~~~~~~~matches~only~an~initial~TTF + +~~~~~~~~~~~~~~~The~pattern~unit~ + +~~~~~~~~~~~~~~~~~~~~~~~~> matrix makepattern +/Pat1 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke + 0 4 M 4 8 L 8 4 L 4 0 L 0 4 L stroke} +>> matrix makepattern +/Pat2 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 0 M 0 8 L + 8 8 L 8 0 L 0 0 L fill} +>> matrix makepattern +/Pat3 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -4 8 M 8 -4 L + 0 12 M 12 0 L stroke} +>> matrix makepattern +/Pat4 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -4 0 M 8 12 L + 0 -4 M 12 8 L stroke} +>> matrix makepattern +/Pat5 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -2 8 M 4 -4 L + 0 12 M 8 -4 L 4 12 M 10 0 L stroke} +>> matrix makepattern +/Pat6 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -2 0 M 4 12 L + 0 -4 M 8 12 L 4 -4 M 10 8 L stroke} +>> matrix makepattern +/Pat7 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 8 -2 M -4 4 L + 12 0 M -4 8 L 12 4 M 0 10 L stroke} +>> matrix makepattern +/Pat8 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 -2 M 12 4 L + -4 0 M 12 8 L -4 4 M 8 10 L stroke} +>> 
matrix makepattern +/Pat9 exch def +/Pattern1 {PatternBgnd KeepColor Pat1 setpattern} bind def +/Pattern2 {PatternBgnd KeepColor Pat2 setpattern} bind def +/Pattern3 {PatternBgnd KeepColor Pat3 setpattern} bind def +/Pattern4 {PatternBgnd KeepColor Landscape {Pat5} {Pat4} ifelse setpattern} bind def +/Pattern5 {PatternBgnd KeepColor Landscape {Pat4} {Pat5} ifelse setpattern} bind def +/Pattern6 {PatternBgnd KeepColor Landscape {Pat9} {Pat6} ifelse setpattern} bind def +/Pattern7 {PatternBgnd KeepColor Landscape {Pat8} {Pat7} ifelse setpattern} bind def +} def +% +% +%End of PostScript Level 2 code +% +/PatternBgnd { + TransparentPatterns {} {gsave 1 setgray fill grestore} ifelse +} def +% +% Substitute for Level 2 pattern fill codes with +% grayscale if Level 2 support is not selected. +% +/Level1PatternFill { +/Pattern1 {0.250 Density} bind def +/Pattern2 {0.500 Density} bind def +/Pattern3 {0.750 Density} bind def +/Pattern4 {0.125 Density} bind def +/Pattern5 {0.375 Density} bind def +/Pattern6 {0.625 Density} bind def +/Pattern7 {0.875 Density} bind def +} def +% +% Now test for support of Level 2 code +% +Level1 {Level1PatternFill} {Level2PatternFill} ifelse +% +/Symbol-Oblique /Symbol findfont [1 0 .167 1 0 0] makefont +dup length dict begin {1 index /FID eq {pop pop} {def} ifelse} forall +currentdict end definefont pop +end +%%EndProlog +%%Page: 1 1 +gnudict begin +gsave +50 50 translate +0.100 0.100 scale +90 rotate +0 -5040 translate +0 setgray +newpath +(Helvetica) findfont 100 scalefont setfont +1.000 UL +LTb +410 660 M +63 0 V +6557 0 R +-63 0 V +350 660 M +( 0) Rshow +1.000 UL +LTb +410 1243 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 20) Rshow +1.000 UL +LTb +410 1826 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 40) Rshow +1.000 UL +LTb +410 2409 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 60) Rshow +1.000 UL +LTb +410 2991 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 80) Rshow +1.000 UL +LTb +410 3574 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 100) Rshow 
+1.000 UL +LTb +410 4157 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 120) Rshow +1.000 UL +LTb +410 4740 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 140) Rshow +1.000 UL +LTb +698 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr1) Rshow +grestore +1.000 UL +LTb +986 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr2) Rshow +grestore +1.000 UL +LTb +1273 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr3) Rshow +grestore +1.000 UL +LTb +1561 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr4) Rshow +grestore +1.000 UL +LTb +1849 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr5) Rshow +grestore +1.000 UL +LTb +2137 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr6) Rshow +grestore +1.000 UL +LTb +2425 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr7) Rshow +grestore +1.000 UL +LTb +2713 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr8) Rshow +grestore +1.000 UL +LTb +3000 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr9) Rshow +grestore +1.000 UL +LTb +3288 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr10) Rshow +grestore +1.000 UL +LTb +3576 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr11) Rshow +grestore +1.000 UL +LTb +3864 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr12) Rshow +grestore +1.000 UL +LTb +4152 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr13) Rshow +grestore +1.000 UL +LTb +4440 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr14) Rshow +grestore +1.000 UL +LTb +4727 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr15) Rshow +grestore +1.000 UL +LTb +5015 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr16) Rshow +grestore +1.000 UL +LTb +5303 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr17) Rshow +grestore +1.000 UL +LTb +5591 660 M +0 -60 
R +currentpoint gsave translate 90 rotate 0 0 M +(chr18) Rshow +grestore +1.000 UL +LTb +5879 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr19) Rshow +grestore +1.000 UL +LTb +6167 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chrX) Rshow +grestore +1.000 UL +LTb +6454 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chr5_random) Rshow +grestore +1.000 UL +LTb +6742 660 M +0 -60 R +currentpoint gsave translate 90 rotate 0 0 M +(chrY_random) Rshow +grestore +1.000 UL +LTb +1.000 UL +LTb +410 4740 N +410 660 L +6620 0 V +0 4080 V +-6620 0 V +Z stroke +3720 4890 M +(Chromosome Distribution) Cshow +1.000 UP +1.000 UL +LTb +1.000 UL +LT0 +/Helvetica findfont 100 scalefont setfont +1.000 698 660 97 2012 BoxColFill +698 660 N +0 2011 V +96 0 V +794 660 L +-96 0 V +Z stroke +1.000 986 660 97 4023 BoxColFill +986 660 N +0 4022 V +96 0 V +0 -4022 V +-96 0 V +Z stroke +1.000 1273 660 97 1196 BoxColFill +1273 660 N +0 1195 V +96 0 V +0 -1195 V +-96 0 V +Z stroke +1.000 1561 660 97 2332 BoxColFill +1561 660 N +0 2331 V +96 0 V +0 -2331 V +-96 0 V +Z stroke +1.000 1849 660 97 2857 BoxColFill +1849 660 N +0 2856 V +96 0 V +0 -2856 V +-96 0 V +Z stroke +1.000 2137 660 97 2187 BoxColFill +2137 660 N +0 2186 V +96 0 V +0 -2186 V +-96 0 V +Z stroke +1.000 2425 660 97 3148 BoxColFill +2425 660 N +0 3147 V +96 0 V +0 -3147 V +-96 0 V +Z stroke +1.000 2713 660 97 1021 BoxColFill +2713 660 N +0 1020 V +96 0 V +0 -1020 V +-96 0 V +Z stroke +1.000 3000 660 97 3178 BoxColFill +3000 660 N +0 3177 V +96 0 V +0 -3177 V +-96 0 V +Z stroke +1.000 3288 660 97 2274 BoxColFill +3288 660 N +0 2273 V +96 0 V +0 -2273 V +-96 0 V +Z stroke +1.000 3576 660 97 1371 BoxColFill +3576 660 N +0 1370 V +96 0 V +0 -1370 V +-96 0 V +Z stroke +1.000 3864 660 97 1954 BoxColFill +3864 660 N +0 1953 V +96 0 V +0 -1953 V +-96 0 V +Z stroke +1.000 4152 660 97 1458 BoxColFill +4152 660 N +0 1457 V +96 0 V +0 -1457 V +-96 0 V +Z stroke +1.000 4440 660 97 1400 
BoxColFill +4440 660 N +0 1399 V +96 0 V +0 -1399 V +-96 0 V +Z stroke +1.000 4727 660 97 2041 BoxColFill +4727 660 N +0 2040 V +96 0 V +0 -2040 V +-96 0 V +Z stroke +1.000 5015 660 97 817 BoxColFill +5015 660 N +0 816 V +96 0 V +0 -816 V +-96 0 V +Z stroke +1.000 5303 660 97 2566 BoxColFill +5303 660 N +0 2565 V +96 0 V +0 -2565 V +-96 0 V +Z stroke +1.000 5591 660 97 1400 BoxColFill +5591 660 N +0 1399 V +96 0 V +0 -1399 V +-96 0 V +Z stroke +1.000 5879 660 97 730 BoxColFill +5879 660 N +0 729 V +96 0 V +0 -729 V +-96 0 V +Z stroke +1.000 6167 660 96 1138 BoxColFill +6167 660 N +0 1137 V +95 0 V +0 -1137 V +-95 0 V +Z stroke +1.000 6454 660 97 59 BoxColFill +6454 660 N +0 58 V +96 0 V +0 -58 V +-96 0 V +Z stroke +1.000 6742 660 97 817 BoxColFill +6742 660 N +0 816 V +96 0 V +0 -816 V +-96 0 V +Z stroke +1.000 UL +LTb +410 4740 N +410 660 L +6620 0 V +0 4080 V +-6620 0 V +Z stroke +1.000 UP +1.000 UL +LTb +stroke +grestore +end +showpage +%%Trailer +%%DocumentFonts: Helvetica +%%Pages: 1 diff --git a/bp_doc/chrdist_ascii.png b/bp_doc/chrdist_ascii.png new file mode 100644 index 0000000..32b367f Binary files /dev/null and b/bp_doc/chrdist_ascii.png differ diff --git a/bp_doc/dotplot.pdf b/bp_doc/dotplot.pdf new file mode 100644 index 0000000..377e3eb Binary files /dev/null and b/bp_doc/dotplot.pdf differ diff --git a/bp_doc/dotplot.ps b/bp_doc/dotplot.ps new file mode 100644 index 0000000..7118d05 --- /dev/null +++ b/bp_doc/dotplot.ps @@ -0,0 +1,13881 @@ +%!PS-Adobe-2.0 +%%Creator: gnuplot 4.2 patchlevel 0 +%%CreationDate: Mon Sep 3 10:34:33 2007 +%%DocumentFonts: (atend) +%%BoundingBox: 50 50 554 770 +%%Orientation: Landscape +%%Pages: (atend) +%%EndComments +%%BeginProlog +/gnudict 256 dict def +gnudict begin +% +% The following 6 true/false flags may be edited by hand if required +% The unit line width may also be changed +% +/Color false def +/Blacktext false def +/Solid false def +/Dashlength 1 def +/Landscape true def +/Level1 false def +/Rounded false def 
+/TransparentPatterns false def +/gnulinewidth 5.000 def +/userlinewidth gnulinewidth def +% +/vshift -46 def +/dl1 { + 10.0 Dashlength mul mul + Rounded { currentlinewidth 0.75 mul sub dup 0 le { pop 0.01 } if } if +} def +/dl2 { + 10.0 Dashlength mul mul + Rounded { currentlinewidth 0.75 mul add } if +} def +/hpt_ 31.5 def +/vpt_ 31.5 def +/hpt hpt_ def +/vpt vpt_ def +Level1 {} { +/SDict 10 dict def +systemdict /pdfmark known not { + userdict /pdfmark systemdict /cleartomark get put +} if +SDict begin [ + /Title () + /Subject (gnuplot plot) + /Creator (gnuplot 4.2 patchlevel 0) + /Author (Martin Hansen) +% /Producer (gnuplot) +% /Keywords () + /CreationDate (Mon Sep 3 10:34:33 2007) + /DOCINFO pdfmark +end +} ifelse +% +% Gnuplot Prolog Version 4.2 (August 2006) +% +/M {moveto} bind def +/L {lineto} bind def +/R {rmoveto} bind def +/V {rlineto} bind def +/N {newpath moveto} bind def +/Z {closepath} bind def +/C {setrgbcolor} bind def +/f {rlineto fill} bind def +/vpt2 vpt 2 mul def +/hpt2 hpt 2 mul def +/Lshow {currentpoint stroke M 0 vshift R + Blacktext {gsave 0 setgray show grestore} {show} ifelse} def +/Rshow {currentpoint stroke M dup stringwidth pop neg vshift R + Blacktext {gsave 0 setgray show grestore} {show} ifelse} def +/Cshow {currentpoint stroke M dup stringwidth pop -2 div vshift R + Blacktext {gsave 0 setgray show grestore} {show} ifelse} def +/UP {dup vpt_ mul /vpt exch def hpt_ mul /hpt exch def + /hpt2 hpt 2 mul def /vpt2 vpt 2 mul def} def +/DL {Color {setrgbcolor Solid {pop []} if 0 setdash} + {pop pop pop 0 setgray Solid {pop []} if 0 setdash} ifelse} def +/BL {stroke userlinewidth 2 mul setlinewidth + Rounded {1 setlinejoin 1 setlinecap} if} def +/AL {stroke userlinewidth 2 div setlinewidth + Rounded {1 setlinejoin 1 setlinecap} if} def +/UL {dup gnulinewidth mul /userlinewidth exch def + dup 1 lt {pop 1} if 10 mul /udl exch def} def +/PL {stroke userlinewidth setlinewidth + Rounded {1 setlinejoin 1 setlinecap} if} def +% Default Line 
colors +/LCw {1 1 1} def +/LCb {0 0 0} def +/LCa {0 0 0} def +/LC0 {1 0 0} def +/LC1 {0 1 0} def +/LC2 {0 0 1} def +/LC3 {1 0 1} def +/LC4 {0 1 1} def +/LC5 {1 1 0} def +/LC6 {0 0 0} def +/LC7 {1 0.3 0} def +/LC8 {0.5 0.5 0.5} def +% Default Line Types +/LTw {PL [] 1 setgray} def +/LTb {BL [] LCb DL} def +/LTa {AL [1 udl mul 2 udl mul] 0 setdash LCa setrgbcolor} def +/LT0 {PL [] LC0 DL} def +/LT1 {PL [4 dl1 2 dl2] LC1 DL} def +/LT2 {PL [2 dl1 3 dl2] LC2 DL} def +/LT3 {PL [1 dl1 1.5 dl2] LC3 DL} def +/LT4 {PL [6 dl1 2 dl2 1 dl1 2 dl2] LC4 DL} def +/LT5 {PL [3 dl1 3 dl2 1 dl1 3 dl2] LC5 DL} def +/LT6 {PL [2 dl1 2 dl2 2 dl1 6 dl2] LC6 DL} def +/LT7 {PL [1 dl1 2 dl2 6 dl1 2 dl2 1 dl1 2 dl2] LC7 DL} def +/LT8 {PL [2 dl1 2 dl2 2 dl1 2 dl2 2 dl1 2 dl2 2 dl1 4 dl2] LC8 DL} def +/Pnt {stroke [] 0 setdash gsave 1 setlinecap M 0 0 V stroke grestore} def +/Dia {stroke [] 0 setdash 2 copy vpt add M + hpt neg vpt neg V hpt vpt neg V + hpt vpt V hpt neg vpt V closepath stroke + Pnt} def +/Pls {stroke [] 0 setdash vpt sub M 0 vpt2 V + currentpoint stroke M + hpt neg vpt neg R hpt2 0 V stroke + } def +/Box {stroke [] 0 setdash 2 copy exch hpt sub exch vpt add M + 0 vpt2 neg V hpt2 0 V 0 vpt2 V + hpt2 neg 0 V closepath stroke + Pnt} def +/Crs {stroke [] 0 setdash exch hpt sub exch vpt add M + hpt2 vpt2 neg V currentpoint stroke M + hpt2 neg 0 R hpt2 vpt2 V stroke} def +/TriU {stroke [] 0 setdash 2 copy vpt 1.12 mul add M + hpt neg vpt -1.62 mul V + hpt 2 mul 0 V + hpt neg vpt 1.62 mul V closepath stroke + Pnt} def +/Star {2 copy Pls Crs} def +/BoxF {stroke [] 0 setdash exch hpt sub exch vpt add M + 0 vpt2 neg V hpt2 0 V 0 vpt2 V + hpt2 neg 0 V closepath fill} def +/TriUF {stroke [] 0 setdash vpt 1.12 mul add M + hpt neg vpt -1.62 mul V + hpt 2 mul 0 V + hpt neg vpt 1.62 mul V closepath fill} def +/TriD {stroke [] 0 setdash 2 copy vpt 1.12 mul sub M + hpt neg vpt 1.62 mul V + hpt 2 mul 0 V + hpt neg vpt -1.62 mul V closepath stroke + Pnt} def +/TriDF {stroke [] 0 setdash vpt 1.12 mul 
sub M + hpt neg vpt 1.62 mul V + hpt 2 mul 0 V + hpt neg vpt -1.62 mul V closepath fill} def +/DiaF {stroke [] 0 setdash vpt add M + hpt neg vpt neg V hpt vpt neg V + hpt vpt V hpt neg vpt V closepath fill} def +/Pent {stroke [] 0 setdash 2 copy gsave + translate 0 hpt M 4 {72 rotate 0 hpt L} repeat + closepath stroke grestore Pnt} def +/PentF {stroke [] 0 setdash gsave + translate 0 hpt M 4 {72 rotate 0 hpt L} repeat + closepath fill grestore} def +/Circle {stroke [] 0 setdash 2 copy + hpt 0 360 arc stroke Pnt} def +/CircleF {stroke [] 0 setdash hpt 0 360 arc fill} def +/C0 {BL [] 0 setdash 2 copy moveto vpt 90 450 arc} bind def +/C1 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 90 arc closepath fill + vpt 0 360 arc closepath} bind def +/C2 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 90 180 arc closepath fill + vpt 0 360 arc closepath} bind def +/C3 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 180 arc closepath fill + vpt 0 360 arc closepath} bind def +/C4 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 180 270 arc closepath fill + vpt 0 360 arc closepath} bind def +/C5 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 90 arc + 2 copy moveto + 2 copy vpt 180 270 arc closepath fill + vpt 0 360 arc} bind def +/C6 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 90 270 arc closepath fill + vpt 0 360 arc closepath} bind def +/C7 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 270 arc closepath fill + vpt 0 360 arc closepath} bind def +/C8 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 270 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/C9 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 270 450 arc closepath fill + vpt 0 360 arc closepath} bind def +/C10 {BL [] 0 setdash 2 copy 2 copy moveto vpt 270 360 arc closepath fill + 2 copy moveto + 2 copy vpt 90 180 arc closepath fill + vpt 0 360 arc closepath} bind def +/C11 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 180 arc closepath fill + 2 copy moveto + 2 copy vpt 270 360 arc closepath fill + vpt 0 360 arc closepath} bind def 
+/C12 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 180 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/C13 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 90 arc closepath fill + 2 copy moveto + 2 copy vpt 180 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/C14 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 90 360 arc closepath fill + vpt 0 360 arc} bind def +/C15 {BL [] 0 setdash 2 copy vpt 0 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/Rec {newpath 4 2 roll moveto 1 index 0 rlineto 0 exch rlineto + neg 0 rlineto closepath} bind def +/Square {dup Rec} bind def +/Bsquare {vpt sub exch vpt sub exch vpt2 Square} bind def +/S0 {BL [] 0 setdash 2 copy moveto 0 vpt rlineto BL Bsquare} bind def +/S1 {BL [] 0 setdash 2 copy vpt Square fill Bsquare} bind def +/S2 {BL [] 0 setdash 2 copy exch vpt sub exch vpt Square fill Bsquare} bind def +/S3 {BL [] 0 setdash 2 copy exch vpt sub exch vpt2 vpt Rec fill Bsquare} bind def +/S4 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt Square fill Bsquare} bind def +/S5 {BL [] 0 setdash 2 copy 2 copy vpt Square fill + exch vpt sub exch vpt sub vpt Square fill Bsquare} bind def +/S6 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill Bsquare} bind def +/S7 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill + 2 copy vpt Square fill Bsquare} bind def +/S8 {BL [] 0 setdash 2 copy vpt sub vpt Square fill Bsquare} bind def +/S9 {BL [] 0 setdash 2 copy vpt sub vpt vpt2 Rec fill Bsquare} bind def +/S10 {BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt Square fill + Bsquare} bind def +/S11 {BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt2 vpt Rec fill + Bsquare} bind def +/S12 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill Bsquare} bind def +/S13 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill + 2 copy vpt Square fill Bsquare} bind def +/S14 {BL [] 0 setdash 2 copy exch vpt sub 
exch vpt sub vpt2 vpt Rec fill + 2 copy exch vpt sub exch vpt Square fill Bsquare} bind def +/S15 {BL [] 0 setdash 2 copy Bsquare fill Bsquare} bind def +/D0 {gsave translate 45 rotate 0 0 S0 stroke grestore} bind def +/D1 {gsave translate 45 rotate 0 0 S1 stroke grestore} bind def +/D2 {gsave translate 45 rotate 0 0 S2 stroke grestore} bind def +/D3 {gsave translate 45 rotate 0 0 S3 stroke grestore} bind def +/D4 {gsave translate 45 rotate 0 0 S4 stroke grestore} bind def +/D5 {gsave translate 45 rotate 0 0 S5 stroke grestore} bind def +/D6 {gsave translate 45 rotate 0 0 S6 stroke grestore} bind def +/D7 {gsave translate 45 rotate 0 0 S7 stroke grestore} bind def +/D8 {gsave translate 45 rotate 0 0 S8 stroke grestore} bind def +/D9 {gsave translate 45 rotate 0 0 S9 stroke grestore} bind def +/D10 {gsave translate 45 rotate 0 0 S10 stroke grestore} bind def +/D11 {gsave translate 45 rotate 0 0 S11 stroke grestore} bind def +/D12 {gsave translate 45 rotate 0 0 S12 stroke grestore} bind def +/D13 {gsave translate 45 rotate 0 0 S13 stroke grestore} bind def +/D14 {gsave translate 45 rotate 0 0 S14 stroke grestore} bind def +/D15 {gsave translate 45 rotate 0 0 S15 stroke grestore} bind def +/DiaE {stroke [] 0 setdash vpt add M + hpt neg vpt neg V hpt vpt neg V + hpt vpt V hpt neg vpt V closepath stroke} def +/BoxE {stroke [] 0 setdash exch hpt sub exch vpt add M + 0 vpt2 neg V hpt2 0 V 0 vpt2 V + hpt2 neg 0 V closepath stroke} def +/TriUE {stroke [] 0 setdash vpt 1.12 mul add M + hpt neg vpt -1.62 mul V + hpt 2 mul 0 V + hpt neg vpt 1.62 mul V closepath stroke} def +/TriDE {stroke [] 0 setdash vpt 1.12 mul sub M + hpt neg vpt 1.62 mul V + hpt 2 mul 0 V + hpt neg vpt -1.62 mul V closepath stroke} def +/PentE {stroke [] 0 setdash gsave + translate 0 hpt M 4 {72 rotate 0 hpt L} repeat + closepath stroke grestore} def +/CircE {stroke [] 0 setdash + hpt 0 360 arc stroke} def +/Opaque {gsave closepath 1 setgray fill grestore 0 setgray closepath} def +/DiaW {stroke [] 0 
setdash vpt add M + hpt neg vpt neg V hpt vpt neg V + hpt vpt V hpt neg vpt V Opaque stroke} def +/BoxW {stroke [] 0 setdash exch hpt sub exch vpt add M + 0 vpt2 neg V hpt2 0 V 0 vpt2 V + hpt2 neg 0 V Opaque stroke} def +/TriUW {stroke [] 0 setdash vpt 1.12 mul add M + hpt neg vpt -1.62 mul V + hpt 2 mul 0 V + hpt neg vpt 1.62 mul V Opaque stroke} def +/TriDW {stroke [] 0 setdash vpt 1.12 mul sub M + hpt neg vpt 1.62 mul V + hpt 2 mul 0 V + hpt neg vpt -1.62 mul V Opaque stroke} def +/PentW {stroke [] 0 setdash gsave + translate 0 hpt M 4 {72 rotate 0 hpt L} repeat + Opaque stroke grestore} def +/CircW {stroke [] 0 setdash + hpt 0 360 arc Opaque stroke} def +/BoxFill {gsave Rec 1 setgray fill grestore} def +/Density { + /Fillden exch def + currentrgbcolor + /ColB exch def /ColG exch def /ColR exch def + /ColR ColR Fillden mul Fillden sub 1 add def + /ColG ColG Fillden mul Fillden sub 1 add def + /ColB ColB Fillden mul Fillden sub 1 add def + ColR ColG ColB setrgbcolor} def +/BoxColFill {gsave Rec PolyFill} def +/PolyFill {gsave Density fill grestore grestore} def +/h {rlineto rlineto rlineto gsave fill grestore} bind def +% +% PostScript Level 1 Pattern Fill routine for rectangles +% Usage: x y w h s a XX PatternFill +% x,y = lower left corner of box to be filled +% w,h = width and height of box +% a = angle in degrees between lines and x-axis +% XX = 0/1 for no/yes cross-hatch +% +/PatternFill {gsave /PFa [ 9 2 roll ] def + PFa 0 get PFa 2 get 2 div add PFa 1 get PFa 3 get 2 div add translate + PFa 2 get -2 div PFa 3 get -2 div PFa 2 get PFa 3 get Rec + gsave 1 setgray fill grestore clip + currentlinewidth 0.5 mul setlinewidth + /PFs PFa 2 get dup mul PFa 3 get dup mul add sqrt def + 0 0 M PFa 5 get rotate PFs -2 div dup translate + 0 1 PFs PFa 4 get div 1 add floor cvi + {PFa 4 get mul 0 M 0 PFs V} for + 0 PFa 6 get ne { + 0 1 PFs PFa 4 get div 1 add floor cvi + {PFa 4 get mul 0 2 1 roll M PFs 0 V} for + } if + stroke grestore} def +% +/languagelevel where + {pop 
languagelevel} {1} ifelse + 2 lt + {/InterpretLevel1 true def} + {/InterpretLevel1 Level1 def} + ifelse +% +% PostScript level 2 pattern fill definitions +% +/Level2PatternFill { +/Tile8x8 {/PaintType 2 /PatternType 1 /TilingType 1 /BBox [0 0 8 8] /XStep 8 /YStep 8} + bind def +/KeepColor {currentrgbcolor [/Pattern /DeviceRGB] setcolorspace} bind def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke} +>> matrix makepattern +/Pat1 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke + 0 4 M 4 8 L 8 4 L 4 0 L 0 4 L stroke} +>> matrix makepattern +/Pat2 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 0 M 0 8 L + 8 8 L 8 0 L 0 0 L fill} +>> matrix makepattern +/Pat3 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -4 8 M 8 -4 L + 0 12 M 12 0 L stroke} +>> matrix makepattern +/Pat4 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -4 0 M 8 12 L + 0 -4 M 12 8 L stroke} +>> matrix makepattern +/Pat5 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -2 8 M 4 -4 L + 0 12 M 8 -4 L 4 12 M 10 0 L stroke} +>> matrix makepattern +/Pat6 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -2 0 M 4 12 L + 0 -4 M 8 12 L 4 -4 M 10 8 L stroke} +>> matrix makepattern +/Pat7 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 8 -2 M -4 4 L + 12 0 M -4 8 L 12 4 M 0 10 L stroke} +>> matrix makepattern +/Pat8 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 -2 M 12 4 L + -4 0 M 12 8 L -4 4 M 8 10 L stroke} +>> matrix makepattern +/Pat9 exch def +/Pattern1 {PatternBgnd KeepColor Pat1 setpattern} bind def +/Pattern2 {PatternBgnd KeepColor Pat2 setpattern} bind def +/Pattern3 {PatternBgnd KeepColor Pat3 setpattern} bind def +/Pattern4 {PatternBgnd KeepColor Landscape {Pat5} {Pat4} ifelse setpattern} bind def +/Pattern5 {PatternBgnd KeepColor Landscape {Pat4} {Pat5} ifelse setpattern} bind def +/Pattern6 {PatternBgnd KeepColor Landscape {Pat9} {Pat6} ifelse setpattern} bind def 
+/Pattern7 {PatternBgnd KeepColor Landscape {Pat8} {Pat7} ifelse setpattern} bind def +} def +% +% +%End of PostScript Level 2 code +% +/PatternBgnd { + TransparentPatterns {} {gsave 1 setgray fill grestore} ifelse +} def +% +% Substitute for Level 2 pattern fill codes with +% grayscale if Level 2 support is not selected. +% +/Level1PatternFill { +/Pattern1 {0.250 Density} bind def +/Pattern2 {0.500 Density} bind def +/Pattern3 {0.750 Density} bind def +/Pattern4 {0.125 Density} bind def +/Pattern5 {0.375 Density} bind def +/Pattern6 {0.625 Density} bind def +/Pattern7 {0.875 Density} bind def +} def +% +% Now test for support of Level 2 code +% +Level1 {Level1PatternFill} {Level2PatternFill} ifelse +% +/Symbol-Oblique /Symbol findfont [1 0 .167 1 0 0] makefont +dup length dict begin {1 index /FID eq {pop pop} {def} ifelse} forall +currentdict end definefont pop +end +%%EndProlog +%%Page: 1 1 +gnudict begin +gsave +50 50 translate +0.100 0.100 scale +90 rotate +0 -5040 translate +0 setgray +newpath +(Helvetica) findfont 140 scalefont setfont +gsave % colour palette begin +/maxcolors 0 def +/HSV2RGB { exch dup 0.0 eq {pop exch pop dup dup} % achromatic gray + { /HSVs exch def /HSVv exch def 6.0 mul dup floor dup 3 1 roll sub + /HSVf exch def /HSVi exch cvi def /HSVp HSVv 1.0 HSVs sub mul def + /HSVq HSVv 1.0 HSVs HSVf mul sub mul def + /HSVt HSVv 1.0 HSVs 1.0 HSVf sub mul sub mul def + /HSVi HSVi 6 mod def 0 HSVi eq {HSVv HSVt HSVp} + {1 HSVi eq {HSVq HSVv HSVp}{2 HSVi eq {HSVp HSVv HSVt} + {3 HSVi eq {HSVp HSVq HSVv}{4 HSVi eq {HSVt HSVp HSVv} + {HSVv HSVp HSVq} ifelse} ifelse} ifelse} ifelse} ifelse + } ifelse} def +/Constrain { + dup 0 lt {0 exch pop}{dup 1 gt {1 exch pop} if} ifelse} def +/YIQ2RGB { + 3 copy -1.702 mul exch -1.105 mul add add Constrain 4 1 roll + 3 copy -0.647 mul exch -0.272 mul add add Constrain 5 1 roll + 0.621 mul exch -0.956 mul add add Constrain 3 1 roll } def +/CMY2RGB { 1 exch sub exch 1 exch sub 3 2 roll 1 exch sub 3 1 roll exch } def 
+/XYZ2RGB { 3 copy -0.9017 mul exch -0.1187 mul add exch 0.0585 mul exch add + Constrain 4 1 roll 3 copy -0.0279 mul exch 1.999 mul add exch + -0.9844 mul add Constrain 5 1 roll -0.2891 mul exch -0.5338 mul add + exch 1.91 mul exch add Constrain 3 1 roll} def +/SelectSpace {ColorSpace (HSV) eq {HSV2RGB}{ColorSpace (XYZ) eq { + XYZ2RGB}{ColorSpace (CMY) eq {CMY2RGB}{ColorSpace (YIQ) eq {YIQ2RGB} + if} ifelse} ifelse} ifelse} def +/InterpolatedColor false def +/cF7 {sqrt} bind def % sqrt(x) +/cF5 {dup dup mul mul} bind def % x^3 +/cF15 {360 mul sin} bind def % sin(360x) +/pm3dround {maxcolors 0 gt {dup 1 ge + {pop 1} {maxcolors mul floor maxcolors 1 sub div} ifelse} if} def +/pm3dGamma 1.0 1.5 div def +/ColorSpace (RGB) def +Color true and { % COLOUR vs. GRAY map + InterpolatedColor { %% Interpolation vs. RGB-Formula + /g {stroke pm3dround /grayv exch def interpolate + SelectSpace setrgbcolor} bind def + }{ + /g {stroke pm3dround dup cF7 Constrain exch dup cF5 Constrain exch cF15 Constrain + SelectSpace setrgbcolor} bind def + } ifelse +}{ + /g {stroke pm3dround pm3dGamma exp setgray} bind def +} ifelse +1.000 UL +LTb +1.000 UL +LTa +1113 483 M +5849 0 V +stroke +LTb +1113 483 M +-63 0 V +5912 0 R +63 0 V +966 483 M +( 0) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 979 M +5849 0 V +stroke +LTb +1113 979 M +-63 0 V +5912 0 R +63 0 V +966 979 M +( 200000) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 1475 M +5849 0 V +stroke +LTb +1113 1475 M +-63 0 V +5912 0 R +63 0 V +-6059 0 R +( 400000) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 1971 M +5849 0 V +stroke +LTb +1113 1971 M +-63 0 V +5912 0 R +63 0 V +-6059 0 R +( 600000) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 2467 M +5849 0 V +stroke +LTb +1113 2467 M +-63 0 V +5912 0 R +63 0 V +-6059 0 R +( 800000) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 2963 M +5849 0 V +stroke +LTb +1113 2963 M +-63 0 V +5912 0 R +63 0 V +-6059 0 R +( 1e+06) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 3460 M +5849 0 V +stroke +LTb +1113 3460 M +-63 0 
V +5912 0 R +63 0 V +-6059 0 R +( 1.2e+06) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 3956 M +5849 0 V +stroke +LTb +1113 3956 M +-63 0 V +5912 0 R +63 0 V +-6059 0 R +( 1.4e+06) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 4452 M +5849 0 V +stroke +LTb +1113 4452 M +-63 0 V +5912 0 R +63 0 V +-6059 0 R +( 1.6e+06) Rshow +1.000 UL +LTb +1.000 UL +LTa +1113 483 M +0 4137 V +stroke +LTb +1113 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 0) Cshow +1.000 UL +LTb +1.000 UL +LTa +1825 483 M +0 4137 V +stroke +LTb +1825 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 200000) Cshow +1.000 UL +LTb +1.000 UL +LTa +2537 483 M +0 4137 V +stroke +LTb +2537 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 400000) Cshow +1.000 UL +LTb +1.000 UL +LTa +3249 483 M +0 4137 V +stroke +LTb +3249 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 600000) Cshow +1.000 UL +LTb +1.000 UL +LTa +3961 483 M +0 4137 V +stroke +LTb +3961 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 800000) Cshow +1.000 UL +LTb +1.000 UL +LTa +4673 483 M +0 4137 V +stroke +LTb +4673 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 1e+06) Cshow +1.000 UL +LTb +1.000 UL +LTa +5385 483 M +0 4137 V +stroke +LTb +5385 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 1.2e+06) Cshow +1.000 UL +LTb +1.000 UL +LTa +6096 483 M +0 4137 V +stroke +LTb +6096 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 1.4e+06) Cshow +1.000 UL +LTb +1.000 UL +LTa +6808 483 M +0 4137 V +stroke +LTb +6808 483 M +0 -63 V +0 4200 R +0 63 V +0 -4403 R +( 1.6e+06) Cshow +1.000 UL +LTb +1.000 UL +LTb +1113 4620 N +0 -4137 V +5849 0 V +0 4137 V +-5849 0 V +Z stroke +LCb setrgbcolor +140 2551 M +currentpoint gsave translate 90 rotate 0 0 M +(gi|6626253|gb|AE000511.1|_Helicobacter_pylori_26695,_complete_genome) Cshow +grestore +LTb +LCb setrgbcolor +4037 70 M +(gi|12057207|gb|AE001439.1|_Helicobacter_pylori_J99,_complete_genome ) Cshow +LTb +4037 4830 M +(plot_matches) Cshow +1.000 UP +1.000 UL +LTb +2.000 UL +LT0 +0.00 1.00 0.00 C /Helvetica findfont 140 
scalefont setfont +1113 483 M +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +-1 1661 R +1 -1660 R +1 0 V +0 1 V +1 0 V +1 1 R +0 1 R +0 -1 R +0 3169 R +0 -3169 R +1 0 V +0 3170 R +0 -3170 R +0 1 V +0 -1 R +0 1 R +1 0 V +0 3255 R +0 -3254 R +1 0 V +0 3254 R +0 1 R +0 -919 R +0 -1527 R +1 1 R +0 2445 R +0 -918 R +0 -2337 R +0 2337 R +0 -1527 R +0 2446 R +0 -919 R +0 -2337 R +1 0 V +0 45 R +0 534 R +0 -9 R +0 -525 R +0 2293 R +0 -1768 R +0 1350 R +0 -1920 R +0 1 V +0 2337 R +0 918 R +0 -2445 R +0 -810 R +1 0 R +0 570 R +0 -570 R +0 2338 R +0 918 R +0 -2446 R +0 1109 R +0 -1919 R +0 3256 R +0 -918 R +0 -1528 R +0 1 V +0 -811 R +1 1 V +1 0 V +0 1 V +0 3543 R +0 -3543 R +0 810 R +1 0 V +-1 -810 R +1 0 V +0 1 R +1 0 V +0 3256 R +0 -3256 R +1 0 V +-1 45 R +1 -45 R +0 1 R +0 3071 R +0 -734 R +0 -2292 R +0 765 R +0 2446 R +0 -3211 R +0 1874 R +0 -1919 R +1 0 V +-1 2338 R +1 -2338 R +0 2338 R +0 -2338 R +1140 502 L +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +0 1146 R +0 -1146 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +0 617 R +0 -617 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +0 3671 R +0 272 R +0 -3943 R +1 3943 R +0 -3943 R +0 1 V +1 765 R +0 2445 R +0 -918 R +0 -2337 R +0 3256 R +0 -2446 R +0 -765 R +0 3027 R +0 -735 R +1197 2834 L +0 -2338 R +0 579 R +0 -9 R +0 -525 R +0 525 R +0 -525 R +0 2293 R +1 0 V +-1 918 R +1 0 V +-1 -2445 R +1 0 V +-1 -232 R +1 1 V +-1 -580 R +1 1 V +0 44 R +0 1 V +0 3026 R +0 -3026 R +0 2292 R +0 
919 R +0 -2446 R +0 1109 R +0 -1874 R +1 0 V +-1 534 R +1 0 V +-1 2677 R +1 0 V +-1 -918 R +1 0 V +-1 -1528 R +1 0 V +0 -765 R +0 1 V +1 0 R +0 765 R +0 1528 R +0 -2293 R +1 0 V +0 1 V +1 1 R +0 2292 R +1 1769 R +0 -4061 R +0 2293 R +0 918 R +0 -2445 R +0 -766 R +0 -45 R +0 3257 R +0 -3212 R +0 1 V +1 0 R +0 2292 R +0 -2337 R +0 45 R +0 1874 R +0 -1874 R +0 -45 R +0 45 R +1 0 V +0 1 V +0 1873 R +0 1688 R +0 -3561 R +1 0 R +0 3717 R +0 -664 R +0 -2694 R +0 735 R +1 -1093 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +3 1018 R +1 -1015 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 R +1 1 R +1 1 R +1 0 R +0 1 R +1 0 R +0 1 R +1 0 R +1235 567 L +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +0 3260 R +0 -3260 R +1 0 R +0 1 V +1 0 V +1 1 R +0 2200 R +0 -2200 R +1 0 R +0 1 R +1 0 R +1 1 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 4 R +0 1 V +1 0 V +8 0 R +1 9 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1307 625 L +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +0 2314 R +0 -2314 R +0 1181 R +0 -1181 R +1 0 V +0 1 V +1 0 V +1 3 R +0 -2 R +0 2 R +0 -2 R +1 2 R +0 -2 R +1 1 R +1 0 R +0 1 R +2 1 R +1 1 R +1 0 V +0 2749 R +0 -2749 R +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 R +0 1 R +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +1 1 R +1 1 R +1 0 R +0 1 V +1 0 
R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1373 670 L +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 R +0 2684 R +0 1 V +1 -2685 R +1 1 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 R +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +0 3 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 R +1 1 R +1 1 V +0 55 R +0 1303 R +0 -1358 R +1 0 V +0 1 V +1 0 V +0 3879 R +0 -3879 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +0 2721 R +0 -703 R +0 -2018 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +1436 718 L +1 0 V +0 2478 R +0 1 V +0 -2479 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +1 1 V +0 666 R +0 -666 R +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 3069 R +0 -3069 R +1 1 V +0 1537 R +0 -1537 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1303 R +0 -1358 R +0 55 R +0 1 V +0 -56 R +0 1358 R +0 -1302 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1495 758 L +0 1 V +1 0 R +0 1 R +1 0 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 1 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +1 
1 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +0 2857 R +0 -2857 R +1 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1580 R +0 -1580 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 1 V +0 3264 R +1 -3264 R +1 0 V +0 1 R +1 0 V +0 2800 R +0 4 R +0 -2800 R +0 4 R +0 2792 R +0 4 R +0 -2800 R +0 4 R +0 -8 R +1554 801 L +0 2799 R +1 1 V +-1 3 R +1 0 V +1554 804 M +1 0 V +-1 4 R +1 0 V +-1 -7 R +1 0 V +0 2803 R +0 -3 R +0 -2797 R +0 4 R +0 1 V +0 2795 R +0 -2803 R +0 3 R +0 1 V +0 4 R +0 2795 R +0 -3 R +0 -2800 R +0 4 R +0 4 R +0 2792 R +0 -2800 R +0 4 R +0 4 R +0 2795 R +0 1 V +0 -4 R +1 0 V +1555 805 M +0 4 R +0 -8 R +1 0 V +-1 2804 R +0 -2800 R +0 4 R +0 -4 R +1 0 V +-1 4 R +1 0 V +-1 2796 R +1 0 V +0 -2803 R +0 2800 R +0 -2797 R +0 2800 R +0 -2796 R +0 1 V +1 0 V +-1 2792 R +0 -2797 R +0 1 R +1 2799 R +0 -2795 R +0 -4 R +0 2799 R +0 -3 R +0 -2792 R +0 2795 R +0 -2795 R +0 2792 R +0 -2800 R +0 2800 R +0 -2800 R +0 8 R +1 0 V +-1 2795 R +0 1 V +1 0 V +-1 -4 R +1 1 V +1557 802 M +1 1 V +0 2803 R +0 -2796 R +0 1 V +0 2792 R +0 -2800 R +0 2803 R +0 -3 R +0 -2800 R +0 8 R +0 2795 R +0 -2795 R +0 2795 R +1 0 V +-1 -3 R +1 0 V +1558 803 M +1 0 V +-1 8 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1566 816 L +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +0 1121 R +1 0 V +0 -860 R +0 -261 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 2 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 
0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +1 0 R +0 1 R +0 639 R +1 -639 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 R +0 4 R +0 -4 R +0 4 R +0 -4 R +1 0 V +0 4 R +0 -4 R +0 1 V +1 4 R +0 -4 R +1 0 R +0 1 V +0 4 R +0 -4 R +1628 859 L +0 1 R +1 0 V +-1 4 R +1 -4 R +1 1 V +0 -4 R +0 4 R +1 0 V +0 -4 R +0 4 R +0 -4 R +0 4 R +0 1 V +0 -5 R +0 1 V +0 4 R +1 0 V +0 -4 R +0 4 R +1 1 V +0 -4 R +0 4 R +1 0 V +0 1 V +0 -5 R +0 5 R +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +0 1 V +3 0 R +1 1 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1686 898 L +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +0 -1 R +0 1 R +1 0 R +0 1 R +1 0 R +1 1951 R +1 -1950 R +0 1982 R +1 -1981 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 2068 R +1 -2068 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 831 R +0 -831 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +1 1 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +11 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1760 943 L +0 1 V +1 0 R +0 1 V +1 0 V +0 1056 R +0 -1056 R +0 3350 R +0 -1931 R +0 -1419 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 1 R +1 0 R +0 1 
R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1824 988 L +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +2 0 R +0 1 V +1 0 V +0 1 V +1 0 R +0 3305 R +0 -3305 R +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1389 R +0 -1389 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1679 R +0 -1679 R +0 1 V +0 2255 R +0 -2255 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +0 2332 R +0 -2332 R +1 0 V +0 2333 R +0 -2333 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +1884 1029 L +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +0 1628 R +0 -1628 R +1 0 R +0 1 R +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +0 1188 R +0 -1188 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +0 
1792 R +0 572 R +0 -702 R +0 -1662 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1515 R +0 -1514 R +0 -525 R +0 -45 R +0 3072 R +0 -1153 R +0 1337 R +0 -2446 R +0 1527 R +0 1 V +0 -1759 R +0 2493 R +1939 3568 L +0 -2502 R +0 1 V +0 2501 R +0 -2501 R +0 1767 R +0 919 R +0 -2446 R +0 -240 R +1 0 V +-1 2502 R +1 0 V +-1 -735 R +1 0 V +0 -1767 R +0 -570 R +0 3072 R +0 -3072 R +0 3256 R +0 -918 R +0 -1528 R +0 -765 R +0 534 R +0 -9 R +1 1 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 1 R +0 2501 R +0 -734 R +0 -2337 R +0 570 R +1 0 R +0 -525 R +0 525 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 -9 R +0 9 R +0 1341 R +0 -1341 R +0 1 V +1 0 V +0 -9 R +0 9 R +0 -534 R +0 3211 R +0 -918 R +0 -1528 R +0 -231 R +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 233 R +0 2446 R +0 -2679 R +0 -577 R +0 577 R +0 1 V +0 1341 R +0 418 R +1 -1759 R +0 1341 R +0 1152 R +0 -2493 R +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +0 251 R +0 -251 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 3355 R +0 -2043 R +0 1220 R +0 -2531 R +0 1703 R +0 -1703 R +1973 1090 L +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 419 R +0 -419 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +0 1296 R +0 -1294 R +0 1 V +0 -3 R +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +0 -3 R +0 1 V +0 2 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +2034 1132 L +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 
1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +-1 1929 R +1 0 V +0 -1929 R +0 1 V +0 1928 R +0 -1928 R +0 1928 R +0 -1928 R +0 1928 R +0 1 V +0 -1929 R +1 0 V +0 1 R +1 0 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +0 1925 R +0 -1925 R +0 1925 R +0 -1925 R +0 1925 R +0 -1925 R +1 0 V +0 1 V +1 0 V +-1 -617 R +1 617 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 -595 R +0 595 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1 R +2089 1171 L +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 -338 R +0 338 R +0 1 R +1 0 V +0 1577 R +0 -1577 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +0 2173 R +0 -2173 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 3369 R +0 -3368 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +2152 1214 L +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 1 R +1 0 R +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +2 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 -124 R +0 124 R +0 1 V +1 0 V +1 1 R +1 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V 
+0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +2218 1260 L +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 810 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +2287 2117 L +0 -805 R +1 1 V +1 0 V +0 1 V +1 0 R +0 1584 R +1 -173 R +0 173 R +0 -1583 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 R +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +3 1 R +1 0 V +0 1 R +0 2352 R +0 -2563 R +0 211 R +1 0 R +0 1 V +1 0 V +1 9 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +2355 1367 L +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 -300 R +1 301 R +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 
V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +38 2926 R +72 144 R +13 -838 R +38 -2238 R +0 1 V +17 1754 R +0 -2456 R +22 1310 R +7 1398 R +2617 580 M +42 2455 R +29 -1362 R +1 0 V +0 1 V +1 0 V +2 1 R +1 1 R +1 0 V +0 1 V +1 0 V +-1 0 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +2 1 R +1 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +0 2436 R +1 -2436 R +0 1 V +1 0 R +2710 1687 L +0 1 R +1 0 R +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +0 1727 R +0 1 V +0 -1934 R +0 206 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +0 1210 R +1 -1210 R +0 1211 R +0 -1211 R +0 1211 R +0 -1211 R +0 1211 R +1 -1210 R +0 1210 R +0 -1210 R +0 1210 R +1 -1210 R +0 1211 R +0 -1211 R +0 1211 R +0 -1211 R +0 1 V +0 1210 R +0 -1210 R +0 1210 R +1 -1210 R +0 1211 R +0 -1211 R +0 1211 R +0 -1211 R +0 1211 R +0 -1211 R +0 1211 R +0 -1211 R +0 1211 R +0 -1211 R +2761 1724 L +0 1210 R +0 -1210 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 1 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 4 R +0 -3 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +2 2 
R +1 0 R +0 1 R +1 0 V +0 1 R +1 0 R +1 1 V +1 0 R +0 1 R +1 -5 R +1 0 V +0 1 R +1 5 R +1 1 R +1 2 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +0 2284 R +0 -2284 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +2827 1771 L +1 0 R +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 1 V +1 0 V +1 0 V +0 1 R +0 1073 R +0 -1073 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +0 2768 R +0 -2768 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +0 -1 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +2892 1815 L +0 1 V +2 1 R +0 -1181 R +1 1434 R +1 1357 R +0 324 R +0 -3045 R +1 1112 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +0 -615 R +0 2065 R +0 -1450 R +1 0 V +1 1 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +1 1 V +1 0 R +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 R +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +2963 1862 L +0 3 R +0 -1 R +0 2 R +0 1 R +0 -3 R +0 -2 R +0 1 R +0 1 R +0 3 R +0 -1 R +0 -3 R +0 -1 R +0 1 R +0 1 R +1 3 R +0 -1 R +0 -3 R +0 -1 R +0 1 R +0 1 R +0 3 R +0 -1 R +0 -1 R +0 1 R +0 -3 R +0 2 R +0 -1 R +0 2 R +0 1 R +0 -5 R +0 1 R +0 
1 R +0 3 R +0 -2 R +0 1 V +0 -1 R +0 -2 R +0 -1 R +0 2 R +0 -2 R +0 1 R +0 -1 R +0 1 R +0 4 R +0 -2 R +0 -1 R +0 2 R +0 1 R +0 -2 R +0 1 R +0 -3 R +0 -1 R +0 1 R +0 1 R +1 -1 R +0 1 V +0 3 R +0 -1 R +0 -2 R +0 1 V +0 2 R +0 -1 R +0 1 V +0 -2 R +0 2 R +0 -3 R +0 1 R +0 2 R +0 1 R +0 -2 R +0 1 R +0 -5 R +0 1 R +0 1 R +0 1 R +0 -1 R +0 2 R +0 1 R +0 -3 R +0 3 R +0 -1 R +0 -3 R +0 -1 R +0 1 R +0 1 R +0 1 R +1 0 V +-1 -1 R +1 0 V +-1 2 R +1 0 V +-1 1 R +1 0 V +-1 -3 R +1 0 V +-1 -2 R +1 0 V +-1 1 R +1 0 V +-1 1 R +1 0 V +-1 3 R +1 0 V +0 -1 R +0 -3 R +0 -1 R +0 1 R +0 1 R +0 1 R +0 -1 R +0 2 R +0 -3 R +0 2 R +0 2 R +0 -3 R +2966 1865 L +0 -3 R +0 5 R +0 -1 R +0 -3 R +0 1 V +0 1 R +0 1 R +0 1 V +0 -2 R +0 2 R +0 -3 R +0 1 R +0 2 R +0 1 R +0 -2 R +1 -1 R +0 -1 R +0 -2 R +0 1 R +0 2 R +0 -1 R +0 2 R +0 1 R +0 -3 R +0 3 R +0 -1 R +0 -3 R +0 -1 R +0 1 R +0 1 R +0 1 R +0 -1 R +0 2 R +0 -3 R +0 2 R +0 2 R +0 -2 R +0 1 V +0 -1 R +0 -2 R +0 -1 R +0 2 R +0 3 R +0 -5 R +0 1 R +0 1 R +0 1 R +0 -1 R +0 2 R +0 1 R +0 -3 R +0 3 R +1 0 V +-1 -1 R +1 0 V +-1 -3 R +1 0 V +-1 -1 R +1 0 V +-1 1 R +1 0 V +-1 1 R +1 0 V +-1 0 R +1 0 V +0 1 R +0 -1 R +0 2 R +0 -3 R +0 2 R +0 2 R +0 -2 R +0 1 V +0 -1 R +0 -2 R +0 -1 R +0 2 R +0 3 R +0 -5 R +0 1 R +0 1 R +0 1 R +0 -1 R +0 2 R +0 1 R +0 -3 R +0 3 R +0 -1 R +0 -3 R +0 -1 R +0 1 R +0 1 R +1 1 R +0 -1 R +0 2 R +0 -3 R +0 2 R +0 2 R +0 -3 R +0 1 V +0 -3 R +0 5 R +0 -1 R +0 -3 R +0 2 R +0 2 R +0 1 R +0 -2 R +0 -2 R +0 1 R +0 -1 R +0 -2 R +0 1 R +0 2 R +0 -1 R +0 2 R +0 1 R +0 -3 R +1 3 R +0 -2 R +2970 1866 L +0 -1 R +0 1 R +0 1 R +0 -1 R +0 -1 R +0 1 R +0 -3 R +0 2 R +0 -1 R +0 2 R +0 1 R +0 -5 R +0 1 R +0 1 R +0 3 R +0 -2 R +0 1 V +0 -1 R +0 -2 R +0 -1 R +0 1 R +0 1 R +0 1 R +0 -1 R +0 2 R +0 -4 R +0 1 R +0 2 R +0 -1 R +0 2 R +0 1 R +0 -3 R +0 2 R +0 -1 R +0 1 R +0 -3 R +0 2 R +0 -1 R +0 2 R +0 -2 R +0 3 R +1 0 V +0 -5 R +0 5 R +0 1 R +0 -2 R +0 -2 R +0 1 R +0 -1 R +0 -2 R +0 1 R +0 2 R +0 -1 R +0 2 R +0 1 R +0 -3 R +0 3 R +0 -1 R +0 -3 R +0 -1 R +0 
1 R +0 1 R +0 3 R +0 1 V +0 -1 R +1 0 V +-1 1 R +1 0 V +-1 0 R +1 0 V +-1 -2 R +1 0 V +-1 -2 R +1 0 V +-1 1 R +1 0 V +0 3 R +0 -1 R +0 1 R +0 -2 R +0 -2 R +0 1 R +0 3 R +1 0 V +0 1 V +1 0 V +0 1 V +0 2 R +1 0 V +-1 -1 R +1 0 V +-1 -1 R +1 0 V +0 2 R +0 -1 R +0 -1 R +0 2 R +0 -1 R +0 -1 R +0 2 R +0 -1 R +0 -1 R +0 1 R +0 -1 R +0 1 R +0 -1 R +1 1 R +0 -1 R +0 1 R +0 -1 R +2976 1871 L +0 1 V +0 -1 R +0 1 R +0 -1 R +0 -1 R +0 2 R +0 -1 R +0 -1 R +0 2 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +0 2373 R +0 -2373 R +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +3037 1914 L +1 0 R +0 1 V +1 0 V +2 1 V +0 1 R +1 0 R +1 0 V +0 1 V +0 1583 R +0 -1583 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 1 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +0 2 R +1 0 V +-1 -2 R +1 0 V +0 1 R +1 0 R +0 1 R +0 -1 R +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 26 R +0 1178 R +0 -1596 R +0 1 V +0 417 R +0 1178 R +0 -1595 R +0 393 R +1 0 V +1 1 V +0 -93 R +1 93 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +3096 1956 L +1 1 V +0 1057 R +0 -1057 R +0 1057 R +0 1 V +0 -1058 R +2 1 V +0 1 V +1 0 R +1 0 V +0 1 
R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 1 V +1 0 R +2 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 R +0 1039 R +0 -1039 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +3163 2002 L +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 1 R +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +7 13 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 -1358 R +0 55 R +0 -55 R +0 1358 R +1 1 V +1 0 V +0 1 V +1 0 V +3235 2060 L +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1086 R +0 -1086 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 1 R +1 0 R +0 7 R +0 -7 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R 
+1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 1 R +3300 2104 L +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +0 819 R +0 -819 R +0 819 R +0 -819 R +1 0 V +0 1 V +0 819 R +0 -819 R +1 0 V +0 1 V +1 0 V +-1 819 R +1 -819 R +0 1 R +1 0 V +1 2 R +1 1 R +0 1 R +1 0 R +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 1 R +1 1 R +1 1 R +1 0 V +0 -3 R +0 1 R +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +0 2169 R +0 -2169 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +3358 2145 L +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 -1661 R +0 1661 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +0 625 R +0 -437 R +0 -188 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 R +0 1 V +1 0 R +1 0 R +0 1 V +0 285 R +0 -1452 R +0 1167 R +1 0 V +0 1 V +1 0 V +1 1 V +0 -836 R +1 836 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +3420 2187 L +1 0 R +0 1 V +0 -245 R +0 245 R +1 0 R +0 1994 R +0 -1993 R +1 0 V +3 65 R +1 -65 R +1 0 V +0 1 R +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 
V +1 0 R +0 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 R +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 2063 R +0 -2063 R +0 1 R +3479 2225 L +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +0 -1188 R +0 1188 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +0 1374 R +1 -1374 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 333 R +0 -333 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 R +0 1 R +1 0 R +0 1 R +1 0 R +1 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +1 1 R +1 0 R +0 1 R +1 0 R +1 1 R +1 0 R +0 4 R +1 0 V +0 1 V +3544 2273 L +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +3 5 R +1 0 R +0 1 R +1 0 V +0 -1537 R +0 1537 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 313 R +1 -313 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +45 27 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 
R +0 1 V +3657 2349 L +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +-1 990 R +1 0 V +0 -990 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +-1 948 R +1 0 V +3677 945 M +1 0 V +0 1419 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 -1580 R +0 1580 R +2 1 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +0 -3 R +1 0 V +0 3 R +1 1 V +0 1880 R +0 -1880 R +0 1880 R +0 -1880 R +1 0 V +0 1 V +0 -950 R +0 950 R +1 0 R +1 1 V +0 1881 R +0 -1881 R +0 -949 R +1 949 R +0 1 V +0 572 R +0 -1522 R +0 1522 R +0 -572 R +0 1880 R +0 1 V +0 -1881 R +1 0 V +-1 572 R +1 0 V +0 -572 R +0 -949 R +0 2830 R +0 -1881 R +3705 2383 L +0 -950 R +0 2830 R +0 -1880 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1888 R +1 0 V +0 -1888 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 -1186 R +0 1187 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +0 -1389 R +0 1389 R +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1555 R +0 -1555 R +0 1 V +0 1554 R +0 -1554 R +1 0 V +1 1 R +0 1554 R +9 -2897 R +0 2502 R +0 -1153 R +0 -1874 R +0 2292 R +0 1 V +0 734 R +1 0 V +0 -1152 R +0 -1874 R +1 0 V +-1 1874 R +1 0 V +0 -1919 R +0 1919 R +0 1153 R +0 -2502 R +0 -570 R +0 1919 R +0 -1349 R +0 1349 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +1 1 V +0 -1342 R +0 1 V +0 -578 R +0 1919 R +0 -1874 R +0 -45 R +0 2338 R +0 1334 R +0 -3672 R +0 2338 R +0 -419 R +0 -1341 R +0 2493 R +0 1 R +0 -1153 R +3760 2420 L +-1 -1049 R +1 0 V +-1 2737 R +1 0 V +-1 -1688 R +1 0 V +0 1 R +1 0 V +0 1 V +0 -1296 R +0 1296 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 
0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +1 1 R +1 0 V +0 1 R +1 0 R +1 1 R +2 1 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +0 1 V +0 -1843 R +1 0 V +-1 1843 R +1 0 V +1 0 R +0 1 R +1 0 R +0 1 V +1 0 V +1 1 V +0 -48 R +0 48 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +3822 2465 L +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 1 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +0 1689 R +0 -3644 R +0 2124 R +0 1 V +0 -170 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 656 R +0 -656 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 1 R +0 -1596 R +0 1596 R +3884 2507 L +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +1 0 R +0 1 V +1 1 R +1 0 R +3 2 R +1 1 R +1 0 R +1 1 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +0 -179 R +0 180 R +1 0 R +1 1 V +1 0 V +0 1 R +1 
0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 1 R +3954 2555 L +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +0 983 R +0 -983 R +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 R +1 -1313 R +0 1313 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 R +0 135 R +1 0 V +-1 -133 R +1 0 V +-1 135 R +1 0 V +-1 -136 R +1 0 V +1 137 R +0 -136 R +0 135 R +1 0 V +-1 -135 R +1 0 V +0 135 R +0 -135 R +0 135 R +0 -135 R +0 135 R +0 -135 R +0 135 R +0 -135 R +2 136 R +0 -136 R +0 134 R +0 -132 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +4007 2592 L +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +0 -1145 R +0 1145 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +1 1 R +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +1 1 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +4070 2637 L +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +5 -1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 -169 R +0 -1955 R +0 2124 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 
1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +4141 2680 L +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +0 -1149 R +1 1149 R +1 1 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +0 -756 R +0 1 V +0 755 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 -135 R +0 135 R +0 -135 R +0 144 R +0 174 R +0 -174 R +0 1 V +0 -145 R +1 0 R +0 135 R +0 -134 R +0 134 R +0 -133 R +0 135 R +0 -136 R +0 134 R +0 1 V +1 0 V +0 -134 R +0 136 R +1 0 V +-1 -135 R +4194 2583 L +-1 -1 R +1 0 V +0 136 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 1 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +0 882 R +0 -882 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 7 R +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +2 1 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R 
+4255 2769 L +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +0 -626 R +0 626 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 1 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +-1 1630 R +1 -1630 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +0 613 R +1 0 V +1 5 R +2 -1050 R +0 990 R +0 -990 R +3 436 R +1 1 V +1 0 R +0 1 V +1 0 V +0 -373 R +0 373 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 R +4321 2814 L +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +4 -1805 R +1 1805 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 -1525 R +0 1526 R +0 1 R +0 1 V +1 302 R +0 1 V +0 -1596 R +0 418 R +1 0 R +0 1178 R +0 -1596 R +0 1 V +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1595 R +1 0 V +-1 -1595 R +1 0 V +0 1595 R +0 -1595 R +0 1595 R +0 1 V +0 -1596 R +0 1 V +0 1595 R +0 -1595 R +1 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1596 R +0 -1596 R +0 1596 R +1 0 V +-1 -1596 R +1 1 V +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +1 1596 R +0 -1596 R +0 1 V +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1596 R +1 0 V +-1 -1596 R +1 1 V +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +0 1595 R +0 -1595 R +2 -239 R +2 0 R +0 1527 R +0 -1527 R +0 1527 R +0 -1 R +0 -1 R +0 -1526 R +0 1527 R +1 -1 R +4360 2830 L +0 172 R +0 -172 R +0 2 R +0 -1527 R +0 1527 R +0 -1527 R +0 1527 R +1 0 V +-1 -1527 R +1 0 V +-1 1525 R +1 0 V +1 922 R +0 -3256 R +0 2337 R +0 
919 R +0 -2446 R +0 1109 R +0 1337 R +0 -2446 R +0 1528 R +0 918 R +1 0 V +-1 -2446 R +1 0 V +-1 1528 R +1 0 V +-1 -1759 R +1 0 V +0 1759 R +0 -1768 R +0 1768 R +0 918 R +0 -2445 R +0 -811 R +0 1 V +0 2337 R +0 918 R +0 -2445 R +0 -232 R +0 1 V +0 1758 R +0 918 R +0 -2445 R +0 1527 R +0 -2293 R +0 1 V +0 3026 R +0 185 R +0 -2446 R +0 1527 R +0 -2337 R +0 45 R +0 2292 R +1 0 V +0 919 R +0 -2446 R +0 1527 R +0 1 V +0 918 R +0 -2446 R +0 -810 R +0 579 R +0 3310 R +0 -1551 R +0 918 R +0 -918 R +0 918 R +0 -2445 R +0 1527 R +0 918 R +0 -2445 R +0 1527 R +0 918 R +0 -2445 R +0 1527 R +1 918 R +0 1 V +0 -2446 R +0 1527 R +0 919 R +0 -2446 R +0 1527 R +0 -2337 R +0 3256 R +0 -2446 R +0 1527 R +0 919 R +0 -2446 R +0 1527 R +0 1 V +0 918 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 -2445 R +0 1527 R +0 -1527 R +0 -766 R +0 -45 R +0 3257 R +0 -2446 R +0 1527 R +0 -1527 R +0 2446 R +1 -2446 R +4370 1312 L +0 2445 R +0 -919 R +0 -2337 R +0 570 R +0 1767 R +0 1 V +0 -1527 R +0 2445 R +0 -918 R +0 -1768 R +0 241 R +0 2445 R +0 -2445 R +1 0 V +-1 2446 R +1 0 V +0 725 R +0 -1903 R +0 259 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 -1543 R +0 453 R +0 1090 R +0 1 R +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 -659 R +0 659 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1748 R +0 -1748 R +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 R +4422 2874 L +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 266 R +0 -266 R +0 -2190 R +0 2190 R +1 0 V +0 266 R +0 -266 R +0 266 R +1 -2456 R +0 2190 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +0 73 R +0 -73 R +1 0 V +1 1 V +1 0 V +-1 4 R +1 -4 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 
0 V +0 -4 R +0 4 R +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +0 -1885 R +0 1885 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1217 R +0 -1217 R +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 -1574 R +0 1574 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +4479 2912 L +0 1 R +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +0 -1 R +0 1 R +0 -1 R +1 1 R +1 0 V +0 1 R +0 1 R +0 -1 R +1 0 V +-1 1 R +1 0 V +0 -1 R +0 2 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 1 V +0 -1 R +0 1 R +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 -819 R +0 819 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 -819 R +0 819 R +1 1 V +1 0 V +0 1 R +0 -819 R +0 819 R +1 0 V +0 -1210 R +0 1211 R +0 -1211 R +1 0 V +-1 1211 R +1 0 V +0 -1211 R +0 1211 R +0 -1210 R +0 1210 R +0 -1210 R +1 0 V +-1 1210 R +1 0 V +0 -1210 R +0 1211 R +0 -1211 R +1 0 V +-1 1211 R +1 0 V +0 -1211 R +0 1 V +0 1210 R +0 -1210 R +0 1210 R +0 -1210 R +0 1211 R +0 -1211 R +0 1211 R +1 -1211 R +0 1211 R +0 -1211 R +0 1211 R +0 -1211 R +0 1211 R +0 -1211 R +0 1 V +0 1210 R +1 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +4517 2939 L +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 -1 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +0 -73 R +0 73 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +0 -576 R +0 -3 R +1 0 V +0 3 R +1 1 V +0 1880 R +0 -1880 R +0 -950 R +0 950 R +1 0 V +0 -949 R +0 950 R +1 0 R +0 1 V +1 0 R +0 1881 R +0 -1881 R +1 0 V +-1 -949 R +1 0 V +0 949 R +0 1 V +0 572 R +0 -1522 R +0 1522 R +1 0 V +-1 -572 R +1 0 V +-1 1880 R +0 1 V +1 -1881 R +0 572 R +0 -572 R +0 -949 R +0 2830 R +0 -1881 R +0 1 V 
+0 -950 R +0 2830 R +0 -1880 R +1 0 V +0 572 R +1 0 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +4562 2963 L +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 R +0 1 V +1 0 R +0 1206 R +0 -1206 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +0 1 R +1 -1894 R +0 1894 R +1 0 V +0 1 R +1 0 V +0 350 R +0 -349 R +1 420 R +0 -420 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +1 0 R +0 1 V +4624 3007 L +1 1 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 1 V +1 -290 R +0 -1944 R +0 1800 R +2 435 R +1 1 R +0 14 R +0 -14 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 R +0 -654 R +0 1246 R +0 -591 R +1 0 V +0 -14 R +0 14 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 -775 R +0 775 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +1 1 R +1 0 V +0 1 V +1 0 R +4689 3054 L +1 0 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 1 R +1 0 R +0 1 V +1 0 V +0 1155 R +0 1 V +0 -1156 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 
0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +5 -2510 R +10 2558 R +4 -543 R +54 542 R +1 0 V +28 -23 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +4 3 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 -18 R +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +14 -1394 R +15 2815 R +4 -3489 R +1 3109 R +1 0 V +0 1 R +1 0 V +2 -1028 R +1 1 R +1 1033 R +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +2 1 V +1 1 V +1 0 R +1 1 V +2 1 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 1 R +1 0 V +7 -413 R +1 425 R +1 0 V +0 1 R +24 419 R +68 -172 R +21 -2146 R +5014 2264 L +12 2164 R +15 -3250 R +11 1261 R +19 -228 R +1 0 V +0 1 R +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 V +0 -1 R +0 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +5097 497 M +19 34 R +39 2826 R +0 -471 R +1 471 R +0 -2332 R +0 2332 R +0 1 V +1 0 V +0 -2333 R +0 2333 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 -2173 R +0 2173 R +1 1 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 1 R +2 1 R +0 1 R +1 0 R +1 1 V +1 -3 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +5203 3387 L +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 R +1 -1447 R +0 1447 R +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +1 1 R +1 0 R +0 1 V +5 1 R +1 0 V +1 1 R +1 0 V +0 1 V +0 -614 R +0 1 V +1 0 V +-1 613 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 -615 R +0 616 R +0 -616 R +0 616 R +1 0 R +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V 
+1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +3 0 R +1 1 R +1 0 V +0 1 R +1 0 R +1 1 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +5276 3432 L +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 24 R +0 -24 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 R +1 0 R +2 1 R +0 753 R +0 -753 R +0 753 R +0 -753 R +5342 3479 L +0 752 R +0 -752 R +1 0 V +-1 752 R +1 1 V +0 -753 R +0 753 R +0 -753 R +0 753 R +0 -753 R +0 753 R +1 0 V +-1 -753 R +1 1 V +0 753 R +1 0 V +-1 -753 R +1 0 V +0 753 R +2 2 V +-2 -755 R +0 1 V +2 1 V +0 753 R +0 -753 R +0 753 R +0 -753 R +0 753 R +0 -753 R +1 0 V +-1 753 R +1 0 V +0 -753 R +0 753 R +0 -752 R +0 752 R +0 1 V +0 -753 R +1 0 V +-1 753 R +1 0 V +0 -753 R +0 753 R +0 -753 R +0 753 R +0 -752 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +0 -2288 R +0 -294 R +0 2185 R +0 -1480 R +0 1275 R +0 1 V +0 -2052 R +0 2053 R +0 -2053 R +0 71 R +1 0 V +0 -70 R +0 2050 R +0 1 V +0 601 R +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 687 R +0 -687 R +1 1 V +1 0 V +0 1 R +1 0 V +5376 3502 L +1 0 V +1 0 R +0 1 R +1 0 V +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V 
+1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +5426 882 M +1 0 V +0 2655 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +2 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 R +5446 3550 L +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 R +0 -1153 R +0 1153 R +0 -3027 R +0 1875 R +0 -1350 R +0 2502 R +0 -734 R +1 0 V +-1 918 R +1 0 V +-1 -2445 R +1 0 V +-1 -232 R +1 1 V +-1 -580 R +1 1 V +0 3071 R +0 -3071 R +0 569 R +0 1 V +0 2501 R +0 1 V +0 -735 R +0 1 V +0 918 R +0 -2446 R +0 1109 R +1 0 V +0 1 V +0 1152 R +1 -733 R +0 -1527 R +0 2261 R +1 0 V +0 1 V +1 0 V +1 1 R +0 -734 R +0 -2337 R +0 577 R +0 1 V +0 2493 R +1 0 V +0 -2493 R +0 1341 R +0 1152 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +1 1 V +0 -2429 R +0 2429 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +5490 3580 L +0 1 V +1 0 V +1 0 R +0 1 V +0 82 R +1 -82 R +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 4 R +0 -2800 R +0 4 R +0 2792 R +0 4 R +0 -2800 R +0 4 R +0 -8 R +0 2800 R +0 1 V +0 3 R +0 -2800 R +0 4 R +0 -7 R +0 2800 R +0 3 R +0 -2800 R +0 4 R +0 -7 R +0 3 R +0 4 R +0 2793 R +1 0 V +5519 804 M +0 4 R +0 1 V +0 -8 R +0 2803 R +0 -2803 R +1 0 V +-1 3 R +1 0 
V +-1 5 R +1 0 V +-1 2795 R +1 0 V +0 -2803 R +0 3 R +0 1 V +0 4 R +0 2792 R +0 -2800 R +0 4 R +0 4 R +0 2795 R +0 1 V +0 -2800 R +0 4 R +0 -8 R +0 2804 R +0 -2800 R +0 4 R +0 -8 R +0 2800 R +0 -2796 R +0 4 R +0 2796 R +0 -2804 R +0 2800 R +0 -2796 R +0 4 R +0 2796 R +0 -2800 R +5521 805 L +-1 4 R +1 0 V +0 -7 R +0 2800 R +0 3 R +0 -2800 R +0 1 R +0 2799 R +0 -2799 R +0 -4 R +0 2800 R +0 -2797 R +0 1 V +0 2796 R +0 -2800 R +0 4 R +0 2796 R +0 -2800 R +0 3 R +0 1 R +0 2796 R +0 -2800 R +0 2800 R +0 -2792 R +0 -4 R +0 2796 R +0 -2800 R +0 3 R +0 1 R +0 2796 R +0 -2800 R +0 2800 R +0 -2792 R +0 -4 R +0 2796 R +0 -2800 R +0 3 R +0 1 R +0 2796 R +0 -2800 R +0 2800 R +0 -2792 R +0 -4 R +0 2796 R +0 -2800 R +0 3 R +0 1 R +0 2796 R +0 -2800 R +0 2800 R +0 -2792 R +0 -4 R +0 2796 R +0 -2800 R +0 3 R +0 1 R +0 2796 R +0 -2800 R +0 2800 R +0 -2792 R +1 2795 R +0 -2795 R +0 -4 R +0 1 V +1 0 V +0 2796 R +0 3 R +0 1 V +0 -2804 R +0 5 R +0 2799 R +0 -4 R +0 -2800 R +0 1 V +0 4 R +0 2869 R +0 -70 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 -882 R +0 882 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 108 R +1 -104 R +0 1 R +1 0 R +5542 3623 L +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +0 -151 R +0 151 R +1 1 V +1 0 V +0 36 R +0 -36 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 -2857 R +0 2857 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +0 -865 R +0 865 R +1 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +0 -36 R +0 36 R +1 0 V +0 1 V +1 0 V +0 -3169 R +0 3169 R 
+5600 3664 L +1 0 V +-1 -82 R +1 0 V +0 -3088 R +0 3170 R +1 1 R +1 0 R +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +1 -773 R +2 775 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +0 -1 R +0 1 V +1 0 V +-1 -1 R +1 1 R +1 1 V +1 0 V +0 1 V +0 -2180 R +0 2180 R +1 0 R +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 -1305 R +0 1305 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +5663 3707 L +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +1 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 -1031 R +0 1031 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +3 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 -3254 R +0 3254 R +0 1 V +0 -919 R +0 -1527 R +1 -335 R +0 -474 R +0 2337 R +0 -1527 R +0 2446 R +0 -1 R +0 -2445 R +0 2445 R +0 -2445 R +1 2445 R +0 -2445 R +1 -810 R +0 2337 R +0 919 R +0 -2446 R +0 1109 R +0 1337 R +0 -2446 R +0 1528 R +0 918 R +0 -2446 R +0 1528 R +0 -1759 R +0 1759 R +0 -1768 R +0 1768 R +0 918 R +0 -2445 R +0 -811 R +5730 497 L +0 2337 R +0 918 R +0 -2445 R +0 -232 R +0 1 V +0 1758 R +0 918 R +0 -2445 R +0 1527 R +0 -2293 R +1 1 V +-1 3026 R +1 185 R +0 -2446 R +0 1527 R +0 -2337 R +0 45 R +0 
2292 R +0 919 R +0 -2446 R +0 1527 R +0 1 V +0 918 R +0 -2446 R +0 -810 R +0 579 R +0 3310 R +0 -1551 R +0 918 R +0 -918 R +1 918 R +0 -2445 R +0 1527 R +0 918 R +0 -2445 R +0 1527 R +0 918 R +0 -2445 R +0 1527 R +0 918 R +0 1 V +0 -2446 R +0 1527 R +0 919 R +0 -2446 R +0 1527 R +0 -2337 R +0 3256 R +0 -2446 R +0 1527 R +0 919 R +1 0 V +-1 -2446 R +1 0 V +-1 1527 R +1 1 V +0 -2338 R +0 810 R +0 1528 R +1 0 R +0 1 R +0 -2293 R +1 2293 R +0 1 V +1 0 V +0 -2293 R +0 2293 R +1 0 R +0 -2337 R +0 45 R +0 765 R +0 2446 R +0 -2446 R +0 1 V +0 2445 R +0 -919 R +0 1 V +0 1334 R +0 -3672 R +0 2338 R +0 1334 R +0 -415 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +5753 3768 L +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +0 -2278 R +0 2278 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 R +5821 3815 L +0 1 V +0 -3070 R +0 3070 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +1 1 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +2 0 R +0 1 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V 
+0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 -3260 R +0 3260 R +0 -3260 R +0 3260 R +1 5 R +1 0 R +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +5885 3864 L +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +1 1 V +2 1 V +1 1 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 R +2 1 R +0 -1019 R +0 -1392 R +1 2412 R +1 0 V +1 1 V +1 1 V +1 0 V +0 1 V +2 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +1 1 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +0 -1 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +5961 3917 L +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +0 -2132 R +0 2132 R +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 R +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 555 R +0 -555 R +0 1 V +1 0 V +0 2 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 1 R +1 0 R +0 1 V +1 0 V +0 1 V +1 -451 R +0 451 R +1 0 R +0 1 V +0 -1554 R +1 0 V +0 1554 R +6025 3964 L +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 R +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V 
+0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +0 5 R +0 -5 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 -5 R +0 5 R +0 1 R +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 R +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +6092 4010 L +0 1 V +1 0 V +1 0 R +0 1 R +1 0 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 0 R +0 1 R +1 0 V +0 1 R +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 8 R +0 -8 R +0 8 R +0 -8 R +0 9 R +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +2 2 R +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +0 -1089 R +0 1089 R +1 0 V +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +1 0 V +-1 482 R +1 -481 R +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 R +0 -2284 R +0 2284 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +6154 4063 L +1 0 V +6154 798 M +1 0 V +0 3265 R +1 1 V +0 -880 R +0 880 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +4 383 R +8 -1431 R +6 1109 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 1 R +0 1 V +1 0 R +1 1 V +1 1 V +2 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R 
+1 0 R +2 2 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +0 -3028 R +0 3028 R +6238 4169 L +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 -1334 R +0 -2338 R +0 2338 R +0 1334 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +1 1 V +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +0 -687 R +0 687 R +1 0 R +0 1 R +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +1 0 R +0 1 R +1 3 R +1 0 V +0 1 V +2 0 R +18 15 R +1 -1 R +0 272 R +0 -3943 R +0 3943 R +0 -3943 R +0 3672 R +0 271 R +1 -272 R +0 272 R +0 -3943 R +0 3671 R +0 272 R +0 -272 R +0 1 V +0 271 R +0 -271 R +0 271 R +1 -271 R +1 0 R +0 1 V +1 0 R +0 1 R +1 1 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +6312 4223 L +0 1 R +1 0 R +0 -1148 R +0 1148 R +0 -2627 R +0 2627 R +1 1 R +0 1 R +1 0 V +1 0 V +1 3 R +1 0 V +0 1 V +1 0 R +2 -752 R +0 753 R +0 -753 R +0 753 R +0 -753 R +0 1 V +0 752 R +0 -752 R +1 0 V +-1 752 R +1 1 V +0 -753 R +0 753 R +0 -753 R +0 753 R +0 -753 R +0 753 R +1 0 V +-1 -753 R +1 1 V +0 753 R +1 0 V +-1 -753 R +1 0 V +0 753 R +2 2 V +-2 -755 R +0 1 V +2 1 V +0 753 R +0 -753 R +0 753 R +0 -753 R +0 753 R +0 -753 R +1 0 V +-1 753 R +1 0 V +0 -753 R +0 753 R +0 -752 R +0 752 R +0 1 V +0 -753 R +1 0 V +-1 753 R +1 0 V +0 -753 R +0 753 R +0 -753 R +0 753 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +0 -3593 R +1 0 V +0 3593 R +0 1 R +0 -2577 R +0 2577 R +1 0 V +0 1 V +1 0 R +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 -2796 R +0 2796 R +6350 4251 L +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +0 -2830 R +0 950 R +1 1880 R +0 1 V +1 0 V +0 -1880 R +0 -950 R +0 2830 R +1 1 V 
+0 -2830 R +0 2830 R +1 1 V +1 0 V +0 -1880 R +0 -950 R +0 2830 R +0 1 V +1 0 V +1 0 R +0 1 V +0 -2831 R +0 2831 R +0 -2830 R +0 949 R +0 1881 R +1 0 V +-1 -2830 R +1 0 V +0 2830 R +0 1 V +1 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +0 -2063 R +0 2063 R +1 281 R +1 -280 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +6405 4291 L +1 0 V +1 0 R +0 1 R +1 0 V +1 1 V +1 0 R +0 1 R +1 0 V +0 1 V +0 -3350 R +0 3350 R +1 0 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +-1 -2169 R +1 2169 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +0 1 R +0 -1 R +1 0 R +0 1 V +1 0 R +0 -1 R +0 1 R +0 -1 R +2 2 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 R +6467 4333 L +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +1 1 R +1 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +0 -1410 R +0 1410 R +1 0 R +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 V +0 1 
R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 R +1 1 V +6533 4379 L +0 1 V +1 0 V +0 1 R +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 R +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 R +0 1 V +1 -623 R +0 623 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +-1 -2936 R +1 0 V +0 2936 R +0 1 V +1 0 V +1 0 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 R +1 0 R +0 1 R +1 0 R +1 1 R +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 R +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 R +0 1 V +1 0 V +0 1 V +0 -1631 R +1 1631 R +1 0 V +0 1 R +1 0 V +0 1 R +1 0 R +1 0 V +0 1 V +6596 4423 L +0 1 V +1 0 R +1 1 R +1 0 R +0 1 V +1 0 V +1 -3404 R +0 3109 R +1 0 R +2 296 R +1 0 R +1 0 V +1 0 V +0 2 R +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +1 1 V +2 1 R +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +2 2 R +0 1 V +1 0 R +0 6 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +2 1 R +1 1 V +1 0 V +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +0 -3398 R +0 3398 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +0 -33 R +0 33 R +1 0 R +0 1 V +1 0 V +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +6664 4475 L +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +0 -272 R +1 0 V +-1 272 R +1 0 V +0 1 R +6 -2448 R +20 901 R +5 -631 R +0 1 V +5 316 R +3 -219 R +0 379 R +0 -626 R +14 1219 R +7 -1667 R +1 1 V +1 2594 R +5 -3565 R +85 1092 R +29 2714 R +1 0 R +0 1 R 
+1 0 R +1 1 R +1 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 0 V +0 1 R +1 0 R +0 1 V +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 R +1 0 R +0 1 V +1 0 R +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 3 R +0 -3 R +0 3 R +1 -3 R +0 1 V +0 2 R +0 1 V +0 -3 R +0 3 R +0 -3 R +0 3 R +0 -3 R +1 0 V +0 3 R +0 -3 R +0 3 R +0 -2 R +1 0 V +0 3 R +0 -3 R +0 3 R +0 -3 R +1 1 V +0 -3 R +0 3 R +0 -3 R +1 0 V +-1 3 R +1 0 V +0 -3 R +0 3 R +0 -3 R +0 3 R +0 -3 R +0 1 V +0 2 R +0 1 V +0 -3 R +0 3 R +0 -3 R +6882 4559 L +-1 3 R +1 0 R +0 -3 R +0 3 R +0 -3 R +0 3 R +0 -3 R +0 3 R +0 1 V +1 0 R +0 -3 R +0 3 R +0 -3 R +0 3 R +1 0 R +0 1 V +0 -4 R +0 4 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 V +0 1 V +1 0 V +1 1 R +1 0 R +0 1 R +1 0 V +0 1 V +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +0 -3879 R +1 3879 R +0 1 V +1 0 V +0 1 R +1 0 R +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +0 1 V +1 0 V +1 0 V +0 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 V +0 1 R +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +0 -2417 R +0 595 R +1 1825 R +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 R +0 1 R +1 0 V +0 1 R +1 0 V +1 0 V +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +6939 4604 L +0 1 V +1 0 V +1 1 R +1 -3993 R +1 3993 R +0 1 V +1 0 V +0 1 V +1 0 V +1 0 V +0 1 R +4 0 R +0 1 V +1 0 R +0 1 V +1 0 V +1 1 V +1 0 R +0 1 V +1 0 V +0 6 R +0 -6 R +0 1 V +0 3 R +0 2 R +0 -5 R +1 0 V +0 5 R +0 1 V +0 -6 R +0 6 R +0 -4137 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +0 4131 R +1 1 R +1 0 R +0 1 V +0 2 R +0 -2 R +1 0 R +1 1 V +-1 2 R +1 0 V +-1 -6 R +1 0 V +0 4 R +1 0 V +0 1 
V +stroke +LT0 +1.00 0.00 0.00 C /Helvetica findfont 140 scalefont setfont +6941 3743 M +-39 84 R +-12 657 R +-28 -817 R +-12 824 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 -488 R +0 488 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 3 R +0 -2 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +6823 4510 L +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 1 R +0 -1 V +0 1 R +-1 -482 R +1 0 V +-1 482 R +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +0 1 R +6793 4531 L +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 
R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 -522 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +0 8 R +0 -8 R +0 8 R +0 -8 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 0 R +0 1 R +0 45 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +6765 4074 L +-1 0 R +-1 1 R +1 0 V +-2 0 R +0 1 R +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-3 5 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 1 R +-1 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +-6 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +6720 4104 L +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +0 1 R +0 -1688 R +0 1688 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +1 0 V +-2 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R 
+-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +0 -2436 R +0 2436 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +0 -3427 R +0 3427 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +6691 4125 L +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-2 -59 R +0 1 R +0 -1 V +0 1 R +0 58 R +-2 -59 R +0 1 R +0 58 R +-2 -59 R +-29 -796 R +6638 1343 M +-2 695 R +-2 1 R +-1 1 R +-15 1783 R +-29 -930 R +-6 3 R +6553 1296 M +-11 1518 R +-17 -512 R +-40 1765 R +6464 2871 M +-10 1592 R +-7 -3294 R +-19 2846 R +-3 -2877 R +-1 2957 R +-6 -1190 R +6411 522 M +0 2125 R +0 -1 V +0 -169 R +-10 950 R +0 -703 R +-82 1 R +0 173 R +6313 595 M +-20 3602 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +6283 4204 L +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +-1 0 R +6243 1015 M +-13 555 R +-3 2 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +2 -1 V +-3 2 R +1 -1 V +-2 2 R +1 -1 V +-2 1 R +-1 1 R +1 -1 V +-1 1 R +-1 2 R +3 -2 V +-6 3 R +6196 1594 L +-1 2944 R +-1 -2943 R +-1 0 R +1 0 V +-1 0 
R +1 0 V +-1 0 R +0 1 R +1 -1 V +-1 1481 R +1 -1 V +-16 758 R +-5 203 R +0 -1 V +-2 2 R +-2 -2755 R +-4 3273 R +-7 -943 R +6079 2373 M +-42 538 R +1 0 V +6025 496 M +0 570 R +0 2502 R +0 -1153 R +0 1337 R +0 -2446 R +-1 1528 R +1 -1 V +-1 -1758 R +0 2493 R +0 -2502 R +0 1 R +0 -1 V +-1 2502 R +1 0 V +-1 -2501 R +1 0 V +-1 1767 R +0 919 R +0 -2446 R +0 -240 R +0 2502 R +0 -735 R +0 -1767 R +0 -570 R +0 3072 R +0 -3072 R +0 3256 R +0 -918 R +0 -1528 R +0 -765 R +0 534 R +-1 -9 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 2501 R +0 -734 R +0 -2337 R +-1 570 R +1 0 V +-1 0 R +0 -525 R +0 2237 R +0 94 R +-26 -427 R +-28 -242 R +-19 784 R +-44 629 R +-7 -1323 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 1566 R +-14 625 R +5847 680 M +-15 3756 R +5821 2751 M +-12 1803 R +5783 1711 M +-24 2330 R +5737 2441 M +0 724 R +0 -2150 R +0 1389 R +0 -1389 R +-7 2948 R +0 -1554 R +-15 2197 R +5716 4606 L +5651 2055 M +-1 455 R +5622 1460 M +-2 1519 R +1 0 V +-1 0 R +-6 101 R +-69 186 R +-6 -2203 R +0 -283 R +-2 2836 R +5479 2404 M +-5 3 R +0 477 R +0 -476 R +0 -1 V +-1 1555 R +-1 1 R +1 0 V +-1 -1555 R +0 1 R +-38 908 R +-16 178 R +-8 -318 R +-32 250 R +-10 103 R +-10 -41 R +-3 -1849 R +0 1 R +0 2623 R +0 -3358 R +0 734 R +0 -734 R +0 734 R +0 2625 R +0 -3359 R +0 734 R +-1 1 R +0 -1 V +0 2624 R +0 -664 R +0 -2694 R +0 735 R +0 2848 R +0 -1371 R +0 239 R +0 908 R +5246 1755 M +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +0 1 R +0 338 R +0 -338 R +-1 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-6 1 R +0 1 R +0 -1 V +-1 1 R +-1 1 R +-8 1987 R +-3 -324 R +0 -703 R +5159 1355 M +0 -1 V +-4 40 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-2 1 R +0 3 R +0 -3 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +-1 0 R +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-3 1 R +1 -1 V +-1 1 R +-1 0 R +5144 1401 L +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R 
+0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-2 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 1666 R +0 -1666 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +5115 1421 L +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 2830 R +0 -2830 R +-1 1 R +0 2830 R +0 -2830 R +-1 2831 R +0 -2830 R +-1 0 R +-1 2831 R +0 -2830 R +0 950 R +0 -950 R +0 2830 R +-1 -2830 R +0 2831 R +0 -2831 R +0 2831 R +0 -2831 R +0 1 R +0 949 R +0 -949 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 950 R +0 -950 R +-1 1 R +1 -1 V +-1 2831 R +1 0 V +-1 -1881 R +0 -949 R +0 2830 R +0 -2830 R +0 2830 R +0 -2830 R +-1 0 R +0 1 R +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +5088 1439 L +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V 
+-1 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +0 -1 V +0 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +-1 2261 R +0 -2261 R +0 1 R +-1 0 R +0 1966 R +0 -2721 R +-1 755 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +5051 1464 L +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 196 R +0 -196 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +1 -1 V +-2 1 R +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +1 0 V +-1 0 R +-1 6 R +0 -6 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +5025 1482 L +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +0 1 R +0 
-1 V +-1 1 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 0 R +4998 1501 L +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +4967 1523 L +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 1 R +-1 0 R +0 1 R +-1 0 R +0 -365 R +0 366 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1149 R +0 -1 V +0 -1147 R +-1 2483 R +-1 -2478 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +4934 1550 L +0 1 R +-1 -719 R +1 0 V +-1 719 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +1 0 V +-2 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V 
+-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +0 1620 R +0 -1620 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-5 2 R +-1 0 R +1 0 V +-5 1881 R +-4 -1868 R +-1 0 R +-1 1 R +-1 1 R +-1 0 R +4889 1587 L +-1 0 R +0 1 R +0 -1 V +0 1 R +1 0 V +-2 0 R +1 0 V +-1 1 R +1 -1 V +-3 2 R +1 -1 V +-1 2 R +1 -1 V +-3 1 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +-1 1 R +0 2 R +3 -2 V +-6 3 R +1 -1 V +-1 2944 R +-1 -2943 R +-1 0 R +0 1 R +0 -1 V +0 1481 R +0 -1 V +4862 1508 M +-15 1598 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-2 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +-1 2 R +3 -2 V +-5 2 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-5 -1454 R +-1 1 R +1 -1 V +-1 1 R +-2 1106 R +4794 1108 M +-4 514 R +-47 -8 R +-6 6 R +-3 2 R +-6 1489 R +-4 -31 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +4708 1823 M +-45 1340 R +-8 1298 R +0 -1606 R +0 -1392 R +0 -683 R +0 1944 R +0 -144 R +0 144 R +0 1759 R +0 -3702 R +0 -1 V +0 2118 R +-3 1273 R +-4 -1489 R +-14 -307 R +4624 1245 M +-20 -90 R +-2 2750 R +-9 -2143 R +-10 1916 R +0 -1 R +4507 1617 M +-30 481 R +-8 2201 R +4449 2783 M +4443 906 M +-1 3359 R +-9 -512 R +0 -918 R +0 -1528 R +0 -765 R +-5 3416 R +-1 -2285 R +4428 1673 L +-1 0 R +-1 0 R +-3 2095 R +1 0 V +-6 563 R +4408 1798 M +-16 2808 R +0 -1 V +-22 -647 R +-7 5 R +-1 -1554 R +-1 1879 R +4354 697 M +1 0 V +-6 1143 R +1 0 V +4348 698 M +1 -1 V +-42 2610 R +-2 1270 R +-7 -2497 R +-6 -323 R +-1 5 R +-1 0 R +1 0 V +-15 2346 R +0 -2737 R +0 
2587 R +4234 1810 M +-1 0 R +1 0 V +-1 0 R +-12 88 R +1 0 V +-2 -92 R +-5 -344 R +-9 -566 R +0 317 R +-38 3247 R +-7 -1712 R +-12 679 R +-8 -1110 R +0 -96 R +-2 20 R +1 0 V +-25 375 R +4091 945 M +0 3350 R +4083 740 M +-27 2384 R +-6 985 R +4040 2662 M +0 1 R +-13 673 R +4015 1666 M +-4 2259 R +0 -1 V +3943 720 M +-25 3781 R +3914 893 M +-27 1500 R +1 0 V +3858 544 M +0 2293 R +3840 945 M +0 3350 R +3830 1362 M +-26 1143 R +-44 -101 R +-1 0 R +0 -1389 R +0 1389 R +-2 1 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1555 R +0 -1554 R +0 -1 V +0 1555 R +0 -1554 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +0 1554 R +0 -1554 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +3750 2410 L +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 -1349 R +0 2502 R +0 -1153 R +0 -1874 R +0 2293 R +0 -1 V +-1 735 R +0 -1152 R +3741 542 M +1 0 V +-1 1874 R +1 0 V +3741 497 M +0 1919 R +0 1153 R +0 -2502 R +0 -570 R +0 1919 R +0 -1349 R +0 1350 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +-2 2 R +0 -1919 R +0 1919 R +0 -1341 R +0 -1 V +0 -577 R +0 1919 R +0 -1874 R +0 -45 R +0 1919 R +0 -1919 R +0 2338 R +0 1334 R +0 -3672 R +0 2338 R +0 -419 R +0 -1341 R +-1 1342 R +-5 2069 R +-8 -1110 R +-2 -870 R +3693 781 M +3679 502 M +1 0 V +-8 961 R +-13 2948 R +-7 -2091 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +3644 2325 L +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +0 1012 R +0 -1012 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 
R +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 145 R +-1 -145 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-12 336 R +-5 -293 R +3598 2387 L +-3 1722 R +-13 485 R +-3 -727 R +-1 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +3547 2275 M +-28 -772 R +-45 1177 R +3464 1448 M +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +-4 2468 R +0 -1 V +3431 496 M +0 2228 R +-1 1150 R +-15 -649 R +-3 -386 R +0 1334 R +0 -3672 R +0 2338 R +-35 1269 R +-44 -747 R +-5 -1742 R +-16 2915 R +3246 788 M +1 0 V +-15 -42 R +-49 3613 R +-12 -184 R +-3 83 R +3156 1285 M +-37 2065 R +3080 1561 M +-14 2759 R +3062 817 M +-129 864 R +-3 1453 R +0 -1595 R +-1 1595 R +1 0 V +-1 -1595 R +1 0 V +-1 1596 R +0 -1596 R +-14 2313 R +-6 -949 R +2896 834 M +-1 2053 R +0 -1982 R +-10 1848 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-5 -9 R +-5 507 R +2865 784 M +-18 1326 R +-13 604 R +2833 650 M +-9 3591 R +-6 -2377 R +1 0 V +-1 1541 R +-1 1 R +-1 5 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 -5 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +0 -613 R +0 613 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-2 -615 R +-1 0 R +1 0 V +-1 617 R +-1 0 R +-17 -451 R +-18 700 R +0 -36 R +2759 1617 M +-69 1524 R +0 1 R +2690 3141 L +-1 1 R +0 -266 R +0 266 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 -656 R +1 0 V +-1 656 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V 
+0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 -1086 R +0 1086 R +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +2662 3161 L +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +2634 3180 L +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 880 R +0 -1 V +-1 -878 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 -1620 R +0 1621 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +1 0 V +-2 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 
R +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +2605 3200 L +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-2 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 -2348 R +0 2349 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +0 -2516 R +0 2455 R +0 64 R +-1 -1552 R +0 1550 R +2572 3222 L +-1 1 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +0 5 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +2546 3246 L +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 -8 R +0 8 
R +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 0 R +2514 3268 L +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +0 -2256 R +0 156 R +-1 2100 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 -2065 R +0 615 R +0 1450 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +2486 3287 L +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-2 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 1 R 
+-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +2453 3310 L +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +1 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +-1 0 R +-1 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +2421 3332 L +0 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +-1 1 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +1 -1 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 83 R +0 -1662 R +0 1579 R +0 1 R +0 -1 V +-1 1 R +1 0 V +-1 0 R +-1 0 R +1 0 V +-1 0 R +1 0 V +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 0 R +0 1 R +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +0 1 R +0 -1 V +-1 1 R +-1 -989 R +0 990 R +-10 275 R +1 0 V +-1 -387 R +-23 -835 R +-14 66 R +-8 254 R +-16 -696 R +-20 1928 R +-23 578 R +-4 -1033 R +-6 892 R +2256 1997 M +-15 -869 R +-1 1 R +0 -1 V +-2 1306 R +1 0 V +-25 993 R +2215 3427 L +-1 -703 R +1 0 V +-1 1150 R +-18 -866 R +-38 -264 R +0 -1 V +-4 -1527 R +0 -1 
V +-3 1512 R +2131 1063 M +0 47 R +0 2317 R +0 -847 R +0 -1800 R +0 221 R +0 -220 R +0 -1 V +0 2118 R +0 1586 R +0 -1586 R +0 -173 R +-21 -213 R +-30 1878 R +-38 -86 R +0 -3762 R +-50 1861 R +1976 902 M +-4 1953 R +-1 1695 R +1958 641 M +-1 1763 R +-1 0 R +-3 1107 R +0 451 R +-1 -1554 R +0 1555 R +-6 -5 R +-1 0 R +-1 1 R +1 0 V +-1 0 R +-1 1 R +-1 0 R +1 0 V +-1 0 R +0 1 R +0 -1 V +0 1 R +-1 0 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 -450 R +0 451 R +-1 0 R +1 0 V +-1 1 R +0 -1 V +0 1 R +-1 -1554 R +0 1554 R +0 -2838 R +-4 2496 R +0 -1336 R +1900 519 M +0 -1 V +-36 2321 R +0 1334 R +0 -3672 R +0 2338 R +0 1334 R +1762 2477 M +0 -1955 R +0 2125 R +0 -1 V +-4 1397 R +-24 74 R +-1 -711 R +1711 1463 M +0 2411 R +0 -3168 R +0 2192 R +0 -1435 R +0 1964 R +0 324 R +1696 906 M +0 734 R +0 2625 R +0 -3359 R +-1 2715 R +0 -2531 R +-3 2 R +-9 1635 R +-13 180 R +1636 1143 M +-26 2344 R +0 -2288 R +0 2764 R +1609 780 M +0 -74 R +0 295 R +-9 2440 R +1585 1463 M +-1 -682 R +-17 1151 R +-30 135 R +1489 746 M +1489 745 L +0 1 R +-13 -48 R +1 0 V +-1 1 R +1 0 V +-1 1358 R +0 -1303 R +0 1 R +-18 2273 R +-12 -784 R +-12 617 R +1409 746 M +0 -1 V +0 1 R +-2 1210 R +0 1182 R +0 -1595 R +-18 897 R +-2 1394 R +-23 707 R +1330 3475 M +-35 128 R +0 -2799 R +0 4 R +0 2799 R +1274 1597 M +0 1479 R +0 1148 R +1253 3192 M +-28 966 R +1206 834 M +1 0 V +-1 2052 R +0 -1 V +-1 -481 R +-1 0 R +-5 -1266 R +0 2373 R +-1 -1103 R +-1 1555 R +1183 1420 M +-18 -381 R +-1 590 R +0 -1 V +-25 776 R +1 0 V +-1 0 R +-5 4 R +0 -1 V +0 1555 R +-1 -2529 R +1 0 V +-1 975 R +0 1555 R +1122 935 M +stroke +1.000 UL +LTb +1113 4620 N +0 -4137 V +5849 0 V +0 4137 V +-5849 0 V +Z stroke +1.000 UP +1.000 UL +LTb +stroke +grestore +end +showpage +%%Trailer +%%DocumentFonts: Helvetica +%%Pages: 1 diff --git a/bp_doc/histogram.png b/bp_doc/histogram.png new file mode 100644 index 0000000..fb5325e Binary files /dev/null and b/bp_doc/histogram.png differ diff --git a/bp_doc/karyogram.png b/bp_doc/karyogram.png new file mode 
100644 index 0000000..ec72292 Binary files /dev/null and b/bp_doc/karyogram.png differ diff --git a/bp_doc/karyogram.svg b/bp_doc/karyogram.svg new file mode 100644 index 0000000..b5a4b1d --- /dev/null +++ b/bp_doc/karyogram.svg @@ -0,0 +1,1182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 3 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 6 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 7 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 8 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 9 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 11 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 12 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 13 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 14 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + 15 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 16 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 17 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 18 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 19 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 20 + + + + + + + + + + + + + + + + + + + + + + + + + + 21 + + + + + + + + + + + + + + + + + + + + + + + + + + + + 22 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + X + + + + + + + + + + + + + + + + + + + + + + + Y + + + + \ No newline at end of file diff --git a/bp_doc/lendist.pdf b/bp_doc/lendist.pdf new file mode 100644 index 0000000..fe82147 Binary files /dev/null and b/bp_doc/lendist.pdf differ diff --git a/bp_doc/lendist.ps b/bp_doc/lendist.ps new file mode 100644 index 0000000..97c878f --- /dev/null +++ b/bp_doc/lendist.ps @@ -0,0 +1,817 @@ +%!PS-Adobe-2.0 +%%Creator: gnuplot 4.2 patchlevel 0 +%%CreationDate: Mon Sep 3 10:28:29 2007 +%%DocumentFonts: (atend) +%%BoundingBox: 50 50 554 770 +%%Orientation: Landscape +%%Pages: (atend) +%%EndComments +%%BeginProlog +/gnudict 256 dict def +gnudict begin +% +% The following 6 true/false flags may be edited by hand if required +% The unit line width may also be changed +% +/Color false def +/Blacktext false def +/Solid false def +/Dashlength 1 def +/Landscape true def +/Level1 false def +/Rounded false def +/TransparentPatterns false def +/gnulinewidth 5.000 def +/userlinewidth gnulinewidth def +% +/vshift -33 def +/dl1 { + 10.0 Dashlength mul mul + Rounded { currentlinewidth 0.75 mul sub dup 0 le { pop 0.01 } if } if +} def +/dl2 { + 10.0 Dashlength mul mul + Rounded { currentlinewidth 0.75 mul add } if +} def +/hpt_ 31.5 def +/vpt_ 31.5 def +/hpt hpt_ def +/vpt vpt_ def +Level1 {} { +/SDict 10 dict def +systemdict /pdfmark known not { + userdict /pdfmark systemdict /cleartomark 
get put +} if +SDict begin [ + /Title () + /Subject (gnuplot plot) + /Creator (gnuplot 4.2 patchlevel 0) + /Author (Martin Hansen) +% /Producer (gnuplot) +% /Keywords () + /CreationDate (Mon Sep 3 10:28:29 2007) + /DOCINFO pdfmark +end +} ifelse +% +% Gnuplot Prolog Version 4.2 (August 2006) +% +/M {moveto} bind def +/L {lineto} bind def +/R {rmoveto} bind def +/V {rlineto} bind def +/N {newpath moveto} bind def +/Z {closepath} bind def +/C {setrgbcolor} bind def +/f {rlineto fill} bind def +/vpt2 vpt 2 mul def +/hpt2 hpt 2 mul def +/Lshow {currentpoint stroke M 0 vshift R + Blacktext {gsave 0 setgray show grestore} {show} ifelse} def +/Rshow {currentpoint stroke M dup stringwidth pop neg vshift R + Blacktext {gsave 0 setgray show grestore} {show} ifelse} def +/Cshow {currentpoint stroke M dup stringwidth pop -2 div vshift R + Blacktext {gsave 0 setgray show grestore} {show} ifelse} def +/UP {dup vpt_ mul /vpt exch def hpt_ mul /hpt exch def + /hpt2 hpt 2 mul def /vpt2 vpt 2 mul def} def +/DL {Color {setrgbcolor Solid {pop []} if 0 setdash} + {pop pop pop 0 setgray Solid {pop []} if 0 setdash} ifelse} def +/BL {stroke userlinewidth 2 mul setlinewidth + Rounded {1 setlinejoin 1 setlinecap} if} def +/AL {stroke userlinewidth 2 div setlinewidth + Rounded {1 setlinejoin 1 setlinecap} if} def +/UL {dup gnulinewidth mul /userlinewidth exch def + dup 1 lt {pop 1} if 10 mul /udl exch def} def +/PL {stroke userlinewidth setlinewidth + Rounded {1 setlinejoin 1 setlinecap} if} def +% Default Line colors +/LCw {1 1 1} def +/LCb {0 0 0} def +/LCa {0 0 0} def +/LC0 {1 0 0} def +/LC1 {0 1 0} def +/LC2 {0 0 1} def +/LC3 {1 0 1} def +/LC4 {0 1 1} def +/LC5 {1 1 0} def +/LC6 {0 0 0} def +/LC7 {1 0.3 0} def +/LC8 {0.5 0.5 0.5} def +% Default Line Types +/LTw {PL [] 1 setgray} def +/LTb {BL [] LCb DL} def +/LTa {AL [1 udl mul 2 udl mul] 0 setdash LCa setrgbcolor} def +/LT0 {PL [] LC0 DL} def +/LT1 {PL [4 dl1 2 dl2] LC1 DL} def +/LT2 {PL [2 dl1 3 dl2] LC2 DL} def +/LT3 {PL [1 dl1 1.5 
dl2] LC3 DL} def +/LT4 {PL [6 dl1 2 dl2 1 dl1 2 dl2] LC4 DL} def +/LT5 {PL [3 dl1 3 dl2 1 dl1 3 dl2] LC5 DL} def +/LT6 {PL [2 dl1 2 dl2 2 dl1 6 dl2] LC6 DL} def +/LT7 {PL [1 dl1 2 dl2 6 dl1 2 dl2 1 dl1 2 dl2] LC7 DL} def +/LT8 {PL [2 dl1 2 dl2 2 dl1 2 dl2 2 dl1 2 dl2 2 dl1 4 dl2] LC8 DL} def +/Pnt {stroke [] 0 setdash gsave 1 setlinecap M 0 0 V stroke grestore} def +/Dia {stroke [] 0 setdash 2 copy vpt add M + hpt neg vpt neg V hpt vpt neg V + hpt vpt V hpt neg vpt V closepath stroke + Pnt} def +/Pls {stroke [] 0 setdash vpt sub M 0 vpt2 V + currentpoint stroke M + hpt neg vpt neg R hpt2 0 V stroke + } def +/Box {stroke [] 0 setdash 2 copy exch hpt sub exch vpt add M + 0 vpt2 neg V hpt2 0 V 0 vpt2 V + hpt2 neg 0 V closepath stroke + Pnt} def +/Crs {stroke [] 0 setdash exch hpt sub exch vpt add M + hpt2 vpt2 neg V currentpoint stroke M + hpt2 neg 0 R hpt2 vpt2 V stroke} def +/TriU {stroke [] 0 setdash 2 copy vpt 1.12 mul add M + hpt neg vpt -1.62 mul V + hpt 2 mul 0 V + hpt neg vpt 1.62 mul V closepath stroke + Pnt} def +/Star {2 copy Pls Crs} def +/BoxF {stroke [] 0 setdash exch hpt sub exch vpt add M + 0 vpt2 neg V hpt2 0 V 0 vpt2 V + hpt2 neg 0 V closepath fill} def +/TriUF {stroke [] 0 setdash vpt 1.12 mul add M + hpt neg vpt -1.62 mul V + hpt 2 mul 0 V + hpt neg vpt 1.62 mul V closepath fill} def +/TriD {stroke [] 0 setdash 2 copy vpt 1.12 mul sub M + hpt neg vpt 1.62 mul V + hpt 2 mul 0 V + hpt neg vpt -1.62 mul V closepath stroke + Pnt} def +/TriDF {stroke [] 0 setdash vpt 1.12 mul sub M + hpt neg vpt 1.62 mul V + hpt 2 mul 0 V + hpt neg vpt -1.62 mul V closepath fill} def +/DiaF {stroke [] 0 setdash vpt add M + hpt neg vpt neg V hpt vpt neg V + hpt vpt V hpt neg vpt V closepath fill} def +/Pent {stroke [] 0 setdash 2 copy gsave + translate 0 hpt M 4 {72 rotate 0 hpt L} repeat + closepath stroke grestore Pnt} def +/PentF {stroke [] 0 setdash gsave + translate 0 hpt M 4 {72 rotate 0 hpt L} repeat + closepath fill grestore} def +/Circle {stroke [] 0 setdash 2 
copy + hpt 0 360 arc stroke Pnt} def +/CircleF {stroke [] 0 setdash hpt 0 360 arc fill} def +/C0 {BL [] 0 setdash 2 copy moveto vpt 90 450 arc} bind def +/C1 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 90 arc closepath fill + vpt 0 360 arc closepath} bind def +/C2 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 90 180 arc closepath fill + vpt 0 360 arc closepath} bind def +/C3 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 180 arc closepath fill + vpt 0 360 arc closepath} bind def +/C4 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 180 270 arc closepath fill + vpt 0 360 arc closepath} bind def +/C5 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 90 arc + 2 copy moveto + 2 copy vpt 180 270 arc closepath fill + vpt 0 360 arc} bind def +/C6 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 90 270 arc closepath fill + vpt 0 360 arc closepath} bind def +/C7 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 270 arc closepath fill + vpt 0 360 arc closepath} bind def +/C8 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 270 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/C9 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 270 450 arc closepath fill + vpt 0 360 arc closepath} bind def +/C10 {BL [] 0 setdash 2 copy 2 copy moveto vpt 270 360 arc closepath fill + 2 copy moveto + 2 copy vpt 90 180 arc closepath fill + vpt 0 360 arc closepath} bind def +/C11 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 180 arc closepath fill + 2 copy moveto + 2 copy vpt 270 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/C12 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 180 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/C13 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 0 90 arc closepath fill + 2 copy moveto + 2 copy vpt 180 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/C14 {BL [] 0 setdash 2 copy moveto + 2 copy vpt 90 360 arc closepath fill + vpt 0 360 arc} bind def +/C15 {BL [] 0 setdash 2 copy vpt 0 360 arc closepath fill + vpt 0 360 arc closepath} bind def +/Rec 
{newpath 4 2 roll moveto 1 index 0 rlineto 0 exch rlineto + neg 0 rlineto closepath} bind def +/Square {dup Rec} bind def +/Bsquare {vpt sub exch vpt sub exch vpt2 Square} bind def +/S0 {BL [] 0 setdash 2 copy moveto 0 vpt rlineto BL Bsquare} bind def +/S1 {BL [] 0 setdash 2 copy vpt Square fill Bsquare} bind def +/S2 {BL [] 0 setdash 2 copy exch vpt sub exch vpt Square fill Bsquare} bind def +/S3 {BL [] 0 setdash 2 copy exch vpt sub exch vpt2 vpt Rec fill Bsquare} bind def +/S4 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt Square fill Bsquare} bind def +/S5 {BL [] 0 setdash 2 copy 2 copy vpt Square fill + exch vpt sub exch vpt sub vpt Square fill Bsquare} bind def +/S6 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill Bsquare} bind def +/S7 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill + 2 copy vpt Square fill Bsquare} bind def +/S8 {BL [] 0 setdash 2 copy vpt sub vpt Square fill Bsquare} bind def +/S9 {BL [] 0 setdash 2 copy vpt sub vpt vpt2 Rec fill Bsquare} bind def +/S10 {BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt Square fill + Bsquare} bind def +/S11 {BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt2 vpt Rec fill + Bsquare} bind def +/S12 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill Bsquare} bind def +/S13 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill + 2 copy vpt Square fill Bsquare} bind def +/S14 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill + 2 copy exch vpt sub exch vpt Square fill Bsquare} bind def +/S15 {BL [] 0 setdash 2 copy Bsquare fill Bsquare} bind def +/D0 {gsave translate 45 rotate 0 0 S0 stroke grestore} bind def +/D1 {gsave translate 45 rotate 0 0 S1 stroke grestore} bind def +/D2 {gsave translate 45 rotate 0 0 S2 stroke grestore} bind def +/D3 {gsave translate 45 rotate 0 0 S3 stroke grestore} bind def +/D4 {gsave translate 45 rotate 0 0 S4 stroke grestore} bind def +/D5 
{gsave translate 45 rotate 0 0 S5 stroke grestore} bind def +/D6 {gsave translate 45 rotate 0 0 S6 stroke grestore} bind def +/D7 {gsave translate 45 rotate 0 0 S7 stroke grestore} bind def +/D8 {gsave translate 45 rotate 0 0 S8 stroke grestore} bind def +/D9 {gsave translate 45 rotate 0 0 S9 stroke grestore} bind def +/D10 {gsave translate 45 rotate 0 0 S10 stroke grestore} bind def +/D11 {gsave translate 45 rotate 0 0 S11 stroke grestore} bind def +/D12 {gsave translate 45 rotate 0 0 S12 stroke grestore} bind def +/D13 {gsave translate 45 rotate 0 0 S13 stroke grestore} bind def +/D14 {gsave translate 45 rotate 0 0 S14 stroke grestore} bind def +/D15 {gsave translate 45 rotate 0 0 S15 stroke grestore} bind def +/DiaE {stroke [] 0 setdash vpt add M + hpt neg vpt neg V hpt vpt neg V + hpt vpt V hpt neg vpt V closepath stroke} def +/BoxE {stroke [] 0 setdash exch hpt sub exch vpt add M + 0 vpt2 neg V hpt2 0 V 0 vpt2 V + hpt2 neg 0 V closepath stroke} def +/TriUE {stroke [] 0 setdash vpt 1.12 mul add M + hpt neg vpt -1.62 mul V + hpt 2 mul 0 V + hpt neg vpt 1.62 mul V closepath stroke} def +/TriDE {stroke [] 0 setdash vpt 1.12 mul sub M + hpt neg vpt 1.62 mul V + hpt 2 mul 0 V + hpt neg vpt -1.62 mul V closepath stroke} def +/PentE {stroke [] 0 setdash gsave + translate 0 hpt M 4 {72 rotate 0 hpt L} repeat + closepath stroke grestore} def +/CircE {stroke [] 0 setdash + hpt 0 360 arc stroke} def +/Opaque {gsave closepath 1 setgray fill grestore 0 setgray closepath} def +/DiaW {stroke [] 0 setdash vpt add M + hpt neg vpt neg V hpt vpt neg V + hpt vpt V hpt neg vpt V Opaque stroke} def +/BoxW {stroke [] 0 setdash exch hpt sub exch vpt add M + 0 vpt2 neg V hpt2 0 V 0 vpt2 V + hpt2 neg 0 V Opaque stroke} def +/TriUW {stroke [] 0 setdash vpt 1.12 mul add M + hpt neg vpt -1.62 mul V + hpt 2 mul 0 V + hpt neg vpt 1.62 mul V Opaque stroke} def +/TriDW {stroke [] 0 setdash vpt 1.12 mul sub M + hpt neg vpt 1.62 mul V + hpt 2 mul 0 V + hpt neg vpt -1.62 mul V Opaque stroke} def 
+/PentW {stroke [] 0 setdash gsave + translate 0 hpt M 4 {72 rotate 0 hpt L} repeat + Opaque stroke grestore} def +/CircW {stroke [] 0 setdash + hpt 0 360 arc Opaque stroke} def +/BoxFill {gsave Rec 1 setgray fill grestore} def +/Density { + /Fillden exch def + currentrgbcolor + /ColB exch def /ColG exch def /ColR exch def + /ColR ColR Fillden mul Fillden sub 1 add def + /ColG ColG Fillden mul Fillden sub 1 add def + /ColB ColB Fillden mul Fillden sub 1 add def + ColR ColG ColB setrgbcolor} def +/BoxColFill {gsave Rec PolyFill} def +/PolyFill {gsave Density fill grestore grestore} def +/h {rlineto rlineto rlineto gsave fill grestore} bind def +% +% PostScript Level 1 Pattern Fill routine for rectangles +% Usage: x y w h s a XX PatternFill +% x,y = lower left corner of box to be filled +% w,h = width and height of box +% a = angle in degrees between lines and x-axis +% XX = 0/1 for no/yes cross-hatch +% +/PatternFill {gsave /PFa [ 9 2 roll ] def + PFa 0 get PFa 2 get 2 div add PFa 1 get PFa 3 get 2 div add translate + PFa 2 get -2 div PFa 3 get -2 div PFa 2 get PFa 3 get Rec + gsave 1 setgray fill grestore clip + currentlinewidth 0.5 mul setlinewidth + /PFs PFa 2 get dup mul PFa 3 get dup mul add sqrt def + 0 0 M PFa 5 get rotate PFs -2 div dup translate + 0 1 PFs PFa 4 get div 1 add floor cvi + {PFa 4 get mul 0 M 0 PFs V} for + 0 PFa 6 get ne { + 0 1 PFs PFa 4 get div 1 add floor cvi + {PFa 4 get mul 0 2 1 roll M PFs 0 V} for + } if + stroke grestore} def +% +/languagelevel where + {pop languagelevel} {1} ifelse + 2 lt + {/InterpretLevel1 true def} + {/InterpretLevel1 Level1 def} + ifelse +% +% PostScript level 2 pattern fill definitions +% +/Level2PatternFill { +/Tile8x8 {/PaintType 2 /PatternType 1 /TilingType 1 /BBox [0 0 8 8] /XStep 8 /YStep 8} + bind def +/KeepColor {currentrgbcolor [/Pattern /DeviceRGB] setcolorspace} bind def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke} +>> matrix makepattern +/Pat1 exch def +<< Tile8x8 + 
/PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke + 0 4 M 4 8 L 8 4 L 4 0 L 0 4 L stroke} +>> matrix makepattern +/Pat2 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 0 M 0 8 L + 8 8 L 8 0 L 0 0 L fill} +>> matrix makepattern +/Pat3 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -4 8 M 8 -4 L + 0 12 M 12 0 L stroke} +>> matrix makepattern +/Pat4 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -4 0 M 8 12 L + 0 -4 M 12 8 L stroke} +>> matrix makepattern +/Pat5 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -2 8 M 4 -4 L + 0 12 M 8 -4 L 4 12 M 10 0 L stroke} +>> matrix makepattern +/Pat6 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop -2 0 M 4 12 L + 0 -4 M 8 12 L 4 -4 M 10 8 L stroke} +>> matrix makepattern +/Pat7 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 8 -2 M -4 4 L + 12 0 M -4 8 L 12 4 M 0 10 L stroke} +>> matrix makepattern +/Pat8 exch def +<< Tile8x8 + /PaintProc {0.5 setlinewidth pop 0 -2 M 12 4 L + -4 0 M 12 8 L -4 4 M 8 10 L stroke} +>> matrix makepattern +/Pat9 exch def +/Pattern1 {PatternBgnd KeepColor Pat1 setpattern} bind def +/Pattern2 {PatternBgnd KeepColor Pat2 setpattern} bind def +/Pattern3 {PatternBgnd KeepColor Pat3 setpattern} bind def +/Pattern4 {PatternBgnd KeepColor Landscape {Pat5} {Pat4} ifelse setpattern} bind def +/Pattern5 {PatternBgnd KeepColor Landscape {Pat4} {Pat5} ifelse setpattern} bind def +/Pattern6 {PatternBgnd KeepColor Landscape {Pat9} {Pat6} ifelse setpattern} bind def +/Pattern7 {PatternBgnd KeepColor Landscape {Pat8} {Pat7} ifelse setpattern} bind def +} def +% +% +%End of PostScript Level 2 code +% +/PatternBgnd { + TransparentPatterns {} {gsave 1 setgray fill grestore} ifelse +} def +% +% Substitute for Level 2 pattern fill codes with +% grayscale if Level 2 support is not selected. 
+% +/Level1PatternFill { +/Pattern1 {0.250 Density} bind def +/Pattern2 {0.500 Density} bind def +/Pattern3 {0.750 Density} bind def +/Pattern4 {0.125 Density} bind def +/Pattern5 {0.375 Density} bind def +/Pattern6 {0.625 Density} bind def +/Pattern7 {0.875 Density} bind def +} def +% +% Now test for support of Level 2 code +% +Level1 {Level1PatternFill} {Level2PatternFill} ifelse +% +/Symbol-Oblique /Symbol findfont [1 0 .167 1 0 0] makefont +dup length dict begin {1 index /FID eq {pop pop} {def} ifelse} forall +currentdict end definefont pop +end +%%EndProlog +%%Page: 1 1 +gnudict begin +gsave +50 50 translate +0.100 0.100 scale +90 rotate +0 -5040 translate +0 setgray +newpath +(Helvetica) findfont 100 scalefont setfont +1.000 UL +LTb +410 263 M +63 0 V +6557 0 R +-63 0 V +350 263 M +( 0) Rshow +1.000 UL +LTb +410 903 M +63 0 V +6557 0 R +-63 0 V +350 903 M +( 20) Rshow +1.000 UL +LTb +410 1542 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 40) Rshow +1.000 UL +LTb +410 2182 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 60) Rshow +1.000 UL +LTb +410 2821 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 80) Rshow +1.000 UL +LTb +410 3461 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 100) Rshow +1.000 UL +LTb +410 4100 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 120) Rshow +1.000 UL +LTb +410 4740 M +63 0 V +6557 0 R +-63 0 V +-6617 0 R +( 140) Rshow +1.000 UL +LTb +571 263 M +0 -63 V +0 -100 R +( 0) Cshow +1.000 UL +LTb +1379 263 M +0 -63 V +0 -100 R +( 5) Cshow +1.000 UL +LTb +2186 263 M +0 -63 V +0 -100 R +( 10) Cshow +1.000 UL +LTb +2993 263 M +0 -63 V +0 -100 R +( 15) Cshow +1.000 UL +LTb +3801 263 M +0 -63 V +0 -100 R +( 20) Cshow +1.000 UL +LTb +4608 263 M +0 -63 V +0 -100 R +( 25) Cshow +1.000 UL +LTb +5415 263 M +0 -63 V +0 -100 R +( 30) Cshow +1.000 UL +LTb +6223 263 M +0 -63 V +0 -100 R +( 35) Cshow +1.000 UL +LTb +7030 263 M +0 -63 V +0 -100 R +( 40) Cshow +1.000 UL +LTb +1.000 UL +LTb +410 4740 N +410 263 L +6620 0 V +0 4477 V +-6620 0 V +Z stroke +3720 4890 M 
+(Length Distribution) Cshow +1.000 UP +1.000 UL +LTb +1.000 UL +LT0 +/Helvetica findfont 100 scalefont setfont +1.000 531 263 82 1 BoxColFill +531 263 N +81 0 V +-81 0 V +Z stroke +1.000 693 263 81 1 BoxColFill +693 263 N +80 0 V +-80 0 V +Z stroke +1.000 854 263 82 1 BoxColFill +854 263 N +81 0 V +-81 0 V +Z stroke +1.000 1015 263 82 1 BoxColFill +1015 263 N +81 0 V +-81 0 V +Z stroke +1.000 1177 263 82 1 BoxColFill +1177 263 N +81 0 V +-81 0 V +Z stroke +1.000 1338 263 82 1 BoxColFill +1338 263 N +81 0 V +-81 0 V +Z stroke +1.000 1500 263 82 1 BoxColFill +1500 263 N +81 0 V +-81 0 V +Z stroke +1.000 1661 263 82 1 BoxColFill +1661 263 N +81 0 V +-81 0 V +Z stroke +1.000 1823 263 82 1 BoxColFill +1823 263 N +81 0 V +-81 0 V +Z stroke +1.000 1984 263 82 1 BoxColFill +1984 263 N +81 0 V +-81 0 V +Z stroke +1.000 2146 263 81 1 BoxColFill +2146 263 N +80 0 V +-80 0 V +Z stroke +1.000 2307 263 82 1 BoxColFill +2307 263 N +81 0 V +-81 0 V +Z stroke +1.000 2469 263 81 1 BoxColFill +2469 263 N +80 0 V +-80 0 V +Z stroke +1.000 2630 263 82 1 BoxColFill +2630 263 N +81 0 V +-81 0 V +Z stroke +1.000 2792 263 81 1 BoxColFill +2792 263 N +80 0 V +-80 0 V +Z stroke +1.000 2953 263 82 1 BoxColFill +2953 263 N +81 0 V +-81 0 V +Z stroke +1.000 3115 263 81 1 BoxColFill +3115 263 N +80 0 V +-80 0 V +Z stroke +1.000 3276 263 82 1 BoxColFill +3276 263 N +81 0 V +-81 0 V +Z stroke +1.000 3437 263 82 97 BoxColFill +3437 263 N +0 96 V +81 0 V +0 -96 V +-81 0 V +Z stroke +1.000 3599 263 82 1 BoxColFill +3599 263 N +81 0 V +-81 0 V +Z stroke +1.000 3760 263 82 129 BoxColFill +3760 263 N +0 128 V +81 0 V +0 -128 V +-81 0 V +Z stroke +1.000 3922 263 82 225 BoxColFill +3922 263 N +0 224 V +81 0 V +0 -224 V +-81 0 V +Z stroke +1.000 4083 263 82 257 BoxColFill +4083 263 N +0 256 V +81 0 V +0 -256 V +-81 0 V +Z stroke +1.000 4245 263 81 417 BoxColFill +4245 263 N +0 416 V +80 0 V +0 -416 V +-80 0 V +Z stroke +1.000 4406 263 82 385 BoxColFill +4406 263 N +0 384 V +81 0 V +0 -384 V +-81 0 V +Z 
stroke +1.000 4568 263 81 1120 BoxColFill +4568 263 N +0 1119 V +80 0 V +0 -1119 V +-80 0 V +Z stroke +1.000 4729 263 82 1408 BoxColFill +4729 263 N +0 1407 V +81 0 V +0 -1407 V +-81 0 V +Z stroke +1.000 4891 263 81 2527 BoxColFill +4891 263 N +0 2526 V +80 0 V +0 -2526 V +-80 0 V +Z stroke +1.000 5052 263 82 2943 BoxColFill +5052 263 N +0 2942 V +81 0 V +0 -2942 V +-81 0 V +Z stroke +1.000 5214 263 81 4094 BoxColFill +5214 263 N +0 4093 V +80 0 V +0 -4093 V +-80 0 V +Z stroke +1.000 5375 263 82 4062 BoxColFill +5375 263 N +0 4061 V +81 0 V +0 -4061 V +-81 0 V +Z stroke +1.000 5536 263 82 1824 BoxColFill +5536 263 N +0 1823 V +81 0 V +0 -1823 V +-81 0 V +Z stroke +1.000 5698 263 82 353 BoxColFill +5698 263 N +0 352 V +81 0 V +0 -352 V +-81 0 V +Z stroke +1.000 5859 263 82 129 BoxColFill +5859 263 N +0 128 V +81 0 V +0 -128 V +-81 0 V +Z stroke +1.000 6021 263 82 33 BoxColFill +6021 263 N +0 32 V +81 0 V +0 -32 V +-81 0 V +Z stroke +1.000 6182 263 82 33 BoxColFill +6182 263 N +0 32 V +81 0 V +0 -32 V +-81 0 V +Z stroke +1.000 6344 263 82 33 BoxColFill +6344 263 N +0 32 V +81 0 V +0 -32 V +-81 0 V +Z stroke +1.000 6505 263 82 65 BoxColFill +6505 263 N +0 64 V +81 0 V +0 -64 V +-81 0 V +Z stroke +1.000 UL +LTb +410 4740 N +410 263 L +6620 0 V +0 4477 V +-6620 0 V +Z stroke +1.000 UP +1.000 UL +LTb +stroke +grestore +end +showpage +%%Trailer +%%DocumentFonts: Helvetica +%%Pages: 1 diff --git a/bp_doc/lendist_ascii.png b/bp_doc/lendist_ascii.png new file mode 100644 index 0000000..d74f589 Binary files /dev/null and b/bp_doc/lendist_ascii.png differ diff --git a/bp_doc/seqlogo.png b/bp_doc/seqlogo.png new file mode 100644 index 0000000..ed5c236 Binary files /dev/null and b/bp_doc/seqlogo.png differ diff --git a/bp_doc/seqlogo.svg b/bp_doc/seqlogo.svg new file mode 100644 index 0000000..4453a4c --- /dev/null +++ b/bp_doc/seqlogo.svg @@ -0,0 +1,938 @@ + + + + + A + + + + . + + + + G + + + + U + + + + A + + + + C + + + + . + + + + G + + + + U + + + + . 
+ + + + G + + + + C + + + + U + + + + A + + + + . + + + + U + + + + C + + + + A + + + + . + + + + G + + + + . + + + + G + + + + A + + + + A + + + + . + + + + G + + + + . + + + + G + + + + C + + + + A + + + + G + + + + < + + + + U + + + + A + + + + < + + + + C + + + + U + + + + C + + + + < + + + + G + + + + A + + + + < + + + + C + + + + < + + + + A + + + + < + + + + U + + + + < + + + + G + + + + A + + + + < + + + + C + + + + < + + + + U + + + + < + + + + U + + + + < + + + + C + + + + < + + + + C + + + + U + + + + < + + + + U + + + + < + + + + U + + + + G + + + + < + + + + A + + + + < + + + + C + + + + U + + + + G + + + + < + + + + A + + + + < + + + + U + + + + A + + + + C + + + + G + + + + < + + + + U + + + + < + + + + C + + + + A + + + + < + + + + C + + + + . + + + + C + + + + < + + + + A + + + + < + + + + U + + + + < + + + + A + + + + . + + + + G + + + + C + + + + U + + + + . + + + + U + + + + G + + + + . + + + + U + + + + A + + + + G + + + + . + + + + U + + + + A + + + + . + + + + U + + + + A + + + + C + + + + . + + + + U + + + + . + + + + U + + + + C + + + + A + + + + C + + + + . + + + + G + + + + U + + + + . + + + + U + + + + . + + + + U + + + + A + + + + G + + + + . + + + + A + + + + C + + + + . + + + + U + + + + A + + + + . + + + + U + + + + A + + + + . + + + + A + + + + U + + + + G + + + + C + + + + . + + + + G + + + + . + + + + A + + + + C + + + + > + + + + U + + + + > + + + + A + + + + > + + + + U + + + + > + + + + G + + + + > + + + + G + + + + > + + + + A + + + + > + + + + A + + + + > + + + + U + + + + > + + + + G + + + + > + + + + U + + + + > + + + + A + + + + > + + + + A + + + + > + + + + G + + + + A + + + + > + + + + G + + + + > + + + + A + + + + > + + + + A + + + + > + + + + G + + + + > + + + + U + + + + > + + + + G + + + + A + + + + > + + + + U + + + + > + + + + G + + + + > + + + + G + + + + U + + + + > + + + + G + + + + A + + + + > + + + + U + + + + G + + + + . + + + + A + + + + C + + + + U + + + + . + + + + G + + + + A + + + + U + + + + A + + + + . 
+ + + + U + + + + C + + + + A + + + + G + + + + C + + + + . + + + + U + + + + A + + + + . + + + + G + + + + A + + + + . + + + + U + + + + G + + + + A + + + + C + + + + . + + + + G + + + + U + + + + + + 2 + + + 1 + + + 0 + + + bits + + \ No newline at end of file diff --git a/bp_scripts/add_ident b/bp_scripts/add_ident new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/add_ident @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/align_seq b/bp_scripts/align_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/align_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/analyze_bed b/bp_scripts/analyze_bed new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/analyze_bed @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/analyze_seq b/bp_scripts/analyze_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/analyze_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/analyze_tags b/bp_scripts/analyze_tags new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/analyze_tags @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/analyze_vals b/bp_scripts/analyze_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/analyze_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/blast_seq b/bp_scripts/blast_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/blast_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/blat_seq b/bp_scripts/blat_seq new file mode 
100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/blat_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/calc_bit_scores b/bp_scripts/calc_bit_scores new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/calc_bit_scores @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/complement_seq b/bp_scripts/complement_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/complement_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/complexity_seq b/bp_scripts/complexity_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/complexity_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/compute b/bp_scripts/compute new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/compute @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/count_records b/bp_scripts/count_records new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/count_records @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/count_vals b/bp_scripts/count_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/count_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/create_blast_db b/bp_scripts/create_blast_db new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/create_blast_db @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/create_vmatch_index b/bp_scripts/create_vmatch_index new file mode 100755 index 0000000..c8c800f --- 
/dev/null +++ b/bp_scripts/create_vmatch_index @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/create_weight_matrix b/bp_scripts/create_weight_matrix new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/create_weight_matrix @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/extract_seq b/bp_scripts/extract_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/extract_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/flip_tab b/bp_scripts/flip_tab new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/flip_tab @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/fold_seq b/bp_scripts/fold_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/fold_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/get_genome_align b/bp_scripts/get_genome_align new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/get_genome_align @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/get_genome_phastcons b/bp_scripts/get_genome_phastcons new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/get_genome_phastcons @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/get_genome_seq b/bp_scripts/get_genome_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/get_genome_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/grab b/bp_scripts/grab new file mode 100755 index 0000000..c8c800f --- /dev/null +++ 
b/bp_scripts/grab @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/head_records b/bp_scripts/head_records new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/head_records @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/invert_align b/bp_scripts/invert_align new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/invert_align @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/length_seq b/bp_scripts/length_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/length_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/length_vals b/bp_scripts/length_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/length_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/list_biotools b/bp_scripts/list_biotools new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/list_biotools @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/match_seq b/bp_scripts/match_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/match_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/max_vals b/bp_scripts/max_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/max_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/mean_vals b/bp_scripts/mean_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/mean_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use 
Maasha::Biotools; diff --git a/bp_scripts/median_vals b/bp_scripts/median_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/median_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/merge_vals b/bp_scripts/merge_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/merge_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/min_vals b/bp_scripts/min_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/min_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/oligo_freq b/bp_scripts/oligo_freq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/oligo_freq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/patscan_seq b/bp_scripts/patscan_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/patscan_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/plot_chrdist b/bp_scripts/plot_chrdist new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/plot_chrdist @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/plot_histogram b/bp_scripts/plot_histogram new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/plot_histogram @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/plot_karyogram b/bp_scripts/plot_karyogram new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/plot_karyogram @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/plot_lendist 
b/bp_scripts/plot_lendist new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/plot_lendist @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/plot_matches b/bp_scripts/plot_matches new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/plot_matches @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/plot_phastcons_profiles b/bp_scripts/plot_phastcons_profiles new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/plot_phastcons_profiles @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/plot_seqlogo b/bp_scripts/plot_seqlogo new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/plot_seqlogo @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/print_usage b/bp_scripts/print_usage new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/print_usage @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/random_records b/bp_scripts/random_records new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/random_records @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_2bit b/bp_scripts/read_2bit new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_2bit @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_align b/bp_scripts/read_align new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_align @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_bed b/bp_scripts/read_bed new file mode 
100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_bed @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_blast_tab b/bp_scripts/read_blast_tab new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_blast_tab @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_embl b/bp_scripts/read_embl new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_embl @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_fasta b/bp_scripts/read_fasta new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_fasta @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_gff b/bp_scripts/read_gff new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_gff @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_mysql b/bp_scripts/read_mysql new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_mysql @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_phastcons b/bp_scripts/read_phastcons new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_phastcons @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_psl b/bp_scripts/read_psl new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_psl @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_soft b/bp_scripts/read_soft new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_soft @@ -0,0 +1,6 @@ 
+#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_solexa b/bp_scripts/read_solexa new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_solexa @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_solid b/bp_scripts/read_solid new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_solid @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_stockholm b/bp_scripts/read_stockholm new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_stockholm @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/read_tab b/bp_scripts/read_tab new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/read_tab @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/remove_indels b/bp_scripts/remove_indels new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/remove_indels @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/remove_keys b/bp_scripts/remove_keys new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/remove_keys @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/rename_keys b/bp_scripts/rename_keys new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/rename_keys @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/reverse_seq b/bp_scripts/reverse_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/reverse_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; 
diff --git a/bp_scripts/shuffle_seq b/bp_scripts/shuffle_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/shuffle_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/sort_records b/bp_scripts/sort_records new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/sort_records @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/split_bed b/bp_scripts/split_bed new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/split_bed @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/split_seq b/bp_scripts/split_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/split_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/sum_vals b/bp_scripts/sum_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/sum_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/tile_seq b/bp_scripts/tile_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/tile_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/translate_seq b/bp_scripts/translate_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/translate_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/transliterate_seq b/bp_scripts/transliterate_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/transliterate_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/transliterate_vals b/bp_scripts/transliterate_vals new 
file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/transliterate_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/uniq_vals b/bp_scripts/uniq_vals new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/uniq_vals @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/upload_to_ucsc b/bp_scripts/upload_to_ucsc new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/upload_to_ucsc @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/uppercase_seq b/bp_scripts/uppercase_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/uppercase_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/vmatch_seq b/bp_scripts/vmatch_seq new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/vmatch_seq @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/write_2bit b/bp_scripts/write_2bit new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/write_2bit @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/write_align b/bp_scripts/write_align new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/write_align @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/write_bed b/bp_scripts/write_bed new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/write_bed @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/write_blast b/bp_scripts/write_blast new file mode 100755 index 0000000..c8c800f --- /dev/null +++ 
b/bp_scripts/write_blast @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/write_fasta b/bp_scripts/write_fasta new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/write_fasta @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/write_psl b/bp_scripts/write_psl new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/write_psl @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/write_solid b/bp_scripts/write_solid new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/write_solid @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_scripts/write_tab b/bp_scripts/write_tab new file mode 100755 index 0000000..c8c800f --- /dev/null +++ b/bp_scripts/write_tab @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biotools; diff --git a/bp_usage/add_ident b/bp_usage/add_ident new file mode 100644 index 0000000..abb31b6 --- /dev/null +++ b/bp_usage/add_ident @@ -0,0 +1,23 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: May 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Adds a unique identifier to each record in stream. + +Usage: ... | $script [options] + +Options: [-k | --key=] - Identifier key - Default=ID +Options: [-p | --prefix=] - Identifier prefix - Default=ID +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script - Add identifier to all records. +Examples: ... | $script -k SEQ_NAME - Change the identifier key to SEQ_NAME. +Examples: ... 
| $script -p ID_ - Change identifier prefix from ID00000000 to ID_00000000. + +Keys out: - The specified key. + diff --git a/bp_usage/align_seq b/bp_usage/align_seq new file mode 100644 index 0000000..b361062 --- /dev/null +++ b/bp_usage/align_seq @@ -0,0 +1,24 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Align sequences in stream using Muscle. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script - Aligns all sequences in stream. + +Keys in: SEQ_NAME - name of sequence. +Keys in: Q_ID - used as sequence name if no SEQ_NAME. +Keys in: SEQ - unaligned sequence. + +Keys out: SEQ_NAME - Name of sequence. +Keys out: SEQ - Aligned sequence. +Keys out: ALIGN - Number indicating what alignment this sequence belongs to. diff --git a/bp_usage/analyze_bed b/bp_usage/analyze_bed new file mode 100644 index 0000000..ba6eede --- /dev/null +++ b/bp_usage/analyze_bed @@ -0,0 +1,17 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Analyzes BED entries in the stream. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... | $script - Analyzes all BED entries in the stream. 
+ diff --git a/bp_usage/analyze_seq b/bp_usage/analyze_seq new file mode 100644 index 0000000..55b45f8 --- /dev/null +++ b/bp_usage/analyze_seq @@ -0,0 +1,26 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Analyzes the residue composition of each sequence in stream. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... | $script - Analyzes all sequences in stream. + +Keys out: SEQ_TYPE - Guessed Sequence type. +Keys out: SEQ_LEN - Sequence length. +Keys out: RES - Residue count. +Keys out: RES_SUM - Sum of all non-indel residues. +Keys out: GC% - GC content in percent for DNA/RNA sequences. +Keys out: HARD_MASK% - Percentage of sequence hard-masked with N's. +Keys out: SOFT_MASK% - Percentage of sequence soft-masked with lower case letters. +Keys out: MIX_INDEX - Sequence composition index: most common residue over the sequence length. +Keys out: MELT_TEMP - Melting temperature of DNA/RNA sequence: 4 degrees per GC pair, 2 degrees per AT/U pair. diff --git a/bp_usage/analyze_tags b/bp_usage/analyze_tags new file mode 100644 index 0000000..553a852 --- /dev/null +++ b/bp_usage/analyze_tags @@ -0,0 +1,23 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: May 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Analyze sequence tags in the stream from sequence or BED records resulting in a tag length and clone count distribution. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... 
| $script - Analyzes all entries with SEQ in the stream. + +Keys in: Q_ID/SEQ_NAME - Identifier with clone/read count: ID00001_123 (123 is the clone count). +Keys in: SEQ/BED_LEN - Sequence. + +Keys out: TAG_LEN - Length of sequence tags. +Keys out: TAG_COUNT - Number of tags with a given TAG_LEN. +Keys out: TAG_CLONES - Total clones or reads for all tags (TAG_COUNT) of TAG_LEN. diff --git a/bp_usage/analyze_vals b/bp_usage/analyze_vals new file mode 100644 index 0000000..175be88 --- /dev/null +++ b/bp_usage/analyze_vals @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: January 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Determine type, count, min, max, sum and mean for values in stream. + +Usage: ... | analyze_vals [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-k | --keys=] - Comma separated list of keys. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | analyze_vals -x - Analyses all values in stream. +Examples: ... | analyze_vals -x -k CHR_BEG,SCORE - Analyses selected values in stream. diff --git a/bp_usage/blast_seq b/bp_usage/blast_seq new file mode 100644 index 0000000..3c75c76 --- /dev/null +++ b/bp_usage/blast_seq @@ -0,0 +1,24 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: September 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: BLAST sequences in stream against a specified database. + +Usage: ... | blast_seq [options] + +Options: [-d | --database=] - Path to database. +Options: [-g | --genome=] - Choose genome instead of database. +Options: [-p | --program= - blastn|blastp|tblastn|blastx|tblastx - Default=guessed! 
+Options: [-e | --e_val=] - Expectation value - Default=10 +Options: [-f | --filter] - Filter low complexity sequence - Default=OFF +Options: [-F | --no_filter] - Disable low complexity filter - Default +Options: [-c | --cpus=] - Number of CPUs to use - Default=1 +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | blast_seq -d my_database - BLAST sequences in stream against custom database. +Examples: ... | blast_seq -g hg18 - BLAST sequences in stream against hg18. diff --git a/bp_usage/blat_seq b/bp_usage/blat_seq new file mode 100644 index 0000000..4c4f382 --- /dev/null +++ b/bp_usage/blat_seq @@ -0,0 +1,23 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: BLAT sequences in stream against a specified genome. + +Usage: ... | $script [options] -g + +Options: [-g | --genome= ] - BLAT against genome. +Options: [-c | --ooc] - Use overused tile file (faster, but less sensitive). +Options: [-t | --tile_size=] - Size of match that triggers an alignment - Default=11 +Options: [-s | --step_size=] - Spacing between tiles - Default=tile_size +Options: [-m | --min_identity=] - Minimum sequence identity in percent - Default=90 +Options: [-M | --min_score=] - Minimum score - Default=0 +Options: [-o | --one_off=] - Allows one mismatch in tile - Default=0 +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -g hg18 - BLAT sequences in stream against hg18. 
diff --git a/bp_usage/calc_bit_scores b/bp_usage/calc_bit_scores new file mode 100644 index 0000000..34f5334 --- /dev/null +++ b/bp_usage/calc_bit_scores @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: March 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Calculate the bit score for each position based on an alignment in the stream. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script - Output a bitscore for each column in the alignment. + +Keys out: V0, V1, V2, Vn - Bit score for each position. diff --git a/bp_usage/complement_seq b/bp_usage/complement_seq new file mode 100644 index 0000000..c6d57c8 --- /dev/null +++ b/bp_usage/complement_seq @@ -0,0 +1,16 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Complement sequences in stream. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script - Complements all sequences in stream. diff --git a/bp_usage/complexity_seq b/bp_usage/complexity_seq new file mode 100644 index 0000000..09724b9 --- /dev/null +++ b/bp_usage/complexity_seq @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: May 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Generates an index calculated as the most common di-residue over the sequence length for all sequences in stream. 
An index higher than 0.4 indicates low complexity sequence. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... | $script - Analyzes all sequences in stream. + +Keys out: SEQ_COMPLEXITY - Calculated complexity index. diff --git a/bp_usage/compute b/bp_usage/compute new file mode 100644 index 0000000..ea32c6c --- /dev/null +++ b/bp_usage/compute @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: February 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Performs computations on records in stream. + +Usage: ... | $script [options] + +Options: [-e | --eval=] - Evaluate expression. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... | $script -e 'CHR_BEG = 50' - Set CHR_BEG to 50 in all records. +Examples: ... | $script -e 'CHR_BEG = CHR_BEG - 50' - Subtracts 50 from CHR_BEG in all records. +Examples: ... | $script -e 'CHR_END = CHR_END + BED_LEN' - Adds BED_LEN to CHR_END in all records. diff --git a/bp_usage/count_records b/bp_usage/count_records new file mode 100644 index 0000000..8662a50 --- /dev/null +++ b/bp_usage/count_records @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Count the number of records in stream. + +Usage: ... | count_records [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. 
+Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | count_records -x -o count.txt - Count records in stream and write result to 'count.txt' + diff --git a/bp_usage/count_seq b/bp_usage/count_seq new file mode 100644 index 0000000..2b4bb7a --- /dev/null +++ b/bp_usage/count_seq @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Count sequences in stream. + +Usage: ... | count_seq [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... | count_seq -x -o count.txt - Output records count to 'count.txt'. + diff --git a/bp_usage/count_vals b/bp_usage/count_vals new file mode 100644 index 0000000..8a18d0f --- /dev/null +++ b/bp_usage/count_vals @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Count the number of times values of given keys exists in stream. + +Usage: ... | count_vals [options] + +Options: [-k | --keys=] - Comma separated list of keys +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | count_vals -k SEQ - Count occurrence of each SEQ in stream. 
+ diff --git a/bp_usage/create_blast_db b/bp_usage/create_blast_db new file mode 100644 index 0000000..13bec4c --- /dev/null +++ b/bp_usage/create_blast_db @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: September 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Create a BLAST database from sequences in stream for use with BLAST. + +Usage: ... | create_blast_db [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-d | --database=] - Path and name of database to create. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | create_blast_db -x -d /tmp/fly + diff --git a/bp_usage/create_vmatch_index b/bp_usage/create_vmatch_index new file mode 100644 index 0000000..10119ff --- /dev/null +++ b/bp_usage/create_vmatch_index @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: January 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Create a vmatch index using mkvtree from sequences in stream for use with vmatch. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-i | --index_name=] - Directory name to contain index files. +Options: [-p | --prefix_length=] - Minimum prefix that can be matched - Default=guessed +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... 
| $script -x -i /tmp/fly diff --git a/bp_usage/create_weight_matrix b/bp_usage/create_weight_matrix new file mode 100644 index 0000000..f1c56ee --- /dev/null +++ b/bp_usage/create_weight_matrix @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Create a weight matrix of the residue composition of an alignment in the stream. + +Usage: ... | $script [options] + +Options: [-p | --percent] - Output the result in percent - Default=absolute +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -p - Creates a weight matrix in percent. + +Keys out: V0, V1, V2, Vn - Weight for each position. diff --git a/bp_usage/extract_seq b/bp_usage/extract_seq new file mode 100644 index 0000000..78e0887 --- /dev/null +++ b/bp_usage/extract_seq @@ -0,0 +1,21 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Extract subsequence from sequences in stream. + +Usage: ... | extract_seq [options] + +Options: [-b | --beg=] - Begin position of subsequence (first residue=1) +Options: [-e | --end=] - End position of subsequence +Options: [-l | --len=] - Length of subsequence +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | extract_seq -b 1 -e 10 - Get the first 10 nucleotides of all sequences. +Examples: ... | extract_seq -b 1 -l 10 - Get the first 10 nucleotides of all sequences. 
+ diff --git a/bp_usage/flip_tab b/bp_usage/flip_tab new file mode 100644 index 0000000..deae82b --- /dev/null +++ b/bp_usage/flip_tab @@ -0,0 +1,16 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: June 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Flip table records so rows become columns and vice versa. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script - Flip rows and columns. diff --git a/bp_usage/fold_seq b/bp_usage/fold_seq new file mode 100644 index 0000000..08e2b10 --- /dev/null +++ b/bp_usage/fold_seq @@ -0,0 +1,16 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: February 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Folds sequences in stream. + +Usage: ... | $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script - Folds all sequences in stream. diff --git a/bp_usage/get_genome_align b/bp_usage/get_genome_align new file mode 100644 index 0000000..80dd010 --- /dev/null +++ b/bp_usage/get_genome_align @@ -0,0 +1,42 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: April 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Extract alignment from multiple genome alignment either explicitly or using BED/PSL/BLAST entries in stream. + +Usage: $script [options] -g +Usage: ... | $script [options] -g + +Options: [-g | --genome=] - Genome to get alignment from.
+Options: [-c | --chr=] - Chromosome with requested alignment. +Options: [-b | --beg=] - Begin position of alignment (first residue=1). +Options: [-e | --end=] - End position of alignment. +Options: [-l | --len=] - Length of alignment. +Options: [-s | --strand=] - Strand - Default=+ +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: $script -g hg18 -c chr1 -b 1 -e 10 - Get the first 10 nucleotides multiz alignment of human genome chr1. +Examples: $script -g hg18 -c chr1 -b 1 -l 10 - Get the first 10 nucleotides multiz alignment of human genome chr1. +Examples: ... | $script -g mm8 -s '-' - Get the reverse complement alignment of mouse BED/PSL/BLAST entries. + +Keys in: REC_TYPE - Optional record type (BED, PSL, or BLAST). +Keys in: CHR - Chromosome (for use with BED record type). +Keys in: CHR_BEG - Chromosome begin. +Keys in: CHR_END - Chromosome end. +Keys in: S_ID - Chromosome (for use with PSL and BLAST record type). +Keys in: S_BEG - Chromosome begin (for use with PSL and BLAST record type). +Keys in: S_END - Chromosome end (for use with PSL and BLAST record type). +Keys in: STRAND - Sequence strand. + +Keys out: ALIGN - Alignment number that this entry belongs to. +Keys out: CHR - Chromosome. +Keys out: CHR_BEG - Chromosome begin. +Keys out: CHR_END - Chromosome end. +Keys out: STRAND - Strand. +Keys out: SEQ - Sequence. +Keys out: ALIGN_LEN - Sequence length. 
diff --git a/bp_usage/get_genome_phastcons b/bp_usage/get_genome_phastcons new file mode 100644 index 0000000..ad6621a --- /dev/null +++ b/bp_usage/get_genome_phastcons @@ -0,0 +1,38 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: February 2009 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Extract phastcons scores from a genome either explicitly or using BED/PSL/BLAST entries in stream. + +Usage: $script [options] -g +Usage: ... | $script [options] -g + +Options: [-g | --genome=] - Genome to get phastcons scores from. +Options: [-c | --chr=] - Chromosome with requested scores. +Options: [-b | --beg=] - Begin position of interval (first residue=1). +Options: [-e | --end=] - End position of interval. +Options: [-l | --len=] - Length of interval. +Options: [-f | --flank=] - Include flanking region. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: $script -g hg18 -c chr1 -b 1 -e 10 - Get the first 10 phastcons scores from human genome chr1. +Examples: $script -g hg18 -c chr1 -b 1 -l 10 - Get the first 10 phastcons scores from human genome chr1. +Examples: ... | $script -g mm8 -f 50 - Get phastcons scores including 50nt flanks of mouse BED/PSL/BLAST entries. + +Keys in: REC_TYPE - Optional record type (BED, PSL, or BLAST). +Keys in: CHR - Chromosome (for use with BED record type). +Keys in: CHR_BEG - Chromosome begin. +Keys in: CHR_END - Chromosome end. +Keys in: S_ID - Chromosome (for use with PSL and BLAST record type). +Keys in: S_BEG - Chromosome begin (for use with PSL and BLAST record type). +Keys in: S_END - Chromosome end (for use with PSL and BLAST record type). + +Keys out: CHR - Chromosome. +Keys out: CHR_BEG - Chromosome begin. +Keys out: CHR_END - Chromosome end. +Keys out: PHASTCONS - Comma separated list of phastcons scores. 
diff --git a/bp_usage/get_genome_seq b/bp_usage/get_genome_seq new file mode 100644 index 0000000..5586b3b --- /dev/null +++ b/bp_usage/get_genome_seq @@ -0,0 +1,41 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: December 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Extract subsequence from genome sequence either explicitly or using BED/PSL/BLAST entries in stream. + +Usage: $script [options] -g +Usage: ... | $script [options] -g + +Options: [-g | --genome=] - Genome to get subsequence from. +Options: [-c | --chr=] - Chromosome with requested subsequence. +Options: [-b | --beg=] - Begin position of subsequence (first residue=1). +Options: [-e | --end=] - End position of subsequence. +Options: [-l | --len=] - Length of subsequence. +Options: [-f | --flank=] - Include flanking sequence. +Options: [-m | --mask] - Softmask non-exonic sequence. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: $script -g hg18 -c chr1 -b 1 -e 10 - Get the first 10 nucleotides of human genome chr1. +Examples: $script -g hg18 -c chr1 -b 1 -l 10 - Get the first 10 nucleotides of human genome chr1. +Examples: ... | $script -g mm8 -f 50 - Get subsequences including 50nt flanks of mouse BED/PSL/BLAST entries. + +Keys in: REC_TYPE - Optional record type (BED, PSL, or BLAST). +Keys in: CHR - Chromosome (for use with BED record type). +Keys in: CHR_BEG - Chromosome begin. +Keys in: CHR_END - Chromosome end. +Keys in: S_ID - Chromosome (for use with PSL and BLAST record type). +Keys in: S_BEG - Chromosome begin (for use with PSL and BLAST record type). +Keys in: S_END - Chromosome end (for use with PSL and BLAST record type). +Keys in: STRAND - Sequence strand. + +Keys out: CHR - Chromosome. +Keys out: CHR_BEG - Chromosome begin. 
+Keys out: CHR_END - Chromosome end. +Keys out: SEQ - Sequence. +Keys out: SEQ_LEN - Sequence length. diff --git a/bp_usage/grab b/bp_usage/grab new file mode 100644 index 0000000..5748473 --- /dev/null +++ b/bp_usage/grab @@ -0,0 +1,35 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Grab records in stream. + +Usage: ... | $script [options] + +Options: [-p | --patterns=] - Grab using comma separated list of patterns. +Options: [-P | --patterns_in=] - Grab using patterns from file - one pattern per line. +Options: [-r | --regex=] - Grab using Perl regex. +Options: [-e | --eval=] - Grab 'key,operator,value'. Operators: '>,<,>=,<=,=,!=,eq,ne'. +Options: [-E | --exact_in=] - Grab using exact expressions from file - one expression per line. +Options: [-i | --invert] - Display non-matching results. +Options: [-c | --case_insensitive] - Turn regex matching case insensitive. +Options: [-k | --keys=] - Comma separated list of keys to grab the value for. +Options: [-K | --keys_only] - Only grab for keys. +Options: [-V | --vals_only] - Only grab for vals. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -p SEQ - Grab all records matching SEQ in keys or vals. +Examples: ... | $script -p 110 -k SEQ_LEN - Grab all records where SEQ_LEN matches 110. +Examples: ... | $script -p 110 -k SEQ_LEN -i - Grab all records where SEQ_LEN is not 110. +Examples: ... | $script -r 'A[TUG]C' - Grab all records matching ATC, AUC, or AGC. +Examples: ... | $script -p SEQ -K - Grab all records with a key matching SEQ. +Examples: ... | $script -p SEQ -V - Grab all records with a value matching SEQ. +Examples: ... | $script -e 'SEQ_LEN<30' - Grab all records with a SEQ_LEN less than 30. +Examples: ...
| $script -e 'OS eq D.mel' - Grab all records with OS equal to 'D.mel'. +Examples: ... | $script -E names.txt - Grab all records with exact match to names in file. +Examples: ... | $script -E seq.txt -i - Grab all records with no exact match to sequences in file. diff --git a/bp_usage/head_records b/bp_usage/head_records new file mode 100644 index 0000000..e47def6 --- /dev/null +++ b/bp_usage/head_records @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Emit the first records in stream. + +Usage: ... | head_records + +Options: [-n | --num=] - Number of records to emit - Default=10 +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | head_records -n 40 - Emit the 40 first records from the stream. + diff --git a/bp_usage/invert_align b/bp_usage/invert_align new file mode 100644 index 0000000..9c99b33 --- /dev/null +++ b/bp_usage/invert_align @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Inverts an alignment showing only non-matching residues using the first sequence as reference. + +Usage: ... | $script [options] + +Options: [-s | --soft] - Use soft inversion instead of hard inversion. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script - Invert alignment in stream. +Examples: ... | $script -s - Soft invert alignment in stream.
+ diff --git a/bp_usage/length_seq b/bp_usage/length_seq new file mode 100644 index 0000000..01983ba --- /dev/null +++ b/bp_usage/length_seq @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Determines the length of each sequence in stream - and a total length. + +Usage: ... | length_seq [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... | length_seq -x -o length.txt - Output total length to 'length.txt'. + diff --git a/bp_usage/length_vals b/bp_usage/length_vals new file mode 100644 index 0000000..a72080c --- /dev/null +++ b/bp_usage/length_vals @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Determine the length of the value for given keys. + +Usage: ... | length_vals [options] + +Options: [-k | --keys=] - Comma separated list of keys. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | length_vals -k SEQ - Determine the length of all SEQ values. +Examples: ... | length_vals -k HIT,PATTERN - Determine the length of all HIT and PATTERN values. 
diff --git a/bp_usage/list_biotools b/bp_usage/list_biotools new file mode 100644 index 0000000..6e675be --- /dev/null +++ b/bp_usage/list_biotools @@ -0,0 +1,9 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: January 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: List the description of all biotools. diff --git a/bp_usage/match_seq b/bp_usage/match_seq new file mode 100644 index 0000000..e443d5d --- /dev/null +++ b/bp_usage/match_seq @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Find all matches between the first two sequences in stream - or all self-self matches if only one sequence is found. + +Usage: ... | match_seq [options] + +Options: [-w | --word_size=] - Minimum match size - Default=20 +Options: [-d | --direction=] - Match direction: both|forward|reverse - Default=both +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | match_seq -w 25 -d forward + diff --git a/bp_usage/max_vals b/bp_usage/max_vals new file mode 100644 index 0000000..5404948 --- /dev/null +++ b/bp_usage/max_vals @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: February 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Find the maximum value in the stream for given keys. + +Usage: ... | $script [options] + +Options: [-k | --keys=] - Comma separated list of keys to inspect. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: ... 
| $script -k SEQ_LEN - Find the maximum SEQ_LEN. + +Keys out: _MAX - Maximum value of . diff --git a/bp_usage/mean_vals b/bp_usage/mean_vals new file mode 100644 index 0000000..e078caa --- /dev/null +++ b/bp_usage/mean_vals @@ -0,0 +1,22 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Calculate the mean of values of given keys. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-k | --keys=] - Comma separated list of keys. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -x -k SEQ_LEN,HIT_LEN -o result.txt - Calculate mean values and save to 'result.txt'. + +Keys out: _MEAN - Mean value of . + diff --git a/bp_usage/median_vals b/bp_usage/median_vals new file mode 100644 index 0000000..09de6a3 --- /dev/null +++ b/bp_usage/median_vals @@ -0,0 +1,22 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: March 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Calculate the median values of given keys. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-k | --keys=] - Comma separated list of keys. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -x -k SEQ_LEN,HIT_LEN -o result.txt - Calculate median values and save to 'result.txt'. + +Keys out: _MEDIAN - Median value of . 
+ diff --git a/bp_usage/merge_vals b/bp_usage/merge_vals new file mode 100644 index 0000000..a2ced3f --- /dev/null +++ b/bp_usage/merge_vals @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Merge values of keys in a record so that all values are joined with a delimiter and saved as value of the first key. + +Usage: ... | $script [options] + +Options: [-k | --keys=] - List of values to merge. +Options: [-d | --delimit=] - Merge delimiter - Default='_' +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -k SEQ_NAME,CHR,CHR_BEG,CHR_END - Merges specified values and saves in SEQ_NAME. + diff --git a/bp_usage/min_vals b/bp_usage/min_vals new file mode 100644 index 0000000..0838d43 --- /dev/null +++ b/bp_usage/min_vals @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: February 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Find the minimum value in the stream for given keys. + +Usage: ... | $script [options] + +Options: [-k | --keys=] - Comma separated list of keys to inspect. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: ... | $script -k SEQ_LEN - Find the minimum SEQ_LEN. + +Keys out: _MIN - Minimum value of . 
diff --git a/bp_usage/oligo_freq b/bp_usage/oligo_freq new file mode 100644 index 0000000..6023d8a --- /dev/null +++ b/bp_usage/oligo_freq @@ -0,0 +1,20 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Determines the total oligo frequencies of sequences in stream. + +Usage: ... | oligo_freq [options] + +Options: [-w | --word_size=] - Size of oligos - Default=7. +Options: [-a | --all] - Accumulate oligos for all sequences in stream. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | oligo_freq -w 5 - Determines oligo frequency for all sequences in stream. +Examples: ... | oligo_freq -w 5 -a - Determines accumulated oligo frequency for all sequences in stream. + diff --git a/bp_usage/patscan_seq b/bp_usage/patscan_seq new file mode 100644 index 0000000..0d8d6ff --- /dev/null +++ b/bp_usage/patscan_seq @@ -0,0 +1,24 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Scan sequences in stream or genomes for patterns using scan_for_matches. + +Usage: ... | $script [options] +Usage: $script [options] -g + +Options: [-p | --patterns=] - Comma separated list of patterns to scan for. +Options: [-P | --patterns_in=] - File with one pattern per line. +Options: [-c | --comp] - Scan complementary strand as well. +Options: [-h | --max_hits=] - Stop scanning after max hits. +Options: [-m | --max_misses=] - Stop scanning after max misses. +Options: [-g | --genome= - Scan genome for pattern(s).] 
+Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -p AATTAA,ATTTAA - Scan sequences in stream for given patterns. +Examples: $script -g hg18 -P file.pat - Scan hg18 for patterns in 'file.pat'. diff --git a/bp_usage/plot_chrdist b/bp_usage/plot_chrdist new file mode 100644 index 0000000..7b803c3 --- /dev/null +++ b/bp_usage/plot_chrdist @@ -0,0 +1,23 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Plot chromosome distribution of hits from e.g. BLAT or Vmatch. + +Usage: ... | plot_chrdist [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-t | --terminal=] - Terminal for output: dumb|post|svg - Default=dumb +Options: [-T | --title=] - Set plot title. +Options: [-X | --xlabel=] - Set x-axis label. +Options: [-Y | --ylabel=] - Set y-axis label. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | plot_chrdist -x - Create plot and output to STDOUT in ASCII. +Examples: ... | plot_chrdist -x -t svg -o plot.svg - Create plot and save to 'plot.svg'. diff --git a/bp_usage/plot_histogram b/bp_usage/plot_histogram new file mode 100644 index 0000000..f763fc6 --- /dev/null +++ b/bp_usage/plot_histogram @@ -0,0 +1,26 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: September 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Plot generic histogram. + +Usage: ... | plot_histogram [options] + +Options: [-x | --no_stream] - Do not emit records. 
+Options: [-o | --data_out=] - Write result to file. +Options: [-k | --key=] - Key to use for plotting. +Options: [-t | --terminal=] - Terminal for output: dumb|post|svg - Default=dumb +Options: [-T | --title=] - Set plot title. +Options: [-X | --xlabel=] - Set x-axis label. +Options: [-Y | --ylabel=] - Set y-axis label. +Options: [-s | --sort=] - Sort criteria for x-axis keys - Default=num +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | plot_histogram -x -k TISSUE - Create plot and output to STDOUT in ASCII. +Examples: ... | plot_histogram -x -k TISSUE -t svg -o plot.svg - Create plot and save to 'plot.svg'. + diff --git a/bp_usage/plot_karyogram b/bp_usage/plot_karyogram new file mode 100644 index 0000000..9da3c93 --- /dev/null +++ b/bp_usage/plot_karyogram @@ -0,0 +1,21 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Plot hits on a karyogram for a given genome. + +Usage: ... | plot_karyogram [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-g | --genome=] - Genome layout of karyogram - Default=hg18 +Options: [-f | --feat_color=] - Color of features - Default=black +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | plot_karyogram -x -g mm8 -o plot.svg - Create plot and save to 'plot.svg'. 
+ diff --git a/bp_usage/plot_lendist b/bp_usage/plot_lendist new file mode 100644 index 0000000..4cda655 --- /dev/null +++ b/bp_usage/plot_lendist @@ -0,0 +1,24 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Plot length distribution. + +Usage: ... | plot_lendist [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-k | --key=] - Key to use for plotting. +Options: [-t | --terminal=] - Terminal for output: dumb|post|svg - Default=dumb +Options: [-T | --title=] - Set plot title. +Options: [-X | --xlabel=] - Set x-axis label. +Options: [-Y | --ylabel=] - Set y-axis label. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | plot_lendist -x -k HIT_LEN - Create plot and output to STDOUT in ASCII. +Examples: ... | plot_lendist -x -k HIT_LEN -t svg -o plot.svg - Create plot and save to 'plot.svg'. diff --git a/bp_usage/plot_matches b/bp_usage/plot_matches new file mode 100644 index 0000000..03571aa --- /dev/null +++ b/bp_usage/plot_matches @@ -0,0 +1,24 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Generate a dotplot of matches in stream. + +Usage: ... | plot_matches [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-t | --terminal= - Terminal for output: dumb|post|svg - Default=dumb +Options: [-d | --direction= - Direction of matches to plot: both|forward|reverse - Default=both +Options: [-T | --title=] - Set plot title. 
+Options: [-X | --xlabel=] - Set x-axis label. +Options: [-Y | --ylabel=] - Set y-axis label. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | plot_matches -x -d forward -t svg -o plot.svg - Create plot and save to 'plot.svg'. + diff --git a/bp_usage/plot_phastcons_profiles b/bp_usage/plot_phastcons_profiles new file mode 100644 index 0000000..3f17c95 --- /dev/null +++ b/bp_usage/plot_phastcons_profiles @@ -0,0 +1,29 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: January 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Generate a plot of PhastCons profiles based on chromosome coordinates in stream. + +Usage: ... | plot_phastcons_profiles [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-g | --genome=] - Genome from which to obtain PhastCons info. +Options: [-m | --mean] - Calculate a mean profile. +Options: [-M | --median] - Calculate a median profile. +Options: [-f | --flank=] - Include flanking PhastCons scores. +Options: [-t | --terminal= - Terminal for output: dumb|post|svg - Default=dumb +Options: [-d | --direction= - Direction of matches to plot: both|forward|reverse - Default=both +Options: [-T | --title=] - Set plot title. +Options: [-X | --xlabel=] - Set x-axis label. +Options: [-Y | --ylabel=] - Set y-axis label. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | plot_phastcons_profiles -x -g dm3 -f 50 - Create plot of all profiles including 50 flanking scores. +Examples: ... | plot_phastcons_profiles -x -g dm3 -f 50 -m - Create a mean plot of all profiles including 50 flanking scores. 
+ diff --git a/bp_usage/plot_seqlogo b/bp_usage/plot_seqlogo new file mode 100644 index 0000000..bd3ad9d --- /dev/null +++ b/bp_usage/plot_seqlogo @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Renders a sequence logo in SVG format from alignment in stream. + +Usage: ... | plot_seqlogo [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | plot_seqlogo -x -o logo.svg - Create plot and save to 'logo.svg'. diff --git a/bp_usage/print_usage b/bp_usage/print_usage new file mode 100644 index 0000000..0ec1a87 --- /dev/null +++ b/bp_usage/print_usage @@ -0,0 +1,9 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Date: January 2008 + +Description: Prints biotools usage. diff --git a/bp_usage/random_records b/bp_usage/random_records new file mode 100644 index 0000000..54a7168 --- /dev/null +++ b/bp_usage/random_records @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: December 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Select a number of random records from the stream. + +Usage: ... | random_records [options] + +Options: [-n | --num=] - Number of random records to select - Default=10 +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ...
| random_records -n 100 - Output 100 random records to stream. + diff --git a/bp_usage/read_2bit b/bp_usage/read_2bit new file mode 100644 index 0000000..0a8c774 --- /dev/null +++ b/bp_usage/read_2bit @@ -0,0 +1,26 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: March 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read sequence entries from one or more 2bit files. The length of each sequence is also determined. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Comma separated list of files or glob expression to read. +Options: [-n | --num=] - Limit number of records to read. +Options: [-N | --no_mask] - Ignore soft masking. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test.2bit - Read FASTA entries from file. +Examples: $script -i test1.2bit,test2.2bit - Read FASTA entries from files. +Examples: $script -i '*.2bit' - Read FASTA entries from files. +Examples: $script -i test.2bit -n 10 - Read first 10 entries from file. + +Keys out: SEQ_NAME - Name of sequence. +Keys out: SEQ - Sequence. +Keys out: SEQ_LEN - Length of sequence. diff --git a/bp_usage/read_align b/bp_usage/read_align new file mode 100644 index 0000000..274a3a8 --- /dev/null +++ b/bp_usage/read_align @@ -0,0 +1,29 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read aligned FASTA entries from one or more files. The aligned sequence length is also determined. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Comma separated list of files to read. +Options: [-n | --num=] - Limit number of records to read. 
+Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test.fna - Read aligned FASTA entries from file. +Examples: $script -i test1.fna,test2.fna - Read aligned FASTA entries from files. +Examples: $script -i '*.fna' - Read aligned FASTA entries from files. +Examples: $script -i test.fna -n 10 - Read first 10 aligned FASTA entries from file. + +Keys out: SEQ_NAME - Name of sequence. +Keys out: SEQ - Sequence. +Keys out: ALIGN_LEN - Length of aligned sequence. +Keys out: ALIGN - Number indicating what alignment this sequence belongs to. + + + diff --git a/bp_usage/read_bed b/bp_usage/read_bed new file mode 100644 index 0000000..7058f73 --- /dev/null +++ b/bp_usage/read_bed @@ -0,0 +1,35 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read BED data (Browser Extensible Data). + +Usage: read_bed [options] -i + +Options: [-i | --data_in=] - Read input data from file. +Options: [-n | --num=] - Limit number of records to read. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: read_bed -i test1.bed,test2.bed +Examples: read_bed -i '*.bed' + +Keys out: CHR - Chromosome name. +Keys out: CHR_BEG - Chromosome begin position. +Keys out: CHR_END - Chromosome end position. +Keys out: Q_ID - Query ID (feature name). +Keys out: SCORE - Score. +Keys out: STRAND - Strand. +Keys out: THICK_BEG - The starting position at which the feature is drawn thickly. +Keys out: THICK_END - The ending position at which the feature is drawn thickly. +Keys out: ITEMRGB - An RGB value of the form R,G,B (e.g. 255,0,0). +Keys out: BLOCKCOUNT - The number of blocks (exons) in the BED entry. 
+Keys out: BLOCKSIZES - A comma separated list of the block sizes. +Keys out: Q_BEGS - A comma separated list of block starts. +Keys out: REC_TYPE - Record type. +Keys out: BED_LEN - Length of BED entry. +Keys out: BED_COLS - Number of columns in BED line. diff --git a/bp_usage/read_blast_tab b/bp_usage/read_blast_tab new file mode 100644 index 0000000..f112707 --- /dev/null +++ b/bp_usage/read_blast_tab @@ -0,0 +1,33 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read tabular BLAST output (-m 8 and -m 9). + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Read input data from file. +Options: [-n | --num=] - Limit number of records to read. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script --data_in=test.blast + +Keys out: Q_ID - Query ID. +Keys out: S_ID - Subject ID. +Keys out: IDENT - Identity (%). +Keys out: ALIGN_LEN - Alignment length. +Keys out: MISMATCHES - Number of mismatches. +Keys out: GAPS - Number of gaps. +Keys out: Q_BEG - Query begin. +Keys out: Q_END - Query end. +Keys out: S_BEG - Subject begin. +Keys out: S_END - Subject end. +Keys out: E_VAL - Expect value. +Keys out: BIT_SCORE - Bit score. +Keys out: STRAND - Strand. +Keys out: REC_TYPE - Record type. diff --git a/bp_usage/read_embl b/bp_usage/read_embl new file mode 100644 index 0000000..95a4b30 --- /dev/null +++ b/bp_usage/read_embl @@ -0,0 +1,24 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: September 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read data in EMBL format. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Read input data from file. 
+Options: [-n | --num=] - Limit number of records to read. +Options: [-k | --keys=] - Match a subset of record keys only. +Options: [-f | --feats=] - Match a subset of features only. +Options: [-q | --quals=] - Match a subset of qualifiers only. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i embl.dat - Read keys, features, and qualifiers. +Examples: $script -i embl.dat -k AC,DE - Read only Accession number and Description. +Examples: $script -i embl.dat -k FT,SEQ -f CDS - Read subset of features matching CDS. +Examples: $script -i embl.dat -k FT,SEQ -f CDS -q gene - Read subset of qualifiers matching gene. diff --git a/bp_usage/read_fasta b/bp_usage/read_fasta new file mode 100644 index 0000000..847746b --- /dev/null +++ b/bp_usage/read_fasta @@ -0,0 +1,25 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read FASTA entries from one or more files. The length of each sequence is also determined. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Comma separated list of files or glob expression to read. +Options: [-n | --num=] - Limit number of records to read. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test.fna - Read FASTA entries from file. +Examples: $script -i test1.fna,test2.fna - Read FASTA entries from files. +Examples: $script -i '*.fna' - Read FASTA entries from files. +Examples: $script -i test.fna -n 10 - Read first 10 FASTA entries from file. + +Keys out: SEQ_NAME - Name of sequence. +Keys out: SEQ - Sequence. +Keys out: SEQ_LEN - Length of sequence. 
diff --git a/bp_usage/read_gff b/bp_usage/read_gff new file mode 100644 index 0000000..f9957fb --- /dev/null +++ b/bp_usage/read_gff @@ -0,0 +1,31 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: February 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read Generic Feature Format (GFF v.3) from one or more files. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Comma separated list of files or glob expression to read. +Options: [-n | --num=] - Limit number of records to read. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test.gff - Read GFF entries from file. + +Keys out: + +Keys out: Q_ID - Feature ID. +Keys out: SOURCE - Feature source. +Keys out: TYPE - Feature type. +Keys out: Q_BEG - Begin position +Keys out: Q_END - End position +Keys out: SCORE - Score. +Keys out: STRAND - Strand. +Keys out: PHASE - Phase. +Keys out: ATT - Attributes. +Keys out: ATT_ - Breakdown of Attributes into key/value pairs diff --git a/bp_usage/read_mysql b/bp_usage/read_mysql new file mode 100644 index 0000000..06f7224 --- /dev/null +++ b/bp_usage/read_mysql @@ -0,0 +1,23 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: May 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read records from a MySQL query. + +Usage: $script [options] + +Options: [-d | --database=] - MySQL database. +Options: [-q | --query=] - MySQL query. 
+Options: [-u | --user=] - MySQL user name - Default= +Options: [-p | --password=] - MySQL password - Default= +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -d dm3 -q 'SHOW TABLES' - Retrieve table information from database. +Examples: $script -d dm3 -q 'SELECT * FROM estOrientInfo' - Retrieve entire estOrientInfo. +Examples: $script -d dm3 -q 'SELECT * FROM table WHERE Score < 100' - Retrieve selected lines from table. + diff --git a/bp_usage/read_phastcons b/bp_usage/read_phastcons new file mode 100644 index 0000000..268f0f4 --- /dev/null +++ b/bp_usage/read_phastcons @@ -0,0 +1,38 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: December 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read data in PhastCons format which are included in the stream as BED records. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Read input data from file. +Options: [-n | --num=] - Limit number of records to read and output. +Options: [-m | --min=] - Minimum length of a conserved block - Default=10 +Options: [-d | --dist=] - Maximum distance between conserved blocks - Default=25 +Options: [-t | --threshold=] - Threshold for conserved block - Default=0.8 +Options: [-g | --gap=] - Allow micro-gap in a conserved block - Default=5 +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i chr4.pp + +Keys out: CHR - Chromosome name. +Keys out: CHR_BEG - Chromosome begin position. +Keys out: CHR_END - Chromosome end position. +Keys out: Q_ID - Query ID (feature name). +Keys out: SCORE - Score. +Keys out: STRAND - Strand. +Keys out: THICK_BEG - The starting position at which the feature is drawn thickly. 
+Keys out: THICK_END - The ending position at which the feature is drawn thickly. +Keys out: ITEMRGB - An RGB value of the form R,G,B (e.g. 255,0,0). +Keys out: BLOCKCOUNT - The number of blocks (exons) in the BED entry. +Keys out: BLOCKSIZES - A comma separated list of the block sizes. +Keys out: Q_BEGS - A comma separated list of block starts. +Keys out: REC_TYPE - Record type. +Keys out: BED_LEN - Length of BED entry. +Keys out: BED_COLS - Number of columns in BED line. diff --git a/bp_usage/read_psl b/bp_usage/read_psl new file mode 100644 index 0000000..cdd06a7 --- /dev/null +++ b/bp_usage/read_psl @@ -0,0 +1,43 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read PSL data (BLAT's default output). + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Read PSL data from file. +Options: [-n | --num=] - Limit number of records to read. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test1.psl,test2.psl +Examples: $script -i '*.psl' + +Keys out: MATCHES - Number of non-repeat matches. +Keys out: MISMATCHES - Number of mismatches. +Keys out: REPMATCHES - Number of repeat matches. +Keys out: NCOUNT - Number of Ns. +Keys out: QNUMINSERT - Number of inserts in query. +Keys out: QBASEINSERT - Number of bases inserted in query. +Keys out: SNUMINSERT - Number of inserts in subject. +Keys out: SBASEINSERT - Number of bases inserted in subject. +Keys out: STRAND - Strand. +Keys out: Q_ID - Query ID. +Keys out: Q_LEN - Query length. +Keys out: Q_BEG - Query begin. +Keys out: Q_END - Query end. +Keys out: S_ID - Subject ID. +Keys out: S_LEN - Subject length. +Keys out: S_BEG - Subject begin. +Keys out: S_END - Subject end. +Keys out: BLOCKCOUNT - Block count. 
+Keys out: BLOCKSIZES - Block sizes. +Keys out: Q_BEGS - Query sequence blocks begins. +Keys out: S_BEGS - Subject sequence blocks begins. +Keys out: SCORE - Score calculated as in web BLAT results. +Keys out: REC_TYPE - Record type. diff --git a/bp_usage/read_soft b/bp_usage/read_soft new file mode 100644 index 0000000..c35e4ef --- /dev/null +++ b/bp_usage/read_soft @@ -0,0 +1,22 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: January 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read data in GEO soft format. NCBI's deep sequencing and micro array data format. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Read input data from file. +Options: [-n | --num=] - Limit number of records to read and output. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i GSE6734_family.soft + +Keys out: SAMPLE_TITLE - Title of sample. +Keys out: SEQ_NAME - Sequence name composed of Platform Series ID, Sample GEO accession, Sequence number in current experiment, and read count. +Keys out: SEQ - Sequence. diff --git a/bp_usage/read_solexa b/bp_usage/read_solexa new file mode 100644 index 0000000..7b947e7 --- /dev/null +++ b/bp_usage/read_solexa @@ -0,0 +1,28 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: April 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read Solexa deep sequencing output files. Lowercase sequence indicates low quality. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Comma separated list of files or glob expression to read. +Options: [-n | --num=] - Limit number of records to read. 
+Options: [-q | --quality=] - Lowercase nucleotide with quality score below this limit (min:0 max:40) - Default=20 +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test.solexa - Read Solexa entries from file. +Examples: $script -i test1.fna,test2.solexa - Read Solexa entries from files. +Examples: $script -i '*.solexa' - Read Solexa entries from files. +Examples: $script -i test.solexa -n 10 - Read first 10 Solexa entries from file. +Examples: $script -i test.solexa -q 10 - Change quality score threshold to 10. + +Keys out: SEQ_NAME - Name of sequence. +Keys out: SEQ - Sequence. +Keys out: SEQ_LEN - Length of sequence. +Keys out: SCORE_MEAN - Mean quality score. diff --git a/bp_usage/read_solid b/bp_usage/read_solid new file mode 100644 index 0000000..765d4f1 --- /dev/null +++ b/bp_usage/read_solid @@ -0,0 +1,30 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: April 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read Solid sequence files with Name, Sequence and Quality. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Comma separated list of files or glob expression to read. +Options: [-n | --num=] - Limit number of records to read. +Options: [-q | --quality=] - Lowercase nucleotide with quality score below this limit (min:0 max:40) - Default=20 +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test.solid - Read Solid entries from file. +Examples: $script -i test1.fna,test2.solid - Read Solid entries from files. +Examples: $script -i '*.solid' - Read Solid entries from files. +Examples: $script -i test.solid -n 10 - Read first 10 Solid entries from file. 
+Examples: $script -i test.solid -q 10 - Change quality score threshold to 10. + +Keys out: SEQ_NAME - Name of sequence. +Keys out: SEQ_CS - Sequence in color space. +Keys out: SEQ_QUAL - Sequence quality. +Keys out: SEQ - Sequence. +Keys out: SEQ_LEN - Length of sequence. +Keys out: SCORE_MEAN - Mean quality score. diff --git a/bp_usage/read_stockholm b/bp_usage/read_stockholm new file mode 100644 index 0000000..fc03db2 --- /dev/null +++ b/bp_usage/read_stockholm @@ -0,0 +1,22 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: September 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read data in Stockholm format. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Read input data from file. +Options: [-n | --num=] - Limit number of records to read. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test.stockholm + +Keys out: SEQ_NAME - Sequence name. +Keys out: SEQ - Aligned sequence. +Keys out: ALIGN - Number indicating what alignment this sequence belongs to. diff --git a/bp_usage/read_tab b/bp_usage/read_tab new file mode 100644 index 0000000..7e82aa4 --- /dev/null +++ b/bp_usage/read_tab @@ -0,0 +1,24 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Read a table or selected table columns. + +Usage: $script [options] -i + +Options: [-i | --data_in=] - Read tabular data from file. +Options: [-d | --delimit=] - Changes delimiter - Default='\s+' +Options: [-c | --cols=] - Comma separated list of cols to read in that order. +Options: [-k | --keys=] - Comma separated list of keys to use for each column. 
+Options: [-s | --skip=] - Skip number of initial records. +Options: [-n | --num=] - Limit number of records to read. +Options: [-I | --stream_in=] - Read input stream from file - Default=STDIN +Options: [-O | --stream_out=] - Write output stream to file - Default=STDOUT + +Examples: $script -i test.tab -d ',' -c 7,4 -k SEQ,SEQ_NAME + +Keys out: V0, V1, V2 ... Vn - Default column names. diff --git a/bp_usage/remove_indels b/bp_usage/remove_indels new file mode 100644 index 0000000..fe73def --- /dev/null +++ b/bp_usage/remove_indels @@ -0,0 +1,17 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Remove indels (-~.) from sequences in stream. + +Usage: ... | remove_indels [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | remove_indels - Removes indels from all sequences in stream. + diff --git a/bp_usage/remove_keys b/bp_usage/remove_keys new file mode 100644 index 0000000..f25ef36 --- /dev/null +++ b/bp_usage/remove_keys @@ -0,0 +1,20 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Remove given keys from records in stream. + +Usage: ... | remove_keys [options] + +Options: [-k | --keys=] - Comma separated list of keys to remove. +Options: [-K | --save_keys=] - Remove all keys except these comma separated keys. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | remove_keys -k 'SEQ_NAME,SEQ' - Removes SEQ_NAME and SEQ from all records in stream. +Examples: ... 
| remove_keys -K 'SEQ_NAME,SEQ' - Removes all keys except SEQ_NAME and SEQ. + diff --git a/bp_usage/rename_keys b/bp_usage/rename_keys new file mode 100644 index 0000000..15c2e17 --- /dev/null +++ b/bp_usage/rename_keys @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Rename keys in stream. + +Usage: ... | rename_keys [options] + +Options: [-k | --keys=] - Keys to find and replace. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | rename_keys -k PATTERN_LEN,HIT_LEN - Renames PATTERN_LEN key to HIT_LEN. + diff --git a/bp_usage/reverse_seq b/bp_usage/reverse_seq new file mode 100644 index 0000000..594ec8b --- /dev/null +++ b/bp_usage/reverse_seq @@ -0,0 +1,17 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Reverse sequences in stream. + +Usage: ... | reverse_seq [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | reverse_seq - Reverses all sequences in stream. + diff --git a/bp_usage/shuffle_seq b/bp_usage/shuffle_seq new file mode 100644 index 0000000..cc7c035 --- /dev/null +++ b/bp_usage/shuffle_seq @@ -0,0 +1,18 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: December 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Shuffle sequences in stream. + +Usage: ... 
| $script [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... | $script - Shuffles all sequences in stream. + +Keys in: SEQ - Sequence. diff --git a/bp_usage/sort_records b/bp_usage/sort_records new file mode 100644 index 0000000..8cdce35 --- /dev/null +++ b/bp_usage/sort_records @@ -0,0 +1,21 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: December 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Sort records in the stream. + +Usage: ... | sort_records [options] + +Options: [-k | --keys=] - Comma separated list of keys to sort by. Append n for numeric sorting instead of alphabetic. +Options: [-r | --reverse] - Reverse sort order. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | sort_records -k SEQ - Output records sorted alphabetically according to SEQ. +Examples: ... | sort_records -k SEQ_LENn - Output records sorted numerically according to SEQ_LEN. +Examples: ... | sort_records -k SEQ_LENn,SEQ -r - Output records in reverse order sorted according to SEQ_LEN and SEQ. + diff --git a/bp_usage/split_bed b/bp_usage/split_bed new file mode 100644 index 0000000..672209a --- /dev/null +++ b/bp_usage/split_bed @@ -0,0 +1,28 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Split BED records into overlapping windows. + +Usage: ... 
| $script [options] + +Options: [-w | --window_size=] - Window size - Default=20 +Options: [-s | --step_size=] - Step size - Default=1 +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -w 12 - Split BED record into windows of size 12. +Examples: ... | $script -s 5 - Split BED record using windows overlapping every 5th nucleotide. + +Keys in: CHR - Chromosome. +Keys in: CHR_BEG - Chromosome begin position. +Keys in: CHR_END - Chromosome end position. + +Keys out: REC_TYPE - Record type (BED). +Keys out: CHR - Chromosome. +Keys out: CHR_BEG - Chromosome begin position. +Keys out: CHR_END - Chromosome end position. diff --git a/bp_usage/split_seq b/bp_usage/split_seq new file mode 100644 index 0000000..daff988 --- /dev/null +++ b/bp_usage/split_seq @@ -0,0 +1,22 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Split sequences in stream into overlapping oligos. + +Usage: ... | $script [options] + +Options: [-w | --word_size=] - Word size of oligos - Default=7 +Options: [-u | --uniq] - Only emit unique oligos. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -w 12 -u - Only emit unique oligos of size 12. + +Keys in: SEQ_NAME - Sequence name. +Keys in: SEQ - Sequence. 
+ diff --git a/bp_usage/sum_vals b/bp_usage/sum_vals new file mode 100644 index 0000000..22e8047 --- /dev/null +++ b/bp_usage/sum_vals @@ -0,0 +1,21 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Calculate the total sums for the values of given keys. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-k | --keys=] - Comma separated list of keys. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -x -k SEQ_LEN,HIT_LEN -o result.txt - Sum values and save to 'result.txt'. + +Keys out: - _SUM - Sum of value of diff --git a/bp_usage/tile_seq b/bp_usage/tile_seq new file mode 100644 index 0000000..0c48126 --- /dev/null +++ b/bp_usage/tile_seq @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: February 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Using the first sequence in stream as reference, tile all subsequent sequences based on pairwise alignments. + +Usage: ... | $script [options] + +Options: [-i | --identity=] - Minimum identity (%) for pairwise alignment - Default=70 +Options: [-s | --supress_indels] - Suppress insertions in query sequence. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -i 75 - Tile all sequences in stream that have a similarity higher than 75%. 
+ diff --git a/bp_usage/translate_seq b/bp_usage/translate_seq new file mode 100644 index 0000000..0939737 --- /dev/null +++ b/bp_usage/translate_seq @@ -0,0 +1,22 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Translate DNA sequence into protein sequence. + +Usage: ... | $script [options] + +Options: [-f | --frames=] - Comma separated list of frames of translation: 1,2,3,-1,-2,-3 - Default=all +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -f 1 - Translate into first forward reading frame. +Examples: ... | $script -f "-1,-2,-3" - Translate into all reverse reading frames. + +Keys in: SEQ - Sequence. + +Keys out: FRAME - Frame of translation. diff --git a/bp_usage/transliterate_seq b/bp_usage/transliterate_seq new file mode 100644 index 0000000..9a3af94 --- /dev/null +++ b/bp_usage/transliterate_seq @@ -0,0 +1,22 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Transliterate chars from sequences in stream. + +Usage: ... | $script [options] + +Options: [-s | --search=] - String of chars to locate and replace +Options: [-r | --replace=] - String of chars for replacing +Options: [-d | --delete=] - String of chars to delete +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -s Uu -r Tt - Replacing u with t as in converting RNA to DNA. +Examples: ... | $script -d '.~-' - Removing indels. + +Keys in: SEQ - Sequence. 
diff --git a/bp_usage/transliterate_vals b/bp_usage/transliterate_vals new file mode 100644 index 0000000..5101a07 --- /dev/null +++ b/bp_usage/transliterate_vals @@ -0,0 +1,21 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: April 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Transliterate chars from values in stream. + +Usage: ... | $script [options] + +Options: [-k | --keys=] - List of values to transliterate +Options: [-s | --search=] - String of chars to locate and replace +Options: [-r | --replace=] - String of chars for replacing +Options: [-d | --delete=] - String of chars to delete +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -k SEQ -s Uu -r Tt - Replacing u with t in SEQ (i.e. converting RNA to DNA). +Examples: ... | $script -k PATTERN -d '.~-' - Removing indels from PATTERN. diff --git a/bp_usage/uniq_vals b/bp_usage/uniq_vals new file mode 100644 index 0000000..dd0d53a --- /dev/null +++ b/bp_usage/uniq_vals @@ -0,0 +1,19 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Locate records in stream where the value for a given key is unique or non-unique. + +Usage: ... | uniq_vals [options] + +Options: [-k | --key=] - Key for which the value is checked for uniqueness. +Options: [-i | --invert] - Display non-unique records. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | uniq_vals -k SEQ_NAME - Locate records with unique SEQ_NAME value. 
+ diff --git a/bp_usage/upload_to_ucsc b/bp_usage/upload_to_ucsc new file mode 100644 index 0000000..f8eee8b --- /dev/null +++ b/bp_usage/upload_to_ucsc @@ -0,0 +1,29 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: September 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Upload data to local UCSC database for viewing in Genome Browser. In order to display secondary structures for folded RNA you must include 'rnaSecStr' in the table name. + +Usage: ... | $script [options] <-d > <-t > + +Options: [-d | --database=] - Genome database to upload track to. +Options: [-t | --table=] - Table name of track - NB! prefix with initials. e.g.: mah_test. +Options: [-x | --no_stream] - Do not emit records. +Options: [-s | --short_label=] - Short label for track - Default= +Options: [-l | --long_label=] - Long label for track - Default=
+Options: [-g | --group=] - Track group name - Default=m.hansen +Options: [-p | --priority=] - Track display priority - Default=1 +Options: [-u | --use_score] - Use the score to grey shade the track. +Options: [-v | --visibility=] - Track visibility: hide|dense|squish|pack|full - Default=pack +Options: [-w | --wiggle] - Create wiggle track based on overlapping sequences. +Options: [-c | --color=] - Track color e.g. '147,73,42' - Default= +Options: [-C | --chunk_size=] - Chunks for loading - Default=10000000 +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -x -d hg18 -t initials_my_test + diff --git a/bp_usage/uppercase_seq b/bp_usage/uppercase_seq new file mode 100644 index 0000000..bc7c441 --- /dev/null +++ b/bp_usage/uppercase_seq @@ -0,0 +1,17 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Uppercases sequences in stream. + +Usage: ... | uppercase_seq [options] + +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to file - Default=STDOUT + +Examples: ... | uppercase_seq - Uppercases all sequences in stream. + diff --git a/bp_usage/vmatch_seq b/bp_usage/vmatch_seq new file mode 100644 index 0000000..bb9bae6 --- /dev/null +++ b/bp_usage/vmatch_seq @@ -0,0 +1,28 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: October 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: vmatch sequences in stream against a specified genome. + +Usage: ... | $script [options] -g +Usage: ... | $script [options] -i + +Options: [-g | --genome= | --index_name=] - Custom index to vmatch. 
+Options: [-c | --count] - Replace score with hit count. +Options: [-m | --max_hits] - Skip hits with more than maximum hits (implies --count). +Options: [-h | --hamming_dist=] - Allow mismatches. +Options: [-e | --edit_dist=] - Allow mismatches and indels. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -g hg18 - Match sequences in stream against human genome. +Examples: ... | $script -g hg18 -h 1 - allowing for one mismatch. +Examples: ... | $script -g hg18 -h 5p - allowing for 5% mismatches. +Examples: ... | $script -g hg18 -e 2 - allowing for 2 mismatches or indels. +Examples: ... | $script -g hg18 -e 10p - allowing for 10% mismatches or indels. + diff --git a/bp_usage/write_2bit b/bp_usage/write_2bit new file mode 100644 index 0000000..fec079d --- /dev/null +++ b/bp_usage/write_2bit @@ -0,0 +1,23 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: March 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Write sequences in 2bit format. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-N | --no_mask] - Ignore soft masking. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -x -o test.2bit - Write entries to file 'test.2bit'. + +Keys in: SEQ_NAME - Sequence name. +Keys in: Q_ID - Used as sequence name if no SEQ_NAME. +Keys in: SEQ - Sequence. 
diff --git a/bp_usage/write_align b/bp_usage/write_align new file mode 100644 index 0000000..d189686 --- /dev/null +++ b/bp_usage/write_align @@ -0,0 +1,26 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Write aligned sequences in pretty alignment format. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-w | --wrap=] - Wrap sequences to a given width. +Options: [-R | --no_ruler] - Suppress ruler for multiple alignments. +Options: [-C | --no_consensus] - Suppress consensus for multiple alignments. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT + +Examples: ... | $script -x -w 80 - Write entries wrapped to blocks of 80 to STDOUT. +Examples: ... | $script -x -o test.aln - Write entries to file 'test.aln'. + +Keys in: SEQ_NAME - Sequence name. +Keys in: SEQ - Sequence. +Keys in: ALIGN - Number specifying what alignment the sequence belongs to. diff --git a/bp_usage/write_bed b/bp_usage/write_bed new file mode 100644 index 0000000..a71f9b7 --- /dev/null +++ b/bp_usage/write_bed @@ -0,0 +1,28 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Write records from stream as BED lines. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT +Options: [-Z | --compress] - Compress output using gzip. 
+ +Examples: ... | $script -x -o table.bed - Output data to file 'table.bed'. +Examples: ... | $script -Z -o table.bed.gz - Output zipped data to file 'table.bed.gz'. + +Keys in: REC_TYPE - Either BED,PSL,PATSCAN,BLAST,VMATCH, otherwise generic. +Keys in: CHR - Chromosome. +Keys in: CHR_BEG - Chromosome begin position. +Keys in: CHR_END - Chromosome end position. +Keys in: Q_ID - Feature name. +Keys in: SCORE - Score. +Keys in: STRAND - Strand. diff --git a/bp_usage/write_blast b/bp_usage/write_blast new file mode 100644 index 0000000..c9d937e --- /dev/null +++ b/bp_usage/write_blast @@ -0,0 +1,36 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Write BLAST records from stream in BLAST tabular format (-m8 and 9). + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-c | --comment] - Print comment line - Default=no +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT +Options: [-Z | --compress] - Compress output using gzip. + +Examples: ... | $script -x -c -o blast.tab - Write BLAST table with comment line to 'blast.tab'. +Examples: ... | $script -x -Z -o blast.tab.gz - Write zipped BLAST table to 'blast.tab.gz'. + +Keys in: Q_ID - Query ID. +Keys in: S_ID - Subject ID. +Keys in: IDENT - Identity. +Keys in: ALIGN_LEN - Alignment length. +Keys in: MISMATCHES - Mismatches. +Keys in: GAPS - Gaps. +Keys in: Q_BEG - Query begin. +Keys in: Q_END - Query end. +Keys in: S_BEG - Subject begin. +Keys in: S_END - Subject end. +Keys in: E_VAL - Expect value. +Keys in: BIT_SCORE - Bit score. +Keys in: STRAND - Strand. +Keys in: REC_TYPE - Record type must be BLAST. 
diff --git a/bp_usage/write_fasta b/bp_usage/write_fasta new file mode 100644 index 0000000..8c2fbda --- /dev/null +++ b/bp_usage/write_fasta @@ -0,0 +1,26 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Write sequences in FASTA format. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-w | --wrap=] - Wrap sequences to a given width. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT +Options: [-Z | --compress] - Compress output using gzip. + +Examples: ... | $script -x -w 80 - Write entries wrapped to blocks of 80 to STDOUT. +Examples: ... | $script -x -o test.fna - Write entries to file 'test.fna'. +Examples: ... | $script -x -Z -o test.fna.gz - Write zipped entries to file 'test.fna.gz'. + +Keys in: SEQ_NAME - Sequence name. +Keys in: Q_ID - Used as sequence name if no SEQ_NAME. +Keys in: SEQ - Sequence. diff --git a/bp_usage/write_psl b/bp_usage/write_psl new file mode 100644 index 0000000..97df85b --- /dev/null +++ b/bp_usage/write_psl @@ -0,0 +1,45 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Write records from stream in PSL format. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT +Options: [-Z | --compress] - Compress output using gzip. + +Examples: ... 
| $script -x -o table.psl - Output data to file 'table.psl'. +Examples: ... | $script -Z -o table.psl.gz - Output zipped data to file 'table.psl.gz'. + +Keys in: MATCHES - Number of non-repeat matches. +Keys in: MISMATCHES - Number of mismatches. +Keys in: REPMATCHES - Number of repeat matches. +Keys in: NCOUNT - Number of Ns. +Keys in: QNUMINSERT - Number of inserts in query. +Keys in: QBASEINSERT - Number of bases inserted in query. +Keys in: SNUMINSERT - Number of inserts in subject. +Keys in: SBASEINSERT - Number of bases inserted in subject. +Keys in: STRAND - Strand. +Keys in: Q_ID - Query ID. +Keys in: Q_LEN - Query length. +Keys in: Q_BEG - Query begin. +Keys in: Q_END - Query end. +Keys in: S_ID - Subject ID. +Keys in: S_LEN - Subject length. +Keys in: S_BEG - Subject begin. +Keys in: S_END - Subject end. +Keys in: BLOCKCOUNT - Block count. +Keys in: BLOCKSIZES - Block sizes. +Keys in: Q_BEGS - Query sequence blocks begins. +Keys in: S_BEGS - Subject sequence blocks begins. +Keys in: SCORE - Score calculated as in web BLAT results. +Keys in: REC_TYPE - Record type. + diff --git a/bp_usage/write_solid b/bp_usage/write_solid new file mode 100644 index 0000000..15a19c4 --- /dev/null +++ b/bp_usage/write_solid @@ -0,0 +1,26 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: April 2008 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Write di-base encoded Solid sequences. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-w | --wrap=] - Wrap sequences to a given width. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT +Options: [-Z | --compress] - Compress output using gzip. + +Examples: ... 
| $script -x -w 80 - Write entries wrapped to blocks of 80 to STDOUT. +Examples: ... | $script -x -o test.solid - Write entries to file 'test.solid'. +Examples: ... | $script -x -Z -o test.solid.gz - Write zipped entries to file 'test.solid.gz'. + +Keys in: SEQ_NAME - Sequence name. +Keys in: Q_ID - Used as sequence name if no SEQ_NAME. +Keys in: SEQ - Sequence. diff --git a/bp_usage/write_tab b/bp_usage/write_tab new file mode 100644 index 0000000..f8da7f6 --- /dev/null +++ b/bp_usage/write_tab @@ -0,0 +1,26 @@ +Author: Martin Asser Hansen - Copyright (C) - All rights reserved + +Contact: mail@maasha.dk + +Date: August 2007 + +License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) + +Description: Write records from stream as tab separated table. + +Usage: ... | $script [options] + +Options: [-x | --no_stream] - Do not emit records. +Options: [-o | --data_out=] - Write result to file. +Options: [-c | --comment] - Print comment line - Default=no +Options: [-d | --delimit=] - Changes delimiter - Default='\t' +Options: [-k | --keys=] - Comma separated list of keys to print in that order. +Options: [-K | --no_keys=] - Comma separated list of keys to ignore. +Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN +Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT +Options: [-Z | --compress] - Compress output using gzip. + +Examples: ... | $script -c -o table.csv - Output tabular data to file 'table.csv' with comment line. +Examples: ... | $script -k SEQ_NAME,SEQ -x - Output tabular data for columns SEQ_NAME and SEQ only. +Examples: ... | $script -d ',' -K SEQ - Output comma separated data ignoring SEQ column. +Examples: ... | $script -Z -o test.tab.gz - Output zipped tabular data to file 'test.tab.gz'. 
diff --git a/code_c/Maasha/src/Makefile b/code_c/Maasha/src/Makefile new file mode 100644 index 0000000..c5bf7ba --- /dev/null +++ b/code_c/Maasha/src/Makefile @@ -0,0 +1,58 @@ +CC = gcc +# Cflags = -Wall +# Cflags = -Wall -g -O0 # for valgrind +Cflags = -Wall -g -pg # for gprof + +INC_DIR = inc/ +LIB_DIR = lib/ + +INC = -I $(INC_DIR) +LIB = -lm $(LIB_DIR)*.o + +all: libs test test_revcomp fasta_count test_fasta test_oligo2bin repeat-O-matic test_list test_hash test_split test_binary_search test_biotools test_file_buffer test_bed + +libs: + cd $(LIB_DIR) && ${MAKE} all + +test: test.c + $(CC) $(Cflags) $(INC) $(LIB) test.c -o test + +test_revcomp: test_revcomp.c + $(CC) $(Cflags) $(INC) $(LIB) test_revcomp.c -o test_revcomp + +fasta_count: fasta_count.c + $(CC) $(Cflags) $(INC) $(LIB) fasta_count.c -o fasta_count + +test_fasta: test_fasta.c + $(CC) $(Cflags) $(INC) $(LIB) test_fasta.c -o test_fasta + +test_oligo2bin: test_oligo2bin.c + $(CC) $(Cflags) $(INC) $(LIB) test_oligo2bin.c -o test_oligo2bin + +repeat-O-matic: repeat-O-matic.c + $(CC) $(Cflags) $(INC) $(LIB) repeat-O-matic.c -o repeat-O-matic + +test_list: test_list.c + $(CC) $(Cflags) $(INC) $(LIB) test_list.c -o test_list + +test_hash: test_hash.c + $(CC) $(Cflags) $(INC) $(LIB) test_hash.c -o test_hash + +test_split: test_split.c + $(CC) $(Cflags) $(INC) $(LIB) test_split.c -o test_split + +test_binary_search: test_binary_search.c + $(CC) $(Cflags) $(INC) $(LIB) test_binary_search.c -o test_binary_search + +test_biotools: test_biotools.c + $(CC) $(Cflags) $(INC) $(LIB) test_biotools.c -o test_biotools + +test_file_buffer: test_file_buffer.c $(LIB_DIR)filesys.c + $(CC) $(Cflags) $(INC) $(LIB) test_file_buffer.c -o test_file_buffer + +test_bed: test_bed.c $(LIB_DIR)ucsc.c + $(CC) $(Cflags) $(INC) $(LIB) test_bed.c -o test_bed + +clean: + cd $(LIB_DIR) && ${MAKE} clean + rm -f test test_revcomp fasta_count test_fasta test_oligo2bin repeat-O-matic test_list test_hash test_split test_binary_search 
test_biotools test_file_buffer test_bed diff --git a/code_c/Maasha/src/fasta_count b/code_c/Maasha/src/fasta_count new file mode 100755 index 0000000..1a5b8f0 Binary files /dev/null and b/code_c/Maasha/src/fasta_count differ diff --git a/code_c/Maasha/src/fasta_count.c b/code_c/Maasha/src/fasta_count.c new file mode 100644 index 0000000..0296d55 --- /dev/null +++ b/code_c/Maasha/src/fasta_count.c @@ -0,0 +1,40 @@ +#include "common.h" +#include "filesys.h" +#include "fasta.h" + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MAIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */ + + +int main( int argc, char *argv[] ) +{ + int i; + int count; + int total; + FILE *fp; + + count = 0; + total = 0; + + for ( i = 1; argv[ i ]; i++ ) + { + fp = read_open( argv[ i ] ); + + count = fasta_count( fp ); + + close_stream( fp ); + + printf( "%s: %d\n", argv[ i ], count ); + + total += count; + } + + if ( total > count ) { + printf( "total: %d\n", total ); + } + + return 0; +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */ diff --git a/code_c/Maasha/src/gmon.out b/code_c/Maasha/src/gmon.out new file mode 100644 index 0000000..efb143a Binary files /dev/null and b/code_c/Maasha/src/gmon.out differ diff --git a/code_c/Maasha/src/inc/common.h b/code_c/Maasha/src/inc/common.h new file mode 100644 index 0000000..073e3af --- /dev/null +++ b/code_c/Maasha/src/inc/common.h @@ -0,0 +1,113 @@ +/* Including standard libraries */ +#include +#include +#include +#include +#include +#include + +/* Define a shorthand for unsigned int */ +#define uint unsigned int + +/* Define a boolean type */ +#define bool char +#define TRUE 1 +#define FALSE 0 + +/* Macro for resetting a pointer to all \0's. */ +#define ZERO( pt ) ( memset( pt, '\0', sizeof( *pt ) ) ) + +/* Macro for dynamic allocation of memory. */ +#define MEM_GET( pt ) ( pt = mem_get( sizeof( *pt ) ) ) + +/* Macro for cloning a structure in memory. 
*/ +#define MEM_CLONE( pt ) mem_clone( pt, sizeof( ( pt )[ 0 ] ) ) + +/* Macros for determining min or max of two given values. */ +#define MAX( a, b ) a < b ? b : a +#define MIN( a, b ) a > b ? b : a + +/* Macros for abs and int functions. */ +#define ABS( x ) ( ( x ) < 0 ) ? -( x ) : ( x ) +#define INT( x ) ( int ) x + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> STRUCTURE DECLARATIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +/* Singly linked list with a pointer to the next element and a pointer to a value. */ +struct list +{ + struct list *next; + void *val; +}; + +/* Singly linked list with a pointer to the next element and an integer value. */ +struct list_int +{ + struct list *next; + int val; +}; + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ERROR HANDLING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +/* Print error message to stderr and exit. */ +void die( char *error_msg ); + +/* Print warning message to stderr. */ +void warn( char *warn_msg ); + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MEMORY HANDLING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +/* Get a pointer with a given size of allocated memory. */ +void *mem_get( size_t size ); + +/* Get a pointer with a given size of allocated and zero'ed memory. */ +void *mem_get_zero( size_t size ); + +/* Resize allocated memory for a given pointer. */ +void *mem_resize( void* pt, size_t size ); + +/* Resize allocated memory for a given pointer with extra memory zero'ed. */ +void *mem_resize_zero( void* pt, size_t old_size, size_t new_size ); + +/* Clone a structure in memory and return a pointer to the clone. */ +void *mem_clone( void *old_pt, size_t size ); + +/* Free memory from a given pointer. */ +void mem_free( void *pt ); + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ARRAYS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +/* Binary search an array of integers for an integer value. 
*/ +bool binary_search_array( int *array, int array_size, int val ); + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MISC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +/* Remove the last char from a string. */ +void chop( char *string ); + +/* Remove the last char from a string if the char is a newline (safer than chop). */ +void chomp( char *string ); + +/* Split a given line and a delimiter return the split result as a list. */ +void split( char *string, char delimit, struct list **fields ); + +/* Mockup version of Perl substr. */ +char *substr( char *string, int offset, int len ); + +/* Return a binary number as a string of 1's and 0's. */ +char *bits2string( uint bin ); + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + diff --git a/code_c/Maasha/src/inc/fasta.h b/code_c/Maasha/src/inc/fasta.h new file mode 100644 index 0000000..6151876 --- /dev/null +++ b/code_c/Maasha/src/inc/fasta.h @@ -0,0 +1,27 @@ +#define FASTA_BUFFER 256 * 1024 + +/* Structure of a sequence entry. */ +struct seq_entry +{ + char *seq_name; + char *seq; + size_t seq_len; +}; + +/* Count all entries in a FASTA file given a file pointer. */ +uint fasta_count( FILE *fp ); + +/* Get next sequence entry from a FASTA file given a file pointer. */ +bool fasta_get_entry( FILE *fp, struct seq_entry *entry ); + +/* Output a sequence entry in FASTA format. */ +void fasta_put_entry( struct seq_entry *entry ); + +/* Get all sequence entries from a FASTA file in a list. */ +void fasta_get_entries( FILE *fp, struct list **entries ); + +/* Output all sequence entries from a list in FASTA format. */ +void fasta_put_entries( struct list *entries ); + +/* Deallocates memory from a seq_entry. 
*/ +void fasta_free_entry( struct seq_entry *entry ); diff --git a/code_c/Maasha/src/inc/filesys.h b/code_c/Maasha/src/inc/filesys.h new file mode 100644 index 0000000..87e5e89 --- /dev/null +++ b/code_c/Maasha/src/inc/filesys.h @@ -0,0 +1,64 @@ +//#define FILE_BUFFER_SIZE 64 * 1024 +#define FILE_BUFFER_SIZE 1024 * 1024 + +struct file_buffer +{ + FILE *fp; /* file pointer */ + char *str; /* the buffer string */ + size_t pos; /* index pointing to last position where some token was found */ + size_t use; /* index indicating how much of the buffer is scanned */ + size_t end; /* end position of buffer */ + size_t size; /* default buffer size */ + bool eof; /* flag indicating that buffer reached EOF */ +}; + +/* Read-open a file and return a file pointer. */ +FILE *read_open( char *file ); + +/* Write-open a file and return a file pointer. */ +FILE *write_open( char *file ); + +/* Append-open a file and return a file pointer. */ +FILE *append_open( char *file ); + +/* Close a stream defined by a file pointer. */ +void close_stream( FILE *fp ); + +/* Read in len number of bytes from the current position of a */ +/* file pointer into a string that is allocated and null terminated. */ +char *file_read( FILE *fp, size_t len ); + +/* Delete a file. */ +void file_unlink( char *file ); + +/* Rename a file. */ +void file_rename( char *old_name, char *new_name ); + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FILE BUFFER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +/* Opens a file for reading and loads a new buffer.*/ +struct file_buffer *read_open_buffer( char *file ); + +/* Get the next char from a file buffer, which is resized if necessary, until EOF.*/ +char buffer_getc( struct file_buffer *buffer ); + +/* Get the next line that is terminated by \n or EOF from a file buffer. */ +char *buffer_gets( struct file_buffer *buffer ); + +/* Increases buffer size until it is larger than len. 
*/ +void buffer_new_size( struct file_buffer *buffer, int len ); + +/* Resize file buffer discarding any old buffer before offset, */ +/* and merge remaining old buffer with a new chunk of buffer. */ +void buffer_resize( struct file_buffer *buffer ); + +/* Deallocates memory and close stream used by file buffer. */ +void buffer_destroy( struct file_buffer *buffer ); + +/* Debug function that prints the content of a file_buffer. */ +void buffer_print( struct file_buffer *buffer ); + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ diff --git a/code_c/Maasha/src/inc/hash.h b/code_c/Maasha/src/inc/hash.h new file mode 100644 index 0000000..5c26d34 --- /dev/null +++ b/code_c/Maasha/src/inc/hash.h @@ -0,0 +1,40 @@ +/* Structure of a generic hash. */ +struct hash +{ + struct hash_elem **table; + uint mask; + int table_size; + int elem_count; +}; + +/* Structure of a generic hash element. */ +struct hash_elem +{ + struct hash_elem *next; + char *key; + void *val; +}; + +/* Initialize a new generic hash structure. */ +struct hash *hash_new( size_t size ); + +/* Hash function that generates a hash key, */ +uint hash_key( char *string ); + +/* Add a new hash element consisting of a key/value pair to an existing hash. */ +void hash_add( struct hash *myhash, char *key, void *val ); + +/* Lookup a key in a given hash and return the value - or NULL if not found. */ +void *hash_get( struct hash *myhash, char *key ); + +/* Lookup a key in a given hash and return the hash element - or NULL if not found. */ +struct hash_elem *hash_get_elem( struct hash *myhash, char *key ); + +/* Remove key/value pair from a given hash. Returns true if a remove was successful. */ +bool hash_del( struct hash *myhash, char *key ); + +/* Deallocate memory for hash and all hash elements. */ +void hash_destroy( struct hash *myhash ); + +/* Output some collision stats for a given hash. 
*/ +void hash_collision_stats( struct hash *myhash ); diff --git a/code_c/Maasha/src/inc/list.h b/code_c/Maasha/src/inc/list.h new file mode 100644 index 0000000..7e3850a --- /dev/null +++ b/code_c/Maasha/src/inc/list.h @@ -0,0 +1,21 @@ +/* Add a new singly linked list element with a pointer. */ +void list_add( struct list **list_ppt, void *val ); + +/* Add a new singly linked list element with an integer. */ +void list_add_int( struct list_int **list_ppt, int val ); + +/* Reverse the order of elements in a singly linked list. */ +void list_reverse( void *old_list ); + +/* Check if a given string exists in a singly linked list. */ +bool list_exists( struct list *list_pt, char *string ); + +/* Check if a given integer exists in a singly linked list. */ +bool list_exists_int( struct list_int *list_pt, int val ); + +/* Free memory for all elements of a singly linked list. */ +void list_free( void *list_pt ); + +/* Debug function to print all elements from a singly linked list. */ +void list_print( struct list *list_pt ); + diff --git a/code_c/Maasha/src/inc/seq.h b/code_c/Maasha/src/inc/seq.h new file mode 100644 index 0000000..ef7e032 --- /dev/null +++ b/code_c/Maasha/src/inc/seq.h @@ -0,0 +1,59 @@ +/* Macro to test if a given char is DNA. */ +#define dna_clean( c ) ( c == 'A' || c == 'a' || c == 'T' || c == 't' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0 + +/* Macro to test if a given char is RNA. */ +#define rna_clean( c ) ( c == 'A' || c == 'a' || c == 'U' || c == 'u' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0 + +/* Uppercase sequence. */ +void uppercase_seq( char *seq ); + +/* Lowercase sequence. */ +void lowercase_seq( char *seq ); + +/* Reverse complements DNA sequence. */ +void revcomp_dna( char *seq ); + +/* Reverse complements RNA sequence. */ +void revcomp_rna( char *seq ); + +/* Reverse complement nucleotide sequence after guessing the sequence type. 
*/ +void revcomp_nuc( char *seq ); + +/* Complement DNA sequence. (NB it is not reversed!). */ +void complement_dna( char *seq ); + +/* Complement RNA sequence. (NB it is not reversed!). */ +void complement_rna( char *seq ); + +/* Complement nucleotide sequence after guessing the sequence type. */ +void complement_nuc( char *seq ); + +/* Reverse sequence. */ +void reverse( char *seq ); + +/* Convert all non-nucleotide letters to Ns. */ +void seq2nuc_simple( char *seq ); + +/* Convert DNA into RNA by change t and T to u and U, respectively. */ +void dna2rna( char *seq ); + +/* Convert RNA into DNA by change u and U to t and T, respectively. */ +void rna2dna( char *seq ); + +/* Check if a sequence is DNA by inspecting the first 100 residues. */ +bool is_dna( char *seq ); + +/* Check if a sequence is RNA by inspecting the first 100 residues. */ +bool is_rna( char *seq ); + +/* Check if a sequence is protein by inspecting the first 100 residues. */ +bool is_protein( char *seq ); + +/* Guess if a sequence is DNA, RNA, or protein by inspecting the first 100 residues. */ +char *seq_guess_type( char *seq ); + +/* Check if a sequence contain N or n. */ +bool contain_N( char *seq ); + +/* Pack a nucleotide oligo (max length 15) into a binary/integer (good for hash keys). 
*/ +int oligo2bin( char *oligo ); diff --git a/code_c/Maasha/src/inc/ucsc.h b/code_c/Maasha/src/inc/ucsc.h new file mode 100644 index 0000000..752b012 --- /dev/null +++ b/code_c/Maasha/src/inc/ucsc.h @@ -0,0 +1,28 @@ +#define BED_BUFFER 1024 + +struct bed_entry3 +{ + char *chr; + uint chr_beg; + uint chr_end; +}; + +struct bed_entry12 +{ + char *chr; + uint chr_beg; + uint chr_end; + char *q_id; + float score; + char strand; + uint thick_beg; + uint thick_end; + char *itemrgb; + uint blockcount; + char *blocksizes; + char *q_begs; +}; + +void bed_get_entry( FILE *fp, struct bed_entry3 *bed, int cols ); +void bed_split( char *string, struct bed_entry12 *bed, int cols ); + diff --git a/code_c/Maasha/src/lib/Makefile b/code_c/Maasha/src/lib/Makefile new file mode 100644 index 0000000..84dc3a3 --- /dev/null +++ b/code_c/Maasha/src/lib/Makefile @@ -0,0 +1,30 @@ +CC = gcc +# Cflags = -Wall +Cflags = -Wall -g -pg # gprof +INC_DIR = -I ../inc/ + +all: common.o seq.o filesys.o fasta.o list.o hash.o ucsc.o + +common.o: common.c + $(CC) $(Cflags) $(INC_DIR) -c common.c + +seq.o: seq.c + $(CC) $(Cflags) $(INC_DIR) -c seq.c + +filesys.o: filesys.c + $(CC) $(Cflags) $(INC_DIR) -c filesys.c + +fasta.o: fasta.c + $(CC) $(Cflags) $(INC_DIR) -c fasta.c + +list.o: list.c + $(CC) $(Cflags) $(INC_DIR) -c list.c + +hash.o: hash.c + $(CC) $(Cflags) $(INC_DIR) -c hash.c + +ucsc.o: ucsc.c + $(CC) $(Cflags) $(INC_DIR) -c ucsc.c + +clean: + rm common.o seq.o filesys.o fasta.o list.o hash.o ucsc.o diff --git a/code_c/Maasha/src/lib/biotools.c b/code_c/Maasha/src/lib/biotools.c new file mode 100644 index 0000000..e69de29 diff --git a/code_c/Maasha/src/lib/common.c b/code_c/Maasha/src/lib/common.c new file mode 100644 index 0000000..8203a2c --- /dev/null +++ b/code_c/Maasha/src/lib/common.c @@ -0,0 +1,322 @@ +#include "common.h" +#include "list.h" + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ERROR HANDLING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +void die( char *msg ) +{ + /* Martin A. 
Hansen, May 2008 */ + + /* Print error message and exits. */ + + fprintf( stderr, "ERROR: %s\n", msg ); + + exit( 1 ); +} + + +void warn( char *msg ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Print warning message (does not exit). */ + + fprintf( stderr, "WARNING: %s\n", msg ); +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MEMORY HANDLING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +void *mem_get( size_t size ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Allocate a given chunk of memory to a pointer that is returned. */ + + void *pt; + + if ( size == 0 ) { + die( "could not allocate 0 bytes of memory." ); + } else if ( ( pt = malloc( size ) ) == NULL ) { + die( "could not allocate memory." ); + } + + return pt; +} + + +void *mem_get_zero( size_t size ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Allocate a given chunk of zero'ed memory to a pointer that is returned. */ + + void *pt; + + if ( size == 0 ) { + die( "could not allocate 0 bytes of memory." ); + } else if ( ( pt = malloc( size ) ) == NULL ) { + die( "could not allocate memory." ); + } + + memset( pt, '\0', size ); + + return pt; +} + + +void *mem_resize( void *pt, size_t size ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Resize an allocated chunk of memory for a given pointer and new size. */ + + void *pt_new; + + if ( size == 0 ) { + die( "could not re-allocate 0 bytes of memory." ); + } else if ( ( pt_new = realloc( pt, size ) ) == NULL ) { + die( "could not re-allocate memory." ); + } + + return pt_new; +} + + +void *mem_resize_zero( void *pt, size_t old_size, size_t new_size ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Resize an allocated chunk of memory for a given pointer and zero any extra memory. */ + + void *pt_new; + + pt_new = mem_resize( pt, new_size ); + + if ( new_size > old_size ) { + memset( ( ( void * ) pt_new ) + old_size, '\0', new_size - old_size ); + } + + return pt_new; +} + + +void *mem_clone( void *old_pt, size_t size ) +{ + /* Martin A. 
Hansen, June 2008 */ + + /* Clone a structure in memory and return a pointer to the clone. */ + + void *new_pt; + + new_pt = mem_get( size ); + + memcpy( new_pt, old_pt, size ); + + return new_pt; +} + + +void mem_free( void *pt ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Free memory from a given pointer. */ + + if ( pt != NULL ) + { + free( pt ); + + pt = NULL; + } +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ARRAYS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +bool binary_search_array( int *array, int array_size, int val ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Binary search an array of integers for an integer value. */ + + int high; + int low; + int try; + + high = array_size; + low = 0; + + while ( low < high ) + { + try = ( ( high + low ) / 2 ); + + if ( val < array[ try ] ) { + high = try; + } else if ( val > array[ try ] ) { + low = try + 1; + } else { + return TRUE; + } + } + + return FALSE; +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MISC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +void chop( char *string ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Removes the last char from a string. */ + + int len; + + len = strlen( string ); + + string[ len - 1 ] = '\0'; +} + + +void chomp( char *string ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Removes the last char from a string if the char is a newline. */ + + int len; + + len = strlen( string ); + + if ( string[ len - 1 ] == '\n' ) { + string[ len - 1 ] = '\0'; + } +} + + +void split( char *string, char delimit, struct list **fields ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Split a given line and a delimiter return the split result as a list. 
*/ + + int i; + int j; + + char field[ 256 ] = ""; + char *field_copy; + + j = 0; + + for ( i = 0; string[ i ]; i++ ) + { + if ( string[ i ] != delimit ) + { + field[ j ] = string[ i ]; + + j++; + } + else + { + field_copy = mem_clone( field, j + 1 ); + + list_add( fields, field_copy ); + + ZERO( field ); + + j = 0; + } + } + + field_copy = mem_clone( field, j + 1 ); + + list_add( fields, field_copy ); + + list_reverse( fields ); +} + + +char *substr( char *string, int offset, int len ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Create equavalent of Perls substr command. */ + /* Currently implemented without optional length */ + /* and the replace feature. */ + + int string_len; + int i; + int j; + char *substr; + + string_len = strlen( string ); + + if ( offset < 0 ) { + die( "substr offset < 0." ); + } else if ( len < 0 ) { + die( "substr length < 0." ); + } else if ( offset > string_len ) { + die( "substr offset outside string." ); + } else if ( offset + len > string_len ) { + die( "substr offset + len outside string." ); + } + + substr = mem_get( len + 1 ); + + i = offset; + j = 0; + + while ( i < offset + len ) + { + substr[ j ] = string[ i ]; + + i++; + j++; + } + + substr[ j ] = '\0'; + + return substr; +} + + +char *bits2string( uint bin ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Return a binary number as a string of 1's and 0's. 
*/ + + int i; + uint j; + char *string; + + string = mem_get( ( sizeof( uint ) * 8 ) + 1 ); + + j = 1; + + for ( i = 0; i < sizeof( uint ) * 8; i++ ) + { + + if ( ( bin & j ) != 0 ) { + string[ 31 - i ] = '1'; + } else { + string[ 31 - i ] = '0'; + } + + j <<= 1; + } + + string[ i ] = '\0'; + + return string; +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ diff --git a/code_c/Maasha/src/lib/fasta.c b/code_c/Maasha/src/lib/fasta.c new file mode 100644 index 0000000..5a2ca7f --- /dev/null +++ b/code_c/Maasha/src/lib/fasta.c @@ -0,0 +1,188 @@ +#include "common.h" +#include "fasta.h" +#include "list.h" + + +uint fasta_count( FILE *fp ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Counts all entries in a FASTA file given a file pointer. */ + + char buffer[ FASTA_BUFFER ]; + uint count; + + count = 0; + + while ( ( fgets( buffer, sizeof( buffer ), fp ) ) != NULL ) + { + if ( buffer[ 0 ] == '>' ) { + count++; + } + } + + return count; +} + + +bool fasta_get_entry( FILE *fp, struct seq_entry *entry ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Get next sequence entry from a FASTA file given a file pointer. 
*/ + + int i; + size_t j; + size_t offset; + size_t seq_len; + char buffer[ FASTA_BUFFER ]; + int buffer_len; + char *seq_name = NULL; + char *seq = NULL; + + MEM_GET( entry ); + + offset = ftell( fp ); + + /* ---- Skip ahead until header line and include header ---- */ + + while ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) + { + buffer_len = strlen( buffer ); + + offset += buffer_len; + + if ( ( buffer[ 0 ] == '>' ) ) + { + seq_name = mem_get_zero( buffer_len - 1 ); + + for ( i = 1; i < buffer_len - 1; i++ ) { + seq_name[ i - 1 ] = buffer[ i ]; + } + + seq_name[ i ] = '\0'; + + break; + } + } + + /* ---- Determine length of sequence ---- */ + + seq_len = 0; + + while ( ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) ) + { + for ( i = 0; buffer[ i ]; i++ ) + { + if ( buffer[ i ] > 32 && buffer[ i ] < 127 ) { + seq_len++; + } + } + + if ( ( buffer[ 0 ] == '>' ) ) + { + seq_len -= strlen( buffer ) - 1; + + break; + } + } + + /* ---- Allocate memory for sequence ---- */ + + seq = mem_get_zero( seq_len + 1 ); + + /* ---- Rewind file pointer and read sequence ---- */ + + if ( fseek( fp, offset, SEEK_SET ) < 0 ) { + die( "fseek SEEK_SET failed." ); + } + + j = 0; + + while ( ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) ) + { + for ( i = 0; buffer[ i ]; i++ ) + { + if ( buffer[ i ] > 32 && buffer[ i ] < 127 ) + { + seq[ j ] = buffer[ i ]; + + if ( j == seq_len - 1 ) + { + seq[ j + 1 ] = '\0'; + + entry->seq_name = seq_name; + entry->seq = seq; + entry->seq_len = seq_len; + + return TRUE; + } + + j++; + } + } + } + + return FALSE; +} + + +void fasta_put_entry( struct seq_entry *entry ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Output a sequence entry in FASTA format. */ + printf( ">%s\n%s\n", entry->seq_name, entry->seq ); +} + + +void fasta_get_entries( FILE *fp, struct list **entries ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Given a file pointer to a FASTA file retreives all */ + /* sequence entries and insert those in a list. 
*/ + + struct seq_entry *entry; + + while ( 1 ) + { + MEM_GET( entry ); + + if ( ! fasta_get_entry( fp, entry ) ) { + break; + } + + list_add( entries, entry ); + } + + list_reverse( entries ); +} + + +void fasta_put_entries( struct list *entries ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Output a list of sequence entries as FASTA records. */ + + struct list *elem; + + for ( elem = entries; elem != NULL; elem = elem->next ) { + fasta_put_entry( elem->val ); + } +} + + +void fasta_free_entry( struct seq_entry *entry ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Deallocates memory from a seq_entry. */ + + mem_free( entry->seq_name ); + mem_free( entry->seq ); + mem_free( entry ); +} + + diff --git a/code_c/Maasha/src/lib/filesys.c b/code_c/Maasha/src/lib/filesys.c new file mode 100644 index 0000000..a7489a6 --- /dev/null +++ b/code_c/Maasha/src/lib/filesys.c @@ -0,0 +1,324 @@ +#include "common.h" +#include "filesys.h" + + +FILE *read_open( char *file ) +{ + /* Martin A. Hansen, November 2005 */ + + /* Given a file name, read-opens the file, */ + /* and returns a file pointer. */ + + FILE *fp; + char *msg; + + if ( ( fp = fopen( file, "r" ) ) == NULL ) + { + sprintf( msg, "Could not read-open file '%s'.", file ); + + die( msg ); + } + + return fp; +} + + +FILE *write_open( char *file ) +{ + /* Martin A. Hansen, November 2005 */ + + /* Given a file name, write-opens the file, */ + /* and returns a file pointer. */ + + FILE *fp; + char *msg; + + if ( ( fp = fopen( file, "w" ) ) == NULL ) + { + sprintf( msg, "Could not write-open file '%s'.", file ); + + die( msg ); + } + + return fp; +} + + +FILE *append_open( char *file ) +{ + /* Martin A. Hansen, November 2005 */ + + /* Given a file name, append-opens the file, */ + /* and returns a file pointer. 
*/ + + FILE *fp; + char *msg; + + if ( ( fp = fopen( file, "a" ) ) == NULL ) + { + sprintf( msg, "Could not append-open file '%s'.", file ); + + die( msg ); + } + + return fp; +} + + +void close_stream( FILE *fp ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Closes a stream or file associated with a given file pointer. */ + + if ( ( fclose( fp ) ) != 0 ) { + die( "Could not close stream." ); + } +} + + +char *file_read( FILE *fp, size_t len ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Read in len number of bytes from the current position of a */ + /* file pointer into a string that is allocated and null terminated. */ + + char *string; + + string = mem_get( len + 1 ); + + fread( string, len, 1, fp ); + + if ( ferror( fp ) != 0 ) { + die( "fread failed." ); + } + + string[ len + 1 ] = '\0'; + + return string; +} + + +void file_unlink( char *file ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Delete a file. */ + + char *msg; + + if ( unlink( file ) == -1 ) + { + sprintf( msg, "Could not delete file '%s'.", file ); + + die( msg ); + } +} + + +void file_rename( char *old_name, char *new_name ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Rename a file. */ + + char *msg; + + if ( rename( old_name, new_name ) == -1 ) + { + sprintf( msg, "Could not rename file '%s' -> '%s'.", old_name, new_name ); + + die( msg ); + } +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FILE BUFFER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ + + +struct file_buffer *read_open_buffer( char *file ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Opens a file for reading and loads a new buffer.*/ + + struct file_buffer *buffer; + FILE *fp; + char *str; + bool eof; + + MEM_GET( buffer ); + + fp = read_open( file ); + + str = file_read( fp, FILE_BUFFER_SIZE ); + + feof( fp ) ? 
( eof = TRUE ) : ( eof = FALSE ); + + buffer->fp = fp; + buffer->str = str; + buffer->pos = 0; + buffer->use = 0; + buffer->end = strlen( str ); + buffer->size = FILE_BUFFER_SIZE; + buffer->eof = eof; + + return buffer; +} + + +char buffer_getc( struct file_buffer *buffer ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Get the next char from a file buffer, which is resized if necessary, until EOF.*/ + + while ( 1 ) + { + if ( buffer->use == buffer->end ) + { + if ( buffer->eof ) + { + return '\0'; + } + else + { + buffer->pos = buffer->use; + buffer_new_size( buffer, buffer->use ); + buffer_resize( buffer ); + } + } + + return buffer->str[ buffer->use++ ]; + } +} + + +char *buffer_gets( struct file_buffer *buffer ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Get the next line that is terminated by \n or EOF from a file buffer. */ + + char *pt; + char *line; + size_t line_size; + + while ( 1 ) + { + if ( ( pt = memchr( &buffer->str[ buffer->use ], '\n', buffer->end - buffer->use ) ) != NULL ) + { + line_size = pt - &buffer->str[ buffer->use ] + 1; + + line = mem_get( line_size ); + + memcpy( line, &buffer->str[ buffer->use ], line_size ); + + line[ line_size ] = '\0'; + + buffer->use += line_size; + + buffer_new_size( buffer, line_size ); + + return line; + } + else + { + if ( buffer->eof ) { + return NULL; + } else { + buffer_resize( buffer ); + } + } + } +} + + +void buffer_new_size( struct file_buffer *buffer, int len ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Increases buffer size until it is larger than len. */ + + while ( buffer->size < len ) + { + buffer->size <<= 1; + + if ( buffer->size <= 0 ) { + die( "buffer_new_size failed." ); + } + } +} + + +void buffer_resize( struct file_buffer *buffer ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Resize file buffer. */ + + char *str; + size_t str_len; + size_t new_end; + + str = file_read( buffer->fp, buffer->size ); + + str_len = strlen( str ); + + feof( buffer->fp ) ? 
( buffer->eof = TRUE ) : ( buffer->eof = FALSE ); + + if ( buffer->pos != 0 ) + { + memmove( buffer->str, &buffer->str[ buffer->pos ], buffer->use - buffer->pos ); + + buffer->end -= buffer->pos; + buffer->use = 0; + buffer->pos = 0; + } + + new_end = buffer->end + str_len; + + buffer->str = mem_resize( buffer->str, new_end + 1 ); + + memcpy( &buffer->str[ buffer->end ], str, str_len ); + + buffer->str[ new_end + 1 ] = '\0'; + + buffer->end = new_end; + + mem_free( str ); +} + + +void buffer_destroy( struct file_buffer *buffer ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Deallocates memory and close stream used by file buffer. */ + + close_stream( buffer->fp ); + + mem_free( buffer->str ); + mem_free( buffer ); +} + + +void buffer_print( struct file_buffer *buffer ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Debug function that prints the content of a file_buffer. */ + + printf( "buffer: {\n" ); + printf( " pos : %lu\n", buffer->pos ); + printf( " use : %lu\n", buffer->use ); + printf( " end : %lu\n", buffer->end ); + printf( " eof : %d\n", buffer->eof ); + printf( " str : ->%s<-\n", buffer->str ); + printf( " str_len: %lu\n", strlen( buffer->str ) ); + printf( "}\n" ); +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ diff --git a/code_c/Maasha/src/lib/hash.c b/code_c/Maasha/src/lib/hash.c new file mode 100644 index 0000000..17e4531 --- /dev/null +++ b/code_c/Maasha/src/lib/hash.c @@ -0,0 +1,196 @@ +#include "common.h" +#include "hash.h" +#include "list.h" + + +struct hash *hash_new( size_t size ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Initialize a new generic hash structure. 
*/ + + struct hash *new_hash; + int table_size; + + MEM_GET( new_hash ); + + table_size = 1 << size; /* table_size = ( 2 ** size ) */ + + new_hash->table_size = table_size; + new_hash->mask = table_size - 1; + new_hash->table = mem_get( sizeof( struct hash_elem * ) * table_size ); + + new_hash->elem_count = 0; + + return new_hash; +} + + +void hash_add( struct hash *myhash, char *key, void *val ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Add a new hash element consisting of a key/value pair to an existing hash. */ + + struct hash_elem *old_elem; + struct hash_elem *new_elem; + int hash_index; + + if ( ( old_elem = hash_get_elem( myhash, key ) ) != NULL ) + { + old_elem->val = val; + } + else + { + MEM_GET( new_elem ); + + hash_index = ( hash_key( key ) & myhash->mask ); + + new_elem->key = mem_clone( key, strlen( key ) ); + new_elem->val = val; + new_elem->next = myhash->table[ hash_index ]; + + myhash->table[ hash_index ] = new_elem; + myhash->elem_count++; + } +} + + +void *hash_get( struct hash *myhash, char *key ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Lookup a key in a given hash and return the value - or NULL if not found. */ + + struct hash_elem *bucket; + + bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; + + while ( bucket != NULL ) + { + if ( strcmp( bucket->key, key ) == 0 ) { + return bucket->val; + } + + bucket = bucket->next; + } + + return NULL; +} + + +struct hash_elem *hash_get_elem( struct hash *myhash, char *key ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Lookup a key in a given hash and return the hash element - or NULL if not found. */ + + struct hash_elem *bucket; + + bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; + + while ( bucket != NULL ) + { + if ( strcmp( bucket->key, key ) == 0 ) { + return bucket; + } + + bucket = bucket->next; + } + + return NULL; +} + + +bool hash_del( struct hash *myhash, char *key ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Remove key/value pair from a given hash. 
*/ + /* Returns true if a remove was successful. */ + + struct hash_elem *bucket; + + bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; + + while ( bucket != NULL ) + { + if ( strcmp( bucket->key, key ) == 0 ) + { + myhash->elem_count--; + return TRUE; + } + + bucket = bucket->next; + } + + return FALSE; +} + + +void hash_destroy( struct hash *myhash ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Deallocate memory for hash and all hash elements. */ + + int i; + struct hash_elem *bucket; + + for ( i = 0; i < myhash->table_size; i++ ) + { + for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next ) + { + mem_free( bucket->key ); +// mem_free( bucket->val ); + mem_free( bucket ); + } + } + + mem_free( myhash->table ); + mem_free( myhash ); +} + + +uint hash_key( char *string ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Hash function that generates a hash key, */ + /* based on the Jim Kent's stuff. */ + + char *key = string; + uint result = 0; + int c; + + while ( ( c = *key++ ) != '\0' ) { + result += ( result << 3 ) + c; + } + + return result; +} + + +void hash_collision_stats( struct hash *myhash ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Output some collision stats for a given hash. */ + + /* Use with biotools: ... | plot_histogram -k Col -x */ + + int i; + int col; + struct hash_elem *bucket; + + for ( i = 0; i < myhash->table_size; i++ ) + { + col = 0; + + for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next ) { + col++; + } + + printf( "Col: %d\n---\n", col ); + } +} diff --git a/code_c/Maasha/src/lib/list.c b/code_c/Maasha/src/lib/list.c new file mode 100644 index 0000000..efc0f01 --- /dev/null +++ b/code_c/Maasha/src/lib/list.c @@ -0,0 +1,142 @@ +#include "common.h" +#include "list.h" + + +void list_add( struct list **list_ppt, void *val ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Add a new singly linked list element with a pointer. 
*/ + + struct list *elem = NULL; + + MEM_GET( elem ); + + elem->val = val; + elem->next = *( list_ppt ); + *( list_ppt ) = ( elem ); +} + + +void list_add_int( struct list_int **list_ppt, int val ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Add a new singly linked list element with a integer. */ + + struct list_int *elem = NULL; + + MEM_GET( elem ); + + elem->val = val; + elem->next = *( list_ppt ); + *( list_ppt ) = ( elem ); +} + + +void list_reverse( void *old_list ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Reverse the order of elements in a singly linked list. */ + + struct list **ppt = ( struct list ** ) old_list; + struct list *new_list = NULL; + struct list *elem; + struct list *next; + + next = *ppt; + + while ( next != NULL ) + { + elem = next; + next = elem->next; + elem->next = new_list; + new_list = elem; + } + + *ppt = new_list; +} + + +bool list_exists( struct list *list_pt, char *string ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Check if a given string exists in a singly linked list. */ + + struct list *elem; + + MEM_GET( elem ); + + for ( elem = list_pt; elem != NULL; elem = elem->next ) + { + if ( strcmp( elem->val, string ) == 0 ) { + return TRUE; + } + } + + return FALSE; +} + + +bool list_exists_int( struct list_int *list_pt, int val ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Check if a given integer exists in a singly linked list. */ + + struct list_int *elem; + + MEM_GET( elem ); + + for ( elem = list_pt; elem != NULL; elem = elem->next ) + { + if ( elem->val == val ) { + return TRUE; + } + } + + return FALSE; +} + + +void list_free( void *list_pt ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Free memory for all elements of a singly linked list. */ + + struct list **ppt = ( struct list ** ) list_pt; + struct list *next = *ppt; + struct list *elem; + + while ( next != NULL ) + { + elem = next; + next = elem->next; + mem_free( elem ); + } + + ppt = NULL; +} + + +void list_print( struct list *list_pt ) +{ + /* Martin A. 
Hansen, June 2008 */ + + /* Debug function to print all elements from a singly linked list. */ + + int i = 0; + + struct list *elem; + + for ( elem = list_pt; elem != NULL; elem = elem->next ) + { + printf( "elem %d: ->%s<-\n", i, ( char * ) elem->val ); + + i++; + } +} + diff --git a/code_c/Maasha/src/lib/seq.c b/code_c/Maasha/src/lib/seq.c new file mode 100644 index 0000000..c2a5e3e --- /dev/null +++ b/code_c/Maasha/src/lib/seq.c @@ -0,0 +1,500 @@ +#include "common.h" +#include "seq.h" + + +void uppercase_seq( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Uppercase a sequence in place. */ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) { + seq[ i ] = toupper( seq[ i ] ); + } +} + + +void lowercase_seq( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Lowercase a sequence in place. */ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) { + seq[ i ] = tolower( seq[ i ] ); + } +} + + +void revcomp_dna( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Reverse complement a DNA sequence in place. */ + + complement_dna( seq ); + reverse( seq ); +} + + +void revcomp_rna( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Reverse complement a RNA sequence in place. */ + + complement_rna( seq ); + reverse( seq ); +} + + +void revcomp_nuc( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Reverse complements a nucleotide sequence in place. */ + + complement_nuc( seq ); + reverse( seq ); +} + + +void complement_nuc( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Complements a nucleotide sequence, */ + /* after guess the type. */ + + if ( is_dna( seq ) ) { + complement_dna( seq ); + } else if ( is_rna( seq ) ) { + complement_rna( seq ); + } else { + die( "Complement nuc failed.\n" ); + } +} + + +void complement_dna( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Complements a DNA sequence including */ + /* ambiguity coded nucleotides. 
*/; + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 'a': seq[ i ] = 't'; break; + case 'A': seq[ i ] = 'T'; break; + case 'c': seq[ i ] = 'g'; break; + case 'C': seq[ i ] = 'G'; break; + case 'g': seq[ i ] = 'c'; break; + case 'G': seq[ i ] = 'C'; break; + case 't': seq[ i ] = 'a'; break; + case 'u': seq[ i ] = 'a'; break; + case 'T': seq[ i ] = 'A'; break; + case 'U': seq[ i ] = 'A'; break; + case 'm': seq[ i ] = 'k'; break; + case 'M': seq[ i ] = 'K'; break; + case 'r': seq[ i ] = 'y'; break; + case 'R': seq[ i ] = 'Y'; break; + case 'w': seq[ i ] = 'w'; break; + case 'W': seq[ i ] = 'W'; break; + case 's': seq[ i ] = 'S'; break; + case 'S': seq[ i ] = 'S'; break; + case 'y': seq[ i ] = 'r'; break; + case 'Y': seq[ i ] = 'R'; break; + case 'k': seq[ i ] = 'm'; break; + case 'K': seq[ i ] = 'M'; break; + case 'b': seq[ i ] = 'v'; break; + case 'B': seq[ i ] = 'V'; break; + case 'd': seq[ i ] = 'h'; break; + case 'D': seq[ i ] = 'H'; break; + case 'h': seq[ i ] = 'd'; break; + case 'H': seq[ i ] = 'D'; break; + case 'v': seq[ i ] = 'b'; break; + case 'V': seq[ i ] = 'B'; break; + case 'n': seq[ i ] = 'n'; break; + case 'N': seq[ i ] = 'N'; break; + default: break; + } + } +} + + +void complement_rna( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Complements an RNA sequence including */ + /* ambiguity coded nucleotides. 
*/; + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 'a': seq[ i ] = 'u'; break; + case 'A': seq[ i ] = 'U'; break; + case 'c': seq[ i ] = 'g'; break; + case 'C': seq[ i ] = 'G'; break; + case 'g': seq[ i ] = 'c'; break; + case 'G': seq[ i ] = 'C'; break; + case 't': seq[ i ] = 'a'; break; + case 'u': seq[ i ] = 'a'; break; + case 'T': seq[ i ] = 'A'; break; + case 'U': seq[ i ] = 'A'; break; + case 'm': seq[ i ] = 'k'; break; + case 'M': seq[ i ] = 'K'; break; + case 'r': seq[ i ] = 'y'; break; + case 'R': seq[ i ] = 'Y'; break; + case 'w': seq[ i ] = 'w'; break; + case 'W': seq[ i ] = 'W'; break; + case 's': seq[ i ] = 'S'; break; + case 'S': seq[ i ] = 'S'; break; + case 'y': seq[ i ] = 'r'; break; + case 'Y': seq[ i ] = 'R'; break; + case 'k': seq[ i ] = 'm'; break; + case 'K': seq[ i ] = 'M'; break; + case 'b': seq[ i ] = 'v'; break; + case 'B': seq[ i ] = 'V'; break; + case 'd': seq[ i ] = 'h'; break; + case 'D': seq[ i ] = 'H'; break; + case 'h': seq[ i ] = 'd'; break; + case 'H': seq[ i ] = 'D'; break; + case 'v': seq[ i ] = 'b'; break; + case 'V': seq[ i ] = 'B'; break; + case 'n': seq[ i ] = 'n'; break; + case 'N': seq[ i ] = 'N'; break; + default: break; + } + } +} + + +void reverse( char *string ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Reverses a string in place. */ + + char c; + size_t i; + size_t j; + + i = 0; + j = strlen( string ) - 1; + + while ( i <= j ) + { + c = string[ i ]; + + string[ i ] = string[ j ]; + string[ j ] = c; + + i++; + j--; + } +} + + +void seq2nuc_simple( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Uppercases all DNA letters, while transforming */ + /* all non-DNA letters in sequence to Ns. 
*/ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 'A': break; + case 'T': break; + case 'C': break; + case 'G': break; + case 'U': break; + case 'N': break; + case 'a': seq[ i ] = 'A'; break; + case 't': seq[ i ] = 'T'; break; + case 'c': seq[ i ] = 'C'; break; + case 'g': seq[ i ] = 'G'; break; + case 'u': seq[ i ] = 'U'; break; + default: seq[ i ] = 'N'; + } + } +} + + +void dna2rna( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Converts a DNA sequence to RNA by changing T and t to U and u. */ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 't': seq[ i ] = 'u'; break; + case 'T': seq[ i ] = 'U'; break; + default: break; + } + } +} + + +void rna2dna( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Converts a RNA sequence to RNA by changing T and u to T and t. */ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 'u': seq[ i ] = 't'; break; + case 'U': seq[ i ] = 'T'; break; + default: break; + } + } +} + + +bool is_dna( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Determines if a given sequence is DNA, */ + /* from inspection of the first 100 residues. */ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 'A': case 'a': break; + case 'G': case 'g': break; + case 'C': case 'c': break; + case 'T': case 't': break; + case 'R': case 'r': break; + case 'Y': case 'y': break; + case 'W': case 'w': break; + case 'S': case 's': break; + case 'M': case 'm': break; + case 'K': case 'k': break; + case 'H': case 'h': break; + case 'D': case 'd': break; + case 'V': case 'v': break; + case 'B': case 'b': break; + case 'N': case 'n': break; + case '-': break; + case '~': break; + case '_': break; + case '.': break; + default: return FALSE; + } + + if ( i == 100 ) { + break; + } + } + + return TRUE; +} + + +bool is_rna( char *seq ) +{ + /* Martin A. 
Hansen, May 2008 */ + + /* Determines if a given sequence is RNA, */ + /* from inspection of the first 100 residues. */ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 'A': case 'a': break; + case 'G': case 'g': break; + case 'C': case 'c': break; + case 'U': case 'u': break; + case 'R': case 'r': break; + case 'Y': case 'y': break; + case 'W': case 'w': break; + case 'S': case 's': break; + case 'M': case 'm': break; + case 'K': case 'k': break; + case 'H': case 'h': break; + case 'D': case 'd': break; + case 'V': case 'v': break; + case 'B': case 'b': break; + case 'N': case 'n': break; + case '-': break; + case '~': break; + case '_': break; + case '.': break; + default: return FALSE; + } + + if ( i == 100 ) { + break; + } + } + + return TRUE; +} + + +bool is_protein( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Determines if a given sequence is protein, */ + /* from inspection of the first 100 residues. */ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 'K': case 'k': break; + case 'R': case 'r': break; + case 'H': case 'h': break; + case 'D': case 'd': break; + case 'E': case 'e': break; + case 'S': case 's': break; + case 'T': case 't': break; + case 'N': case 'n': break; + case 'Q': case 'q': break; + case 'A': case 'a': break; + case 'V': case 'v': break; + case 'I': case 'i': break; + case 'L': case 'l': break; + case 'M': case 'm': break; + case 'F': case 'f': break; + case 'Y': case 'y': break; + case 'W': case 'w': break; + case 'C': case 'c': break; + case 'G': case 'g': break; + case 'P': case 'p': break; + case 'Z': case 'z': break; + case 'B': case 'b': break; + case 'X': case 'x': break; + case '*': break; + case '-': break; + case '~': break; + case '_': break; + case '.': break; + default: return FALSE; + } + + if ( i == 100 ) { + break; + } + } + + return TRUE; +} + + +char *seq_guess_type( char *seq ) +{ + /* Martin A. 
Hansen, May 2008 */ + + /* Guess the type of a given sequnce, */ + /* which is returned as a pointer to a string. */ + + char *type; + + type = mem_get( 8 ); + + if ( is_dna( seq ) ) { + type = "DNA"; + } else if ( is_rna( seq ) ) { + type = "RNA"; + } else if ( is_protein( seq ) ) { + type = "PROTEIN"; + } else { + die( "Could not guess sequence type.\n" ); + } + + return type; +} + + +bool contain_N( char *seq ) +{ + /* Martin A. Hansen, May 2008 */ + + /* Check if a sequence contain N or n residues. */ + + size_t i; + + for ( i = 0; seq[ i ]; i++ ) + { + switch ( seq[ i ] ) + { + case 'N': case 'n': return TRUE; + default: break; + } + } + + return FALSE; +} + + +int oligo2bin( char *oligo ) +{ + /* Martin A. Hansen, August 2004 */ + + /* Pack a max 15 nucleotide long oligo into a four byte integer. */ + + int i; + int bin; + + if ( strlen( oligo ) > 15 ) { + die( "Oligo will not fit in an integer." ); + } + + bin = 0; + + for ( i = 0; oligo[ i ]; i++ ) + { + bin <<= 2; + + switch ( oligo[ i ] ) + { + case 'A': case 'a': bin |= 0; break; + case 'N': case 'n': bin |= 0; break; + case 'T': case 't': bin |= 1; break; + case 'U': case 'u': bin |= 1; break; + case 'C': case 'c': bin |= 2; break; + case 'G': case 'g': bin |= 3; break; + default: die( "Unrecognized nucleotide." ); + } + } + + return bin; +} diff --git a/code_c/Maasha/src/lib/ucsc.c b/code_c/Maasha/src/lib/ucsc.c new file mode 100644 index 0000000..2bc6dba --- /dev/null +++ b/code_c/Maasha/src/lib/ucsc.c @@ -0,0 +1,120 @@ +#include "common.h" +#include "ucsc.h" + +void bed_get_entry( FILE *fp, struct bed_entry3 *bed, int cols ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Get next 3 column bed entry from stream. 
*/ + + char bed_buffer[ BED_BUFFER ]; + struct bed_entry12 *bed12 = NULL; + + MEM_GET( bed12 ); + + if ( ( fgets( bed_buffer, sizeof( bed_buffer ), fp ) != NULL ) ) + { + printf( "buffer: %s\n", bed_buffer ); + + bed_split( bed_buffer, bed12, 3 ); + + return; + } + + return NULL; +} + + +void bed_split( char *string, struct bed_entry12 *bed, int cols ) +{ + int i; + int field_num; + int offset; + char *new_line; + int new_line_pos; + char *pt; + int pos; + int field_seps[ cols ]; + int field_len; + char *field; + + if ( ( new_line = memchr( string, '\n', 1024 ) ) != NULL ) { + new_line_pos = new_line - string; + } else { + die( "bed_split: no newline found." ); + } + + field_num = 0; + offset = 0; + + for ( i = 0; i < cols; i++ ) + { + if ( ( pt = memchr( &string[ offset ], '\t', new_line_pos - offset ) ) != NULL ) + { + pos = pt - string; + + pos = MIN( pos, new_line_pos ); + + field_seps[ field_num ] = pos; + + field_num++; + } + else + { + die( "bed_split: no tab found." ); + } + + offset += pos + 1; + } + + offset = 0; + + for ( i = 0; i < cols; i++ ) + { + field_len = field_seps[ i ] - offset; + + field = mem_get( field_len ); + + field[ field_len ] = '\0'; + + memcpy( field, &string[ offset ], field_len ); + + if ( i == 0 ) { + bed->chr = mem_clone( ( char * ) field, field_len ); + } else if ( i == 1 ) { + bed->chr_beg = strtod( field, &pt ); + } else if ( i == 2 ) { + bed->chr_end = strtod( field, &pt ); + } else if ( i == 3 ) { + bed->q_id = mem_clone( ( char * ) field, field_len ); + } else if ( i == 4 ) { + bed->score = strtof( field, &pt ); + } else if ( i == 5 ) { + bed->strand = field[ 0 ]; + } else if ( i == 6 ) { + bed->thick_beg = strtod( field, &pt ); + } else if ( i == 7 ) { + bed->thick_end = strtod( field, &pt ); + } else if ( i == 8 ) { + bed->itemrgb = mem_clone( ( char * ) field, field_len ); + } else if ( i == 9 ) { + bed->blockcount = strtod( field, &pt ); + } else if ( i == 10 ) { + bed->blocksizes = mem_clone( ( char * ) field, field_len 
); + } else if ( i == 11 ) { + bed->q_begs = mem_clone( ( char * ) field, field_len ); + } + + if ( pt == NULL ) { + die( "bed parse failed." ); + } + + offset = field_seps[ i ] + 1; + } + + printf( "chr ->%s\n", bed->chr ); + printf( "chr_beg->%u\n", bed->chr_beg ); + printf( "chr_end->%u\n", bed->chr_end ); +} + + diff --git a/code_c/Maasha/src/repeat-O-matic b/code_c/Maasha/src/repeat-O-matic new file mode 100755 index 0000000..223d4b0 Binary files /dev/null and b/code_c/Maasha/src/repeat-O-matic differ diff --git a/code_c/Maasha/src/repeat-O-matic.c b/code_c/Maasha/src/repeat-O-matic.c new file mode 100644 index 0000000..905d298 --- /dev/null +++ b/code_c/Maasha/src/repeat-O-matic.c @@ -0,0 +1,270 @@ +/* + Copyright (C) 2008, Martin A. Hansen + + This program determines the repetiveness of a genome by determining + the number of identical 15-mers for each position in the genome. + + The output is a fixedStep file ala the phastCons files from the UCSC + Genome browser. + + It is very fast and efficient using less than 8 Gb of memory to + complete the human genome in roughly 30 minutes. +*/ + + + + +#include +#include +#include "common.h" +#include "filesys.h" +#include "fasta.h" + +#define OLIGO_SIZE 15 +#define SIZE ( 1 << ( OLIGO_SIZE * 2 ) ) + +#define UINT_BITS 32 +#define T 3 /* 11 on the rightmost two bits of bin. */ +#define C 1 /* 01 on the rightmost two bits of bin. */ +#define G 2 /* 10 on the rightmost two bits of bin. */ + +uint mask_create( int oligo_size ); +uint *oligo_count( char *path ); +void oligo_count_output( char *path, uint *array ); +void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size ); + +int main( int argc, char *argv[] ) +{ + char *path; + uint *array; + + path = argv[ 1 ]; + + array = oligo_count( path ); + + oligo_count_output( path, array ); + + return 0; +} + + +uint mask_create( int oligo_size ) +{ + /* Martin A. 
Hansen, June 2008 */ + + /* Create a bit mask for binary encode oligos less than sizeof( uint ). */ + + uint i; + uint mask; + + mask = 0; + + for ( i = 0; i < oligo_size; i++ ) + { + mask <<= 2; + + mask |= 3; + } + + return mask; +} + + +uint *oligo_count( char *path ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Count the occurence of all oligos of a fixed size in a FASTA file. */ + + uint *array; + uint i; + uint mask; + uint bin; + uint bin_rc1; + uint bin_rc2; + uint j; + uint A_rc = ( 3 << ( UINT_BITS - 2 ) ); /* 11 on the leftmost two bits an uint. */ + uint G_rc = ( 2 << ( UINT_BITS - 2 ) ); /* 10 on the leftmost two bits an uint. */ + uint C_rc = ( 1 << ( UINT_BITS - 2 ) ); /* 01 on the leftmost two bits an uint. */ + struct seq_entry *entry; + FILE *fp; + + array = mem_get_zero( sizeof( uint ) * SIZE ); + + mask = mask_create( OLIGO_SIZE ); + + MEM_GET( entry ); + + fp = read_open( path ); + + while ( ( fasta_get_entry( fp, entry ) ) ) + { + fprintf( stderr, "Counting oligos in: %s ... 
", entry->seq_name ); + + bin = 0; + bin_rc1 = 0; + j = 0; + + for ( i = 0; entry->seq[ i ]; i++ ) + { + bin <<= 2; + bin_rc1 >>= 2; + + switch( entry->seq[ i ] ) + { + case 'A': case 'a': bin_rc1 |= A_rc; j++; break; + case 'T': case 't': bin |= T; j++; break; + case 'C': case 'c': bin |= C; bin_rc1 |= G_rc; j++; break; + case 'G': case 'g': bin |= G; bin_rc1 |= C_rc; j++; break; + default: bin = 0; bin_rc1 = 0; j = 0; break; + } + + if ( j >= OLIGO_SIZE ) + { + array[ ( bin & mask ) ]++; + + bin_rc2 = bin_rc1; + + bin_rc2 >>= ( UINT_BITS - ( OLIGO_SIZE * 2 ) ); + + array[ ( bin_rc2 ) ]++; +/* + printf( "\n" ); + printf( "mask : %s\n", bits2string( mask ) ); + printf( "bin : %s\n", bits2string( bin ) ); + printf( "bin & mask: %s\n", bits2string( bin & mask ) ); + printf( "bin_rc1 : %s\n", bits2string( bin_rc1 ) ); + printf( "bin_rc2 : %s\n", bits2string( bin_rc2 ) ); +*/ + } + } + + fprintf( stderr, "done.\n" ); + } + + close_stream( fp ); + + fasta_free_entry( entry ); + + return array; +} + + +void oligo_count_output( char *path, uint *array ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Output oligo count for each sequence position. */ + + struct seq_entry *entry; + FILE *fp; + uint mask; + uint i; + uint j; + uint bin; + int count; + uint *block; + uint block_pos; + uint block_beg; + uint chr_pos; + + mask = mask_create( OLIGO_SIZE ); + + MEM_GET( entry ); + + fp = read_open( path ); + + while ( ( fasta_get_entry( fp, entry ) ) ) + { + fprintf( stderr, "Writing results for: %s ... 
", entry->seq_name ); + + bin = 0; + j = 0; + block_pos = 0; + block = mem_get_zero( sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE ) ); + + for ( i = 0; entry->seq[ i ]; i++ ) + { + bin <<= 2; + + switch( entry->seq[ i ] ) + { + case 'A': case 'a': j++; break; + case 'T': case 't': bin |= T; j++; break; + case 'C': case 'c': bin |= C; j++; break; + case 'G': case 'g': bin |= G; j++; break; + default: bin = 0; j = 0; break; + } + + if ( j >= OLIGO_SIZE ) + { + count = array[ ( bin & mask ) ]; + + if ( count > 1 ) + { + chr_pos = i - OLIGO_SIZE + 1; + + if ( block_pos == 0 ) + { + ZERO( block ); + + block_beg = chr_pos; + + block[ block_pos ] = count; + + block_pos++; + } + else + { + if ( chr_pos > block_beg + block_pos ) + { + fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos ); + + block_pos = 0; + } + else + { + block[ block_pos ] = count; + + block_pos++; + } + } + } + } + } + + if ( block_pos > 0 ) + { + fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos ); + + mem_free( block ); + } + + fprintf( stderr, "done.\n" ); + } + + close_stream( fp ); + + fasta_free_entry( entry ); +} + + +void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Outputs a block of fixedStep values. */ + + int i; + + if ( block_size > 0 ) + { + beg += 1; /* fixedStep format is 1 based. 
*/ + + printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size ); + + for ( i = 0; i < block_size; i++ ) { + printf( "%u\n", block_array[ i ] ); + } + } +} diff --git a/code_c/Maasha/src/test.c b/code_c/Maasha/src/test.c new file mode 100644 index 0000000..7a141d5 --- /dev/null +++ b/code_c/Maasha/src/test.c @@ -0,0 +1,61 @@ +#include "common.h" +#include "filesys.h" +#include "fasta.h" + +int main( int argc, char *argv[] ) +{ + char *file; + FILE *fp; + char chr[ 10 ]; + int chr_beg; + int chr_end; + + file = argv[ 1 ]; + + fp = read_open( file ); + + fscanf( "%s\t%d\t%d", fp, chr, chr_beg, chr_end ); + + print ( "CHR: %s CHR_BEG: %d CHR_END: %d\n", chr, chr_beg, chr_end ); + + close_stream( fp ); + + return 0; +} + + +/* + +int main( int argc, char *argv[] ) +{ + char *file; + FILE *fp; + struct seq_entry *entry = NULL; + int count; + + count = 0; + + file = argv[ 1 ]; + + fp = read_open( file ); + + while ( ( fasta_get_entry( fp, entry ) ) != FALSE ) + { + printf( "seq_name: %s\n", entry->seq_name ); + +// mem_free( entry->seq_name ); +// mem_free( entry->seq ); +// entry = NULL; + + count++; + } + + printf( "count: %d\n", count ); + + close_stream( fp ); + + return 0; +} + + +*/ diff --git a/code_c/Maasha/src/test2 b/code_c/Maasha/src/test2 new file mode 100755 index 0000000..1d6bc6f Binary files /dev/null and b/code_c/Maasha/src/test2 differ diff --git a/code_c/Maasha/src/test2.c b/code_c/Maasha/src/test2.c new file mode 100644 index 0000000..787a231 --- /dev/null +++ b/code_c/Maasha/src/test2.c @@ -0,0 +1,26 @@ +#include + +#define BUFFER 100 * 1024 + +int main( int argc, char *argv[] ) +{ + int count = 0; + char line[ BUFFER ]; + FILE *fp; + + if ( ( fp = fopen( argv[ 1 ], "r" ) ) == NULL ) + { + return 1; + } + + while ( ( fgets( line, BUFFER, fp ) ) != NULL ) + { + if ( line[ 0 ] == '>' ) { + count++; + } + } + + printf( "count: %d\n", count ); + + return 0; +} diff --git a/code_c/Maasha/src/test_bed b/code_c/Maasha/src/test_bed new file mode 
100755 index 0000000..da27613 Binary files /dev/null and b/code_c/Maasha/src/test_bed differ diff --git a/code_c/Maasha/src/test_bed.c b/code_c/Maasha/src/test_bed.c new file mode 100644 index 0000000..8738ac6 --- /dev/null +++ b/code_c/Maasha/src/test_bed.c @@ -0,0 +1,26 @@ +#include "common.h" +#include "filesys.h" +#include "ucsc.h" + + +int main( int argc, char *argv[] ) +{ + char *file; + FILE *fp; + struct bed_entry3 *bed; + int count; + + file = argv[ 1 ]; + + fp = read_open( file ); + + count = 0; + + bed_get_entry( fp, bed, 3 ); + + printf( "Count: %d\n", count ); + + close_stream( fp ); + + return 0; +} diff --git a/code_c/Maasha/src/test_binary_search b/code_c/Maasha/src/test_binary_search new file mode 100755 index 0000000..ddf04f5 Binary files /dev/null and b/code_c/Maasha/src/test_binary_search differ diff --git a/code_c/Maasha/src/test_binary_search.c b/code_c/Maasha/src/test_binary_search.c new file mode 100644 index 0000000..49d1065 --- /dev/null +++ b/code_c/Maasha/src/test_binary_search.c @@ -0,0 +1,17 @@ +#include +#include "common.h" + +int main() +{ + int size = 10; + int val = 40; + int array[ 10 ] = { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 }; + + if ( binary_search_array( array, size, val ) ) { + printf( "val->%d found in array\n", val ); + } else { + printf( "val->%d NOT found in array\n", val ); + } + + return 0; +} diff --git a/code_c/Maasha/src/test_biotools b/code_c/Maasha/src/test_biotools new file mode 100755 index 0000000..e316d43 Binary files /dev/null and b/code_c/Maasha/src/test_biotools differ diff --git a/code_c/Maasha/src/test_biotools.c b/code_c/Maasha/src/test_biotools.c new file mode 100644 index 0000000..5377ac3 --- /dev/null +++ b/code_c/Maasha/src/test_biotools.c @@ -0,0 +1,129 @@ +#include "common.h" +#include "filesys.h" +#include "hash.h" + +bool get_record( struct file_buffer *buffer, struct hash *record ); +void put_record( struct hash *record ); + +int main( int argc, char *argv[] ) +{ + int count; + char *file; + 
struct file_buffer *buffer = NULL; + struct hash *record = NULL; + + file = argv[ 1 ]; + + buffer = read_open_buffer( file ); + + record = hash_new( 5 ); + + count = 0; + + while ( ( get_record( buffer, record ) ) != FALSE ) + { + put_record( record ); + + count++; + } + + fprintf( stderr, "Count: %d\n", count ); + + hash_destroy( record ); + + buffer_destroy( buffer ); + + return 0; +} + + +bool get_record( struct file_buffer *buffer, struct hash *record ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Get next record from the stream. */ + + char *line = NULL; + char *val = NULL; + char key[ 256 ]; + int len; + int i; + bool key_ok; + + while ( ( line = buffer_gets( buffer ) ) ) + { + key_ok = FALSE; + + //printf( "LINE->%s<-", line ); + + if ( strcmp( line, "---\n" ) == 0 ) + { +// printf( "found\n" ); + + return TRUE; + } + else + { + len = strlen( line ); + + i = 0; + + while ( i < len ) + { + if ( i < len - 1 && line[ i ] == ':' && line[ i + 1 ] == ' ' ) + { + key_ok = TRUE; + + key[ i ] = '\0'; + + i += 2; + + break; + } + + key[ i ] = line[ i ]; + + i++; + } + + if ( ! key_ok ) { + die( "Could not locate key." ); + } + + val = mem_get( len - i ); + + memcpy( val, &line[ i ], len - i - 1 ); + + val[ len - i ] = '\0'; + +// printf( "key: ->%s<- val: ->%s<-\n", key, val ); + + hash_add( record, key, val ); + } + } + + return FALSE; +} + + +void put_record( struct hash *record ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Output a record to the stream. 
*/ + + int i; + struct hash_elem *bucket; + + for ( i = 0; i < record->table_size; i++ ) + { + for ( bucket = record->table[ i ]; bucket != NULL; bucket = bucket->next ) { + printf( "%s: %s\n", ( char * ) bucket->key, ( char * ) bucket->val ); + } + } + + printf( "---\n" ); +} + + + diff --git a/code_c/Maasha/src/test_fasta b/code_c/Maasha/src/test_fasta new file mode 100755 index 0000000..c8598a6 Binary files /dev/null and b/code_c/Maasha/src/test_fasta differ diff --git a/code_c/Maasha/src/test_fasta.c b/code_c/Maasha/src/test_fasta.c new file mode 100644 index 0000000..86a4698 --- /dev/null +++ b/code_c/Maasha/src/test_fasta.c @@ -0,0 +1,22 @@ +#include "common.h" +#include "filesys.h" +#include "list.h" +#include "fasta.h" + +int main() +{ + char *file = "/Users/m.hansen/test.fna"; + FILE *fp; + + struct list *entries = NULL; + + fp = read_open( file ); + + fasta_get_entries( fp, &entries ); + + fasta_put_entries( entries); + close_stream( fp ); + + return 0; +} + diff --git a/code_c/Maasha/src/test_file_buffer b/code_c/Maasha/src/test_file_buffer new file mode 100755 index 0000000..8753c66 Binary files /dev/null and b/code_c/Maasha/src/test_file_buffer differ diff --git a/code_c/Maasha/src/test_file_buffer.c b/code_c/Maasha/src/test_file_buffer.c new file mode 100644 index 0000000..85dcf95 --- /dev/null +++ b/code_c/Maasha/src/test_file_buffer.c @@ -0,0 +1,58 @@ +#include "common.h" +#include "filesys.h" + + +int main( int argc, char *argv[] ) +{ + struct file_buffer *buffer; + char *line; + char c; + + buffer = read_open_buffer( argv[ 1 ] ); + + while ( ( line = buffer_gets( buffer ) ) ) { + printf( "LINE->%s<-", line ); + } + +/* + while ( ( c = buffer_getc( buffer ) ) ) + { + if ( c == '\n' ) { + printf( "CHAR->\\n\n" ); + } else { + printf( "CHAR->%c\n", c ); + } + } +*/ + buffer_destroy( buffer ); + + return 0; +} + + + +/* +#define SUBSTR_SIZE 15 + +int main() +{ + char *string="foobarfoobarfoobarfoobarfoobarfoobarfoobarfoobar"; + char substr[ 
SUBSTR_SIZE + 1 ]; + int i; + int j; + + for ( i = 0; i < strlen( string ) - SUBSTR_SIZE + 1; i++ ) + { + for ( j = 0; j < SUBSTR_SIZE; j++ ) { + substr[ j ] = string[ i + j ]; + } + + substr[ j ] = '\0'; + + printf( "substr->%s\n", substr ); + }gg + + return 0; +} + +*/ diff --git a/code_c/Maasha/src/test_hash b/code_c/Maasha/src/test_hash new file mode 100755 index 0000000..706b4a7 Binary files /dev/null and b/code_c/Maasha/src/test_hash differ diff --git a/code_c/Maasha/src/test_hash.c b/code_c/Maasha/src/test_hash.c new file mode 100644 index 0000000..a1b8065 --- /dev/null +++ b/code_c/Maasha/src/test_hash.c @@ -0,0 +1,35 @@ +#include +#include "common.h" +#include "hash.h" + + +int main() +{ + struct hash *my_hash; + int pot; + int i; + char *dummy; + + pot = 16; + + my_hash = hash_new( pot ); + + for ( i = 1; i <= ( 1 << pot ); i++ ) + { + sprintf( dummy, "dummy_%d", i ); + + hash_add( my_hash, dummy, "FOO" ); + } + + hash_collision_stats( my_hash ); + +// if ( ( val = ( char * ) hash_get( my_hash, key ) ) != NULL ) { +// printf( "Key: %s, Val: %s\n", key, val ); +// } else { +// printf( "Key: %s, Val: Not Found\n", key ); +// } + + hash_destroy( my_hash ); + + return 0; +} diff --git a/code_c/Maasha/src/test_list b/code_c/Maasha/src/test_list new file mode 100755 index 0000000..66d8bf6 Binary files /dev/null and b/code_c/Maasha/src/test_list differ diff --git a/code_c/Maasha/src/test_list.c b/code_c/Maasha/src/test_list.c new file mode 100644 index 0000000..731d622 --- /dev/null +++ b/code_c/Maasha/src/test_list.c @@ -0,0 +1,42 @@ +#include +#include "common.h" +#include "list.h" + +int main() +{ +// struct list *list_pt; +// +// char *string1 = "Hello"; +// char *string2 = "World"; +// +// list_add( &list_pt, string1 ); +// list_add( &list_pt, string2 ); +// +// if ( list_exists( list_pt, "World" ) ) { +// printf( "Found\n" ); +// } else { +// printf( "Not Found\n" ); +// } +// +// list_free( &list_pt ); + + struct list_int *list_int_pt; + + int i = 4; + 
int j = 12; + + list_add_int( &list_int_pt, i ); + list_add_int( &list_int_pt, j ); + + if ( list_exists_int( list_int_pt, j ) ) { + printf( "Found\n" ); + } else { + printf( "Not Found\n" ); + } + + list_free( &list_int_pt ); + + return 0; +} + + diff --git a/code_c/Maasha/src/test_oligo2bin b/code_c/Maasha/src/test_oligo2bin new file mode 100755 index 0000000..d1f3408 Binary files /dev/null and b/code_c/Maasha/src/test_oligo2bin differ diff --git a/code_c/Maasha/src/test_oligo2bin.c b/code_c/Maasha/src/test_oligo2bin.c new file mode 100755 index 0000000..a840e02 --- /dev/null +++ b/code_c/Maasha/src/test_oligo2bin.c @@ -0,0 +1,18 @@ +#include +#include "common.h" +#include "seq.h" + + +int main() +{ + int bin; + + /* 123456789012345 */ + char *word = "GGGGGGGGGGGGGGG"; + + bin = oligo2bin( word ); + + printf( "bin->%d\n", bin ); + + return 0; +} diff --git a/code_c/Maasha/src/test_revcomp b/code_c/Maasha/src/test_revcomp new file mode 100755 index 0000000..2f4b8fa Binary files /dev/null and b/code_c/Maasha/src/test_revcomp differ diff --git a/code_c/Maasha/src/test_revcomp.c b/code_c/Maasha/src/test_revcomp.c new file mode 100644 index 0000000..13a4d4a --- /dev/null +++ b/code_c/Maasha/src/test_revcomp.c @@ -0,0 +1,18 @@ +#include +#include "common.h" +#include "seq.h" + +int main() +{ + char seq[] = "ACGACATCGGACTGACactgactgacatgcactg"; + + printf( "seq type: %s\n", seq_guess_type( seq ) ); + + printf( "before revcomp: %s\n", seq ); + + revcomp_nuc( seq ); + + printf( "after revcomp: %s\n", seq ); + + return 0; +} diff --git a/code_c/Maasha/src/test_split b/code_c/Maasha/src/test_split new file mode 100755 index 0000000..14d6f77 Binary files /dev/null and b/code_c/Maasha/src/test_split differ diff --git a/code_c/Maasha/src/test_split.c b/code_c/Maasha/src/test_split.c new file mode 100644 index 0000000..5ee41ee --- /dev/null +++ b/code_c/Maasha/src/test_split.c @@ -0,0 +1,19 @@ +#include +#include "common.h" +#include "list.h" + + +int main() +{ + char string[] 
= "FOO\tBAR\tFOOBAR\n"; + + struct list *fields; + + chomp( string ); + + split( string, '\t', &fields ); + + list_print( fields ); + + return 0; +} diff --git a/code_perl/Maasha/Align.pm b/code_perl/Maasha/Align.pm new file mode 100644 index 0000000..b0d270f --- /dev/null +++ b/code_perl/Maasha/Align.pm @@ -0,0 +1,685 @@ +package Maasha::Align; + +# Copyright (C) 2007 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines to perform and print pairwise and multiple alignments + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; +use Data::Dumper; +use IPC::Open2; +use Maasha::Common; +use Maasha::Fasta; +use Maasha::Calc; +use Maasha::Seq; +use vars qw ( @ISA @EXPORT ); + +use constant { + HEAD => 0, + SEQ => 1, +}; + +@ISA = qw( Exporter ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub align
+{
+    # Martin A. Hansen, August 2007.
+
+    # Aligns a given list of FASTA entries and returns a
+    # list of aligned sequences as FASTA entries.
+    # (currently uses Muscle, but other align engines can
+    # be used with a bit of tweaking).
+
+    my ( $entries,   # Fasta entries
+         $args,      # additional alignment program specific arguments - OPTIONAL
+       ) = @_;
+
+    # Returns a list.
+
+    my ( @aligned_entries, $muscle_args );
+
+    $muscle_args  = "-quiet";
+
+    # The extra arguments must be separated from -quiet by a space,
+    # otherwise the two are fused into one word on the command line.
+    $muscle_args .= " " . $args if $args;
+
+    @aligned_entries = &align_muscle( $entries, $muscle_args );
+
+    return wantarray ? @aligned_entries : \@aligned_entries;
+}
+
+
+sub align_muscle +{ + # Martin A. Hansen, June 2007. + + # Aligns a given list of FASTA entries using Muscle. + # Returns a list of aligned sequences as FASTA entries. + + my ( $entries, # FASTA entries + $args, # additional Muscle arguments - OPTIONAL + ) = @_; + + # Returns a list. + + my ( $pid, $fh_in, $fh_out, $cmd, $entry, @aligned_entries ); + + $cmd = "muscle"; + $cmd .= " " . $args if $args; + + $pid = open2( $fh_out, $fh_in, $cmd ); + + map { &Maasha::Fasta::put_entry( $_, $fh_in ) } @{ $entries }; + + close $fh_in; + + while ( $entry = &Maasha::Fasta::get_entry( $fh_out ) ) { + push @aligned_entries, $entry; + } + + close $fh_out; + + waitpid $pid, 0; + + return wantarray ? @aligned_entries : \@aligned_entries; +} + + +sub align_print_pairwise +{ + # Martin A. Hansen, June 2007. + + # Prints a given pairwise alignment in FASTA format. + + my ( $entry1, # first entry + $entry2, # second entry + $fh, # output filehandle - OPTIONAL + $wrap, # wrap width - OPTIONAL + ) = @_; + + # returns nothing + + my ( @entries, $ruler1, $ruler2, $pins ); + + $ruler1 = &align_ruler( $entry1, 1 ); + $ruler2 = &align_ruler( $entry2, 1 ); + $pins = &align_pins( $entry1, $entry2 ); + + push @entries, $ruler1, $entry1, $pins, $entry2, $ruler2; + + &align_print( \@entries, $fh, $wrap ); +}
+
+
+sub align_print_multi
+{
+    # Martin A. Hansen, June 2007.
+
+    # Prints a given multiple alignment in FASTA format.
+    # Unless switched off with the flags, a ruler entry is added in front
+    # of, and a consensus entry after, the given entries before printing.
+    # Note that these are added to the list passed by the caller.
+
+    my ( $entries,    # list of aligned FASTA entries
+         $fh,         # output filehandle    - OPTIONAL
+         $wrap,       # wrap width           - OPTIONAL
+         $no_ruler,   # omit ruler flag      - OPTIONAL
+         $no_cons,    # omit consensus flag  - OPTIONAL
+       ) = @_;
+
+    # returns nothing
+
+    my ( $ruler, $consensus );
+
+    $ruler     = &align_ruler( $entries->[ 0 ] );
+    $consensus = &align_consensus( $entries ) if not $no_cons;
+
+    unshift @{ $entries }, $ruler     if not $no_ruler;
+
+    # Only add the consensus when it was calculated - an undefined
+    # entry would otherwise be printed as an empty alignment row.
+    push    @{ $entries }, $consensus if not $no_cons;
+
+    &align_print( $entries, $fh, $wrap );
+}
+
+
+sub align_print +{ + # Martin A. Hansen, June 2007. + + # Prints an alignment. + + my ( $entries, # Alignment as FASTA entries + $fh, # output filehandle - OPTIONAL + $wrap, # wrap alignment - OPTIONAL + ) = @_; + + # returns nothing + + my ( $max, $blocks, $block, $entry ); + + $max = 0; + + map { $max = length $_->[ HEAD ] if length $_->[ HEAD ] > $max } @{ $entries }; + + $blocks = &align_wrap( $entries, $wrap ); + + foreach $block ( @{ $blocks } ) + { + foreach $entry ( @{ $block } ) + { + $entry->[ HEAD ] =~ s/stats|ruler|consensus//; + + if ( $fh ) { + print $fh $entry->[ HEAD ], " " x ( $max + 3 - length $entry->[ HEAD ] ), $entry->[ SEQ ], "\n"; + } else { + print $entry->[ HEAD ], " " x ( $max + 3 - length $entry->[ HEAD ] ), $entry->[ SEQ ], "\n"; + } + } + } +} + + +sub align_wrap +{ + # Martin A. Hansen, October 2005. + + # Given a set of fasta entries wraps these + # according to a given width.
+ + my ( $entries, # list of fasta_entries + $wrap, # wrap width - OPTIONAL + ) = @_; + + # returns AoA + + my ( $ruler, $i, $c, @lines, @blocks ); + + $wrap ||= 999999999; + + $i = 0; + + while ( $i < length $entries->[ 0 ]->[ SEQ ] ) + { + undef @lines; + + for ( $c = 0; $c < @{ $entries }; $c++ ) + { + if ( $entries->[ $c ]->[ HEAD ] eq "ruler" ) + { + $ruler = substr $entries->[ $c ]->[ SEQ ], $i, $wrap; + + if ( $ruler =~ /^(\d+)/ ) { + $ruler =~ s/^($1)/' 'x(length $1)/e; + } + + if ( $ruler =~ /(\d+)$/ ) { + $ruler =~ s/($1)$/' 'x(length $1)/e; + } + + push @lines, [ "ruler", $ruler ]; + } + else + { + push @lines, [ $entries->[ $c ]->[ HEAD ], substr $entries->[ $c ]->[ SEQ ], $i, $wrap ]; + } + } + + push @blocks, [ @lines ]; + + $i += $wrap; + } + + return wantarray ? @blocks: \@blocks; +} + + +sub align_pins +{ + # Martin A. Hansen, June 2007. + + # Given two aligned FASTA entries, generates an entry with pins. + + my ( $entry1, # first entry + $entry2, # second entry + $type, # residue type - OPTIONAL + ) = @_; + + # returns tuple + + my ( $blosum, $i, $char1, $char2, $pins ); + + $type ||= &Maasha::Seq::seq_guess_type( $entry1->[ SEQ ] ); + + $blosum = &blosum_read() if $type =~ /protein/; + + for ( $i = 0; $i < length $entry1->[ SEQ ]; $i++ ) + { + $char1 = substr $entry1->[ SEQ ], $i, 1; + $char2 = substr $entry2->[ SEQ ], $i, 1; + + if ( $blosum and $char1 eq $char2 ) { + $pins .= $char1; + } elsif ( $char1 eq $char2 ) { + $pins .= "|"; + } elsif ( $blosum and $blosum->{ $char1 }->{ $char2 } > 0 ) { + $pins .= "+"; + } else { + $pins .= " "; + } + } + + return wantarray ? ( "consensus", $pins ) : [ "consensus", $pins ]; +} + + +sub align_ruler +{ + # Martin A. Hansen, February 2007; + + # Gererates a ruler for a given FASTA entry (with indels). + + my ( $entry, # FASTA entry + $count_gaps, # flag for counting indels in pairwise alignments. 
+ ) = @_; + + # Returns tuple + + my ( $i, $char, $skip, $count, $gap, $tics ); + + $char = ""; + $gap = 0; + $i = 1; + + while ( $i <= length $entry->[ SEQ ] ) + { + $char = substr( $entry->[ SEQ ], $i - 1, 1 ) if $count_gaps; + + $gap++ if $char eq "-"; + + if ( $skip ) + { + $skip--; + } + else + { + $count = $i - $gap; + $count = 1 if $char eq "-"; + + if ( $count % 100 == 0 ) + { + if ( $count + length( $count ) >= length $entry->[ SEQ ] ) + { + $tics .= "|"; + } + else + { + $tics .= "|" . $count; + $skip = length $count; + } + } + elsif ( $count % 50 == 0 ) { + $tics .= ":"; + } elsif ( $count % 10 == 0 ) { + $tics .= "."; + } else { + $tics .= " "; + } + } + + $i++; + } + + return wantarray ? ( "ruler", $tics ) : [ "ruler", $tics ]; +} + + +sub align_consensus +{ + # Martin A. Hansen, June 2006. + + # Given an alignment as a list of FASTA entries, + # generates a consensus sequences based on the + # entropies for each column similar to the way + # a sequence logo i calculated. Returns the + # consensus sequence as a FASTA entry. + + my ( $entries, # list of aligned FASTA entries + $type, # residue type - OPTIONAL + $min_sim, # minimum similarity - OPTIONAL + ) = @_; + + # Returns tuple + + my ( $bit_max, $data, $pos, $char, $score, $entry ); + + $type ||= &Maasha::Seq::seq_guess_type( $entries->[ 0 ]->[ SEQ ] ); + $min_sim ||= 50; + + if ( $type =~ /protein/ ) { + $bit_max = 4; + } else { + $bit_max = 2; + } + + $data = &Maasha::Seq::seqlogo_calc( $bit_max, $entries ); + + foreach $pos ( @{ $data } ) + { + if ( $pos->[ -1 ] ) + { + ( $char, $score ) = @{ $pos->[ -1 ] }; + + if ( ( $score / $bit_max ) * 100 >= $min_sim ) { + $entry->[ SEQ ] .= $char; + } else { + $entry->[ SEQ ] .= "-"; + } + } + else + { + $entry->[ SEQ ] .= "-"; + } + } + + $entry->[ HEAD ] = "Consensus: $min_sim%"; + + return wantarray ? @{ $entry } : $entry; +} + + +sub align_sim_global +{ + # Martin A. Hansen, June 2007. 
+ + # Calculate the global similarity of two aligned entries + # The similarity is calculated as the number of matching + # residues divided by the length of the shortest sequence. + + my ( $entry1, # first aligned entry + $entry2, # second aligned entry + ) = @_; + + # returns float + + my ( $seq1, $seq2, $len1, $len2, $i, $match_tot, $min, $sim ); + + $seq1 = $entry1->[ SEQ ]; + $seq2 = $entry2->[ SEQ ]; + + # $seq1 =~ tr/-//d; + # $seq2 =~ tr/-//d; + + $seq1 =~ s/^-*//; + $seq2 =~ s/^-*//; + $seq1 =~ s/-*$//; + $seq2 =~ s/-*$//; + + $len1 = length $seq1; + $len2 = length $seq2; + + return 0 if $len1 == 0 or $len2 == 0; + + $match_tot = 0; + + for ( $i = 0; $i < $len1; $i++ ) { + $match_tot++ if substr( $entry1->[ SEQ ], $i, 1 ) eq substr( $entry2->[ SEQ ], $i, 1 ); + } + + $min = &Maasha::Calc::min( $len1, $len2 ); + + $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 ); + + return $sim; +} + + +sub align_tile +{ + # Martin A. Hansen, February 2008. + + # Tile a list of query sequences agains a reference sequence, + # using pairwise alignments. The result is returned as a list of + # aligned FASTA entries. + + my ( $ref_entry, # reference entry as [ HEAD, SEQ ] tuple + $q_entries, # list of [ HEAD, SEQ ] tuples + $args, # argument hash + ) = @_; + + # Returns a list. + + my ( $entry, $seq1, $seq2, $type, $align1, $align2, $sim1, $sim2, $gaps, @entries ); + + $args->{ "identity" } ||= 70; + + foreach $entry ( @{ $q_entries } ) + { + $seq1 = $entry->[ SEQ ]; + + $type = &Maasha::Seq::seq_guess_type( $seq1 ); + + if ( $type eq "rna" ) { + $seq2 = &Maasha::Seq::rna_revcomp( $seq1 ); + } elsif ( $type eq "dna" ) { + $seq2 = &Maasha::Seq::dna_revcomp( $seq1 ); + } else { + &Maasha::Common::error( qq(Bad sequence type->$type) ); + } + + $align1 = &Maasha::Align::align_muscle( [ $ref_entry, [ $entry->[ HEAD ] . "_+", $seq1 ] ], "-quiet -maxiters 1" ); + $align2 = &Maasha::Align::align_muscle( [ $ref_entry, [ $entry->[ HEAD ] . 
"_-", $seq2 ] ], "-quiet -maxiters 1" ); + + if ( $args->{ "supress_indels" } ) + { + &align_supress_indels( $align1 ); + &align_supress_indels( $align2 ); + } + + $sim1 = &Maasha::Align::align_sim_global( $align1->[ 0 ], $align1->[ 1 ] ); + $sim2 = &Maasha::Align::align_sim_global( $align2->[ 0 ], $align2->[ 1 ] ); + + if ( $sim1 < $args->{ "identity" } and $sim2 < $args->{ "identity" } ) + { + # do nothing + } + elsif ( $sim1 > $sim2 ) + { + $gaps = $align1->[ 0 ]->[ SEQ ] =~ tr/-//; + + $align1->[ 1 ]->[ SEQ ] =~ s/-{$gaps}$// if $gaps; + + $entry->[ HEAD ] = "$align1->[ 1 ]->[ HEAD ]_$sim1"; + $entry->[ SEQ ] = $align1->[ 1 ]->[ SEQ ]; + + push @entries, $entry; + } + else + { + $gaps = $align2->[ 0 ]->[ SEQ ] =~ tr/-//; + + $align2->[ 1 ]->[ SEQ ] =~ s/-{$gaps}$// if $gaps; + + $entry->[ HEAD ] = "$align2->[ 1 ]->[ HEAD ]_$sim2"; + $entry->[ SEQ ] = $align2->[ 1 ]->[ SEQ ]; + + push @entries, $entry; + } + } + + @entries = sort { $b->[ SEQ ] cmp $a->[ SEQ ] } @entries; + + unshift @entries, $ref_entry; + + return wantarray ? @entries : \@entries; +} + + +sub align_supress_indels +{ + # Martin A. Hansen, June 2008. + + # Given a pairwise alignment, removes + # indels in the first sequence AND corresponding + # sequence in the second. + + my ( $align, # pairwise alignment + ) = @_; + + # Returns nothing + + my ( $count, $seq, $i ); + + $count = $align->[ 0 ]->[ SEQ ] =~ tr/-//; + + if ( $count > 0 ) + { + for ( $i = 0; $i < length $align->[ 0 ]->[ SEQ ]; $i++ ) + { + if ( substr( $align->[ 0 ]->[ SEQ ], $i, 1 ) ne '-' ) { + $seq .= substr( $align->[ 1 ]->[ SEQ ], $i, 1 ); + } + + } + + $align->[ 0 ]->[ SEQ ] =~ tr/-//d; + $align->[ 1 ]->[ SEQ ] = $seq; + } +} + + +sub align_invert +{ + # Martin A. Hansen, February 2008. + + # Invert an alignment in such a way that only + # residues differing from the first sequence (the reference sequence) + # are shown. The matching sequence can either be lowercased (soft) or replaced + # with _. 
+ + my ( $entries, # list of FASTA entries. + $soft, + ) = @_; + + # Returns nothing. + + my ( $i, $c, $char1, $char2 ); + + map { $_->[ SEQ ] =~ tr/-/_/ } @{ $entries }; + + for ( $i = 0; $i < length $entries->[ 0 ]->[ SEQ ]; $i++ ) + { + $char1 = uc substr $entries->[ 0 ]->[ SEQ ], $i, 1; + + for ( $c = 1; $c < @{ $entries }; $c++ ) + { + $char2 = uc substr $entries->[ $c ]->[ SEQ ], $i, 1; + + if ( $char1 eq $char2 ) + { + if ( $soft ) { + substr $entries->[ $c ]->[ SEQ ], $i, 1, lc $char2; + } else { + substr $entries->[ $c ]->[ SEQ ], $i, 1, "-"; + } + } + } + } +}
+
+
+{
+    # The DATA filehandle can only be read once, so the parsed matrix is
+    # kept here and reused by every later call to blosum_read.
+
+    my $blosum_cache;
+
+    sub blosum_read
+    {
+        # Martin A. Hansen, January 2006.
+
+        # this routine parses the BLOSUM62 matrix,
+        # which is located in the __DATA__ section
+
+        # The first non-comment line holds the column residues; each
+        # following line holds a row residue followed by one score per
+        # column. Parsing stops at a line starting with __END__.
+
+        # returns HoH ( $HoH->{ row residue }->{ column residue } = score )
+
+        my ( @lines, @chars, $i, $c, @list );
+
+        if ( not $blosum_cache )
+        {
+            $blosum_cache = {};
+
+            @lines = <DATA>;
+            @lines = grep { $_ !~ /^$|^#/ } @lines;
+
+            # split " " skips leading whitespace, so the column residues
+            # start at index 0 whether or not the header line is indented.
+            @chars = split " ", $lines[ 0 ];
+
+            $i = 1;
+
+            while( $lines[ $i ] )
+            {
+                last if $lines[ $i ] =~ /^__END__/;
+
+                @list = split " ", $lines[ $i ];
+
+                # $list[ 0 ] is the row residue, so score $c belongs to column $c - 1.
+                for ( $c = 1; $c < @list; $c++ ) {
+                    $blosum_cache->{ $list[ 0 ] }->{ $chars[ $c - 1 ] } = $list[ $c ];
+                }
+
+                $i++;
+            }
+        }
+
+        return wantarray ? %{ $blosum_cache } : $blosum_cache;
+    }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__DATA__ + + +# Matrix made by matblas from blosum62.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 62 +# Entropy = 0.6979, Expected = -0.5209 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 +N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 +C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 +B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 +Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1
-1 -4 +* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 + + +__END__ diff --git a/code_perl/Maasha/AlignTwoSeq.pm b/code_perl/Maasha/AlignTwoSeq.pm new file mode 100644 index 0000000..7d18643 --- /dev/null +++ b/code_perl/Maasha/AlignTwoSeq.pm @@ -0,0 +1,1039 @@ +package Align; + +# Copyright (C) 2007 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# yak yak yak + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; +use Data::Dumper; +use Storable qw( dclone ); +use IPC::Open2; +use Maasha::Calc; +use Maasha::Seq; +use vars qw ( @ISA @EXPORT ); + +use constant { + Q_BEG => 0, + Q_END => 1, + S_BEG => 2, + S_END => 3, + LEN => 4, + SCORE => 5, + HEAD => 0, + SEQ => 1, +}; + +@ISA = qw( Exporter ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub align_two_seq +{ + # Martin A. Hansen, August 2006. + + # Generates an alignment by chaining matches, which are subsequences + # shared between two sequences. The routine functions by considering + # only matches within a given search space. 
If no matches are given + # these will be generated, if long matches are found these will be + # included in the alignment, otherwise matches will be included depending + # on a calculated score. New search spaces spanning the spaces between + # matches and the search space boundaries will be cast and recursed into. + + my ( $q_seq, # sequence 1 ref + $s_seq, # sequence 2 ref + $matches, # list of matches + $q_min, # q sequence start position + $q_max, # q sequecne stop position + $s_min, # s sequence start position + $s_max, # s sequecne stop position + $args, # argument hash + ) = @_; + + # returns a chain of matches that can be chained into an alignment + + $matches ||= []; + $q_min ||= 0; + $s_min ||= 0; + $q_max ||= length( ${ $q_seq } ) - 1; + $s_max ||= length( ${ $s_seq } ) - 1; + + $args->{ "long_matches" } ||= 50; + $args->{ "alph_len" } ||= 4; + + my ( $wordsize, @chain, $match, $best_match, @long_matches ); + + $matches = &select_matches( $matches, $q_min, $q_max, $s_min, $s_max ); + + if ( scalar @{ $matches } == 0 ) # no matches - find some! + { + # $wordsize = &find_wordsize( $q_min, $q_max, $s_min, $s_max, $args ); + $wordsize = 4; + $matches = &find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max ); + + while ( scalar @{ $matches } == 0 and $wordsize > 1 ) + { + $wordsize--; + $matches = &find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max ); + } + + if ( scalar @{ $matches } > 0 ) { + push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args ); + } + } + elsif ( @long_matches = grep { $_->[ LEN ] >= $args->{ "long_matches" } } @{ $matches } ) # long matches found - include all that don't overlap! 
+ { + @long_matches = &order_matches( \@long_matches ); + + foreach $match ( @long_matches ) + { + push @chain, $match; + + if ( $match->[ Q_BEG ] - $q_min >= 2 and $match->[ S_BEG ] - $s_min >= 2 ) { + push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $q_min, $match->[ Q_BEG ] - 1, $s_min, $match->[ S_BEG ] - 1, $args ); # intermediate search space + } + + $q_min = $match->[ Q_END ] + 1; + $s_min = $match->[ S_END ] + 1; + } + + if ( $q_min + 1 < $q_max and $s_min + 1 < $s_max ) { + push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args ); # remaining search space + } + } + else # shorter matches are included according to score + { + foreach $match ( @{ $matches } ) { + # $match->[ SCORE ] = &score_match( $match, $q_min, $q_max, $s_min, $s_max ); + $match->[ SCORE ] = &score_match_niels( $match, $q_seq, $q_min, $q_max, $s_min, $s_max ); + } + + # @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches }; + @{ $matches } = grep { $_->[ SCORE ] <= 0.25 } @{ $matches }; + # @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches }; + @{ $matches } = sort { $a->[ SCORE ] <=> $b->[ SCORE ] } @{ $matches }; + + $best_match = shift @{ $matches }; + + if ( $best_match ) + { + push @chain, $best_match; + + if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) { + push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1, $args ); # left search space + } + + if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) { + push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max, $args ); # right search space + } + } + } + + return wantarray ? @chain : \@chain; +} + + +sub select_matches +{ + # Martin A. Hansen, August 2006. 
+ + # Given a list of matches and a search space, + # include only those matches contained within + # this search space. + + my ( $matches, # list of matches + $q_min, # q sequence start position + $q_max, # q sequecne stop position + $s_min, # s sequence start position + $s_max, # s sequecne stop position + ) = @_; + + # returns list of matches + + my ( @matches ); + + @matches = grep { $_->[ Q_BEG ] >= $q_min and + $_->[ S_BEG ] >= $s_min and + $_->[ Q_END ] <= $q_max and + $_->[ S_END ] <= $s_max } @{ $matches }; + + return wantarray ? @matches : \@matches; +} + + +sub order_matches +{ + # Martin A. Hansen, October 2006 + + # given a list of long matches, order these by length and position + # and include only those long matches that does not cross. + + my ( $long_matches, # list of matches + ) = @_; + + # returns a list of matches + + my ( @matches, $match, $i ); + + @{ $long_matches } = sort { $b->[ LEN ] <=> $a->[ LEN ] } @{ $long_matches }; + + @matches = shift @{ $long_matches }; + + foreach $match ( @{ $long_matches } ) + { + if ( $match->[ Q_END ] < $matches[ 0 ]->[ Q_BEG ] and $match->[ S_END ] < $matches[ 0 ]->[ S_BEG ] ) + { + unshift @matches, $match; + } + elsif ( $match->[ Q_BEG ] > $matches[ -1 ]->[ Q_END ] and $match->[ S_BEG ] > $matches[ -1 ]->[ S_END ] ) + { + push @matches, $match; + } + else + { + for ( $i = 1; $i < @matches; $i++ ) + { + if ( $matches[ $i - 1 ]->[ Q_END ] < $match->[ Q_BEG ] and $match->[ Q_END ] < $matches[ $i ]->[ Q_BEG ] and + $matches[ $i - 1 ]->[ S_END ] < $match->[ S_BEG ] and $match->[ S_END ] < $matches[ $i ]->[ S_BEG ] + ) + { + splice @matches, $i, 0, dclone $match; + last; + } + } + } + } + + return wantarray ? @matches : \@matches; +} + + +sub find_wordsize +{ + # Martin A. Hansen, August 2006. + + # Given a search space calculates the wordsize for a word so a match + # occurs only a few times. More matches may be needed at low similarity in + # order to avoid starting with a wrong match. 
+ + my ( $q_min, # q sequence start position + $q_max, # q sequecne stop position + $s_min, # s sequence start position + $s_max, # s sequecne stop position + $args, # argument hash + ) = @_; + + # returns integer + + my ( $q_dim, $s_dim, $dim_min, $wordsize ); + + $q_dim = $q_max - $q_min + 1; + $s_dim = $s_max - $s_min + 1; + + $dim_min = &Maasha::Calc::min( $q_dim, $s_dim ); + + $wordsize = 1; + + if ( $dim_min > 2000000 ) # optimized for DNA + { + $wordsize = $args->{ "long_matches" }; + } + elsif ( $dim_min > 100000 ) # optimized for DNA + { + $wordsize = int( $args->{ "long_matches" } / 2 ); + } + elsif ( $q_dim > 100 or $s_dim > 100 ) # optimized for DNA + { + while ( $args->{ "alph_len" } ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) { + $wordsize++; + } + } + else + { + while ( $args->{ "alph_len" } ** $wordsize <= $dim_min and $wordsize < $dim_min ) { + $wordsize++; + } + } + + return $wordsize; +} + + +sub find_matches +{ + # Martin A. Hansen, November 2006 + + # given two sequences, find all maximum expanded matches between these + + my ( $q_seq, # sequence 1 + $s_seq, # sequence 2 + $wordsize, # word size + $q_min, # q sequence start position + $q_max, # q sequecne stop position + $s_min, # s sequence start position + $s_max, # s sequecne stop position + ) = @_; + + # returns list of matches + + my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches ); + + if ( length ${ $s_seq } > length ${ $q_seq } ) + { + for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ ) + { + $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize; + + next if $q_word =~ /n/i; # DNA/genome optimization + + push @{ $word_hash{ $q_word } }, $q_beg; + } + + for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ ) + { + $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize; + + if ( exists $word_hash{ $s_word } ) + { + foreach $q_beg ( @{ $word_hash{ $s_word } } ) + { + $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize 
- 1 ]; + + if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and + $match->[ Q_END ] <= $_->[ Q_END ] and + $match->[ S_BEG ] >= $_->[ S_BEG ] and + $match->[ S_END ] <= $_->[ S_END ] } @matches ) + { + next; # match is redundant + } + else + { + $match = &expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min ); + $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1; + + push @matches, $match; + } + } + } + } + } + else + { + for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ ) + { + $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize; + + next if $s_word =~ /n/i; # DNA/genome optimization + + push @{ $word_hash{ $s_word } }, $s_beg; + } + + for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ ) + { + $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize; + + if ( exists $word_hash{ $q_word } ) + { + foreach $s_beg ( @{ $word_hash{ $q_word } } ) + { + $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ]; + + if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and + $match->[ Q_END ] <= $_->[ Q_END ] and + $match->[ S_BEG ] >= $_->[ S_BEG ] and + $match->[ S_END ] <= $_->[ S_END ] } @matches ) + { + next; # match is redundant + } + else + { + $match = &expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min ); + $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1; + + push @matches, $match; + } + } + } + } + } + + return wantarray ? @matches : \@matches; +} + + +sub expand_match +{ + # Martin A. Hansen, August 2006. + + # Given two sequences and a match, expand the match maximally. 
+ # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ] + + my ( $q_seq, # sequence 1 ref + $s_seq, # sequence 2 ref + $match, # sequence match + $q_max, # q sequecne stop position + $q_min, # q sequence start position + $s_max, # s sequecne stop position + $s_min, # s sequence start position + ) = @_; + + # returns match + + my ( $q_pos, $s_pos ); + + # expanding forward + + $q_pos = $match->[ Q_END ] + 1; + $s_pos = $match->[ S_END ] + 1; + + while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) ) + { + $q_pos++; + $s_pos++; + } + + $match->[ Q_END ] = $q_pos - 1; + $match->[ S_END ] = $s_pos - 1; + + # expanding backwards + + $q_pos = $match->[ Q_BEG ] - 1; + $s_pos = $match->[ S_BEG ] - 1; + + while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) ) + { + $q_pos--; + $s_pos--; + } + + $match->[ Q_BEG ] = $q_pos + 1; + $match->[ S_BEG ] = $s_pos + 1; + + return $match; +} + + +sub score_match +{ + # Martin A. Hansen, August 2006 + + # given a match and a search space scores the match according to three criteria: + + # 1) length of match - the longer the better. + # 2) distance to closest corner - the shorter the better. + # 3) distance to closest narrow end of the search space - the shorter the better. + + # each of these scores are divided by search space dimentions, and the total score + # is calculated: total = score_len - score_corner - score_narrow + + # the higher the score, the better the match. 
+ + my ( $match, # match + $q_min, # q sequence start position + $q_max, # q sequecne stop position + $s_min, # s sequence start position + $s_max, # s sequecne stop position + ) = @_; + + # returns a positive number + + my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist, + $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist ); + + # ----- 1) scoring according to match length + + $score_len = $match->[ LEN ] ** 3; + + # ---- 2) score according to distance away from diagonal + + $q_dim = $q_max - $q_min + 1; + $s_dim = $s_max - $s_min + 1; + + if ( $q_dim >= $s_dim ) # s_dim is the narrow end + { + $beg_diag_dist = &Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim ); + $end_diag_dist = &Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max ); + } + else + { + $beg_diag_dist = &Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim ); + $end_diag_dist = &Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max ); + } + + $min_diag_dist = &Maasha::Calc::min( $beg_diag_dist, $end_diag_dist ); + + $score_diag = 2 * $min_diag_dist ** 2; + + # ----- 3) scoring according to distance to the narrow end of the search space + + if ( $q_dim > $s_dim ) # s_dim is the narrow end + { + $beg_narrow_dist = $match->[ Q_BEG ] - $q_min; + $end_narrow_dist = $q_max - $match->[ Q_BEG ]; + + $max_narrow_dist = &Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist ); + } + elsif ( $q_dim < $s_dim ) + { + $beg_narrow_dist = $match->[ S_BEG ] - $s_min; + $end_narrow_dist = $s_max - $match->[ S_BEG ]; + + $max_narrow_dist = &Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist ); + } + else + { + $max_narrow_dist = 0; + } + + $score_narrow = 
$max_narrow_dist; + + $score_total = $score_len - $score_narrow - $score_diag; + # $score_total = -1 if 3 * $min_diag_dist > $match->[ LEN ]; + + return $score_total; +} + + +sub score_match_niels +{ + # Niels Larsen, June 2004. + + # Creates a crude "heuristic" attempt of telling how likely it is that a + # given match occurs by chance in a given search space. If sequences are + # given their composition is taken into account. The scoring punishes + # distance from diagonal(s) and distance from previous match(es). Scores + # range from zero and up, and lower is better. + + my ( $match, # Match array + $q_seq, # Either q_seq or s_seq + $q_min, # Lower bound search area (query sequence) + $q_max, # Upper bound search area (query sequence) + $s_min, # Lower bound search area (subject sequence) + $s_max, # Upper bound search area (subject sequence) + ) = @_; + + # Returns a positive number. + + my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos, + $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i, + $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch, + $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta, + $delta_max ); + + $q_beg = $match->[Q_BEG]; + $q_end = $match->[Q_END]; + $s_beg = $match->[S_BEG]; + $s_end = $match->[S_END]; + + # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<< + + # Get the probability of a match from the sequence composition (when + # match is longer than 20 and sequence is given, otherwise use 0.25) + # and raise that to the power of the length. 
+ + if ( $match->[LEN] > 20 and defined $q_seq ) + { + $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1; + $seqlen = length $seq; + + $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0; + + for ( $i = 0; $i < $seqlen; $i++ ) + { + $chars{ substr $seq, $i, 1 }++; + } + + $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2 + + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2; + } + else { + $pmatch = 0.25; + } + + $score = $pmatch ** ( $q_end - $q_beg + 1 ); + +# # Punish by difference in height and width of search space, + +# $q_dim = $q_max - $q_min + 1; +# $s_dim = $s_max - $s_min + 1; + +# if ( $q_dim != $s_dim ) { +# $score *= abs ( $q_dim - $s_dim ) ** 2; +# } + + return $score if $score > 0.25; + + # Punish by how far the match is to the closest corner of the search + # space, + + $q_delta_beg = $q_beg - $q_min; + $s_delta_beg = $s_beg - $s_min; + + $q_delta_end = $q_max - $q_end; + $s_delta_end = $s_max - $s_end; + + if ( $q_delta_beg > $s_delta_beg ) { + $delta_beg_max = $q_delta_beg; + } else { + $delta_beg_max = $s_delta_beg; + } + + if ( $q_delta_end > $s_delta_end ) { + $delta_end_max = $q_delta_end; + } else { + $delta_end_max = $s_delta_end; + } + + if ( $delta_beg_max <= $delta_end_max ) { + $score *= ($delta_beg_max+1) ** 2.0; + } else { + $score *= ($delta_end_max+1) ** 2.0; + } + + return $score if $score > 0.25; + + # Add penalty if the match is towards the narrow end of the + # search space, + + if ( ($q_max - $q_min) <= ($s_max - $s_min) ) + { + if ( $q_delta_beg > $s_delta_beg ) + { + $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3; + } + elsif ( $q_delta_end > $s_delta_end ) + { + $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3; + } + } + else + { + if ( $s_delta_beg > $q_delta_beg ) + { + $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3; + } + elsif ( $s_delta_end > $q_delta_end ) + { + $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3; + } + } + + if ( $score < 0 ) { + print STDERR "q_min, q_max, s_min, s_max: 
$q_min, $q_max, $s_min, $s_max\n"; + die qq (Score <= 0 -> $score); + } + + return $score; +} + + +sub print_alignment +{ + # Martin A. Hansen, August 2006. + + # Routine to print an alignment in fasta format based + # on a list of matches and two given sequences. + + my ( $matches, # list of matches + $q_head, # query sequence head + $q_seq, # query sequence ref + $s_head, # subject sequence head + $s_seq, # subject sequence ref + ) = @_; + + my ( $q_pos, $s_pos, $q_nomatch, $q_match, $s_nomatch, $match, $q_aseq, $s_aseq, $i ); + + @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches }; + + $q_pos = 0; + $s_pos = 0; + + for ( $i = 0; $i < @{ $matches }; $i++ ) + { + $match = $matches->[ $i ]; + + $q_nomatch = $match->[ Q_BEG ] - $q_pos; + $s_nomatch = $match->[ S_BEG ] - $s_pos; + + if ( $q_nomatch - $s_nomatch > 0 ) + { + $s_aseq .= "-" x ( $q_nomatch - $s_nomatch ); + $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ]; + $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ]; + } + elsif ( $s_nomatch - $q_nomatch > 0 ) + { + $q_aseq .= "-" x ( $s_nomatch - $q_nomatch ); + $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ]; + $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ]; + } + else + { + $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ]; + $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ]; + } + + $q_pos += $q_nomatch + $match->[ LEN ]; + $s_pos += $s_nomatch + $match->[ LEN ]; + } + + $match = $matches->[ -1 ] || [ 0, 0, 0, 0, 0 ]; + + $q_nomatch = length( ${ $q_seq } ) - $match->[ Q_END ]; + $s_nomatch = length( ${ $s_seq } ) - $match->[ S_END ]; + + if ( $q_nomatch - $s_nomatch > 0 ) + { + $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ]; + $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ]; + $s_aseq .= "-" x ( $q_nomatch - $s_nomatch ); + } + elsif ( $s_nomatch - $q_nomatch > 0 ) + { + $q_aseq .= substr 
${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ]; + $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ]; + $q_aseq .= "-" x ( $s_nomatch - $q_nomatch ); + } + else + { + $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ]; + $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ]; + } + + print ">$q_head\n$q_aseq\n>$s_head\n$s_aseq\n"; +} + + +sub print_matches +{ + # Martin A. Hansen, February 2007. + + my ( $matches, # list of matches + $q_head, # query sequence head + $q_seq, # query sequence ref + $s_head, # subject sequence head + $s_seq, # subject sequence ref + $args, # argument hash - OPTIONAL + ) = @_; + + $args->{ "wrap" } ||= 80; + + my ( $q_pos, $s_pos, $match, $q_nomatch, $q_match, $s_nomatch, $q_aseq, $s_aseq, $pins, $i, $q, $s, $q_ruler, $s_ruler, $entries ); + + @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches }; + + $q_pos = 0; + $s_pos = 0; + + for ( $i = 0; $i < @{ $matches }; $i++ ) + { + $match = $matches->[ $i ]; + + $q_nomatch = $match->[ Q_BEG ] - $q_pos; + $s_nomatch = $match->[ S_BEG ] - $s_pos; + + $q = $q_pos; + $s = $s_pos; + + if ( $q_nomatch - $s_nomatch > 0 ) + { + $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch ); + $s_aseq .= "-" x ( $q_nomatch - $s_nomatch ); + $pins .= " " x ( $q_nomatch - $s_nomatch ); + $q += ( $q_nomatch - $s_nomatch ); + } + elsif ( $s_nomatch - $q_nomatch > 0 ) + { + $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch ); + $q_aseq .= "-" x ( $s_nomatch - $q_nomatch ); + $pins .= " " x ( $s_nomatch - $q_nomatch ); + $s += ( $s_nomatch - $q_nomatch ); + } + + while ( $q < $q_pos + $q_nomatch and $s < $s_pos + $s_nomatch ) + { + $q_aseq .= substr ${ $q_seq }, $q, 1; + $s_aseq .= substr ${ $s_seq }, $s, 1; + + if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) ) + { + $pins .= ":"; + } else { + $pins .= " "; + } + + $q++; + $s++; + } + + $q_aseq .= substr ${ $q_seq }, $match->[ Q_BEG ], $match->[ LEN ]; 
+ $s_aseq .= substr ${ $s_seq }, $match->[ S_BEG ], $match->[ LEN ]; + $pins .= "|" x $match->[ LEN ]; + + $q_pos += $q_nomatch + $match->[ LEN ]; + $s_pos += $s_nomatch + $match->[ LEN ]; + } + + $q_nomatch = length( ${ $q_seq } ) - ( $match->[ Q_END ] || 0 ); + $s_nomatch = length( ${ $s_seq } ) - ( $match->[ S_END ] || 0 ); + + $q = $q_pos; + $s = $s_pos; + + while ( $q < $q_pos + $q_nomatch and $q < length ${ $q_seq } and $s < $s_pos + $s_nomatch and $s < length ${ $s_seq } ) + { + $q_aseq .= substr ${ $q_seq }, $q, 1; + $s_aseq .= substr ${ $s_seq }, $s, 1; + + if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) ) { + $pins .= ":"; + } else { + $pins .= " "; + } + + $q++; + $s++; + $q_pos++; + $s_pos++; + } + + if ( $q_nomatch - $s_nomatch > 0 ) + { + $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch ); + $s_aseq .= "-" x ( $q_nomatch - $s_nomatch ); + $pins .= " " x ( $q_nomatch - $s_nomatch ); + } + elsif ( $s_nomatch - $q_nomatch > 0 ) + { + $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch ); + $q_aseq .= "-" x ( $s_nomatch - $q_nomatch ); + $pins .= " " x ( $s_nomatch - $q_nomatch ); + } + + $q_ruler = &make_ruler( $q_aseq ); + $s_ruler = &make_ruler( $s_aseq ); + + $entries = [ + [ "ruler", $q_ruler ], + [ $q_head, $q_aseq ], + [ "consensus", $pins ], + [ $s_head, $s_aseq ], + [ "ruler", $s_ruler ], + ]; + + &align_print_multi( $entries, undef, $args->{ "wrap" } ) +} + + +sub make_ruler +{ + # Martin A. Hansen, February 2007; + + # Gererates a ruler for a given sequence (with indels). 
+ + my ( $seq + ) = @_; + + # Returns string + + my ( $i, $char, $skip, $count, $gap, $tics ); + + $gap = 0; + $i = 1; + + while ( $i <= length $seq ) + { + $char = substr $seq, $i - 1, 1; + + $gap++ if $char eq "-"; + + if ( $skip ) + { + $skip--; + } + else + { + $count = $i - $gap; + $count = 1 if $char eq "-"; + + if ( $count % 100 == 0 ) + { + if ( $count + length( $count ) >= length $seq ) + { + $tics .= "|"; + } + else + { + $tics .= "|" . $count; + $skip = length $count; + } + } + elsif ( $count % 50 == 0 ) { + $tics .= ":"; + } elsif ( $count % 10 == 0 ) { + $tics .= "."; + } else { + $tics .= " "; + } + } + + $i++; + } + + return $tics; +} + + +sub align_sim_local +{ + # Martin A. Hansen, May 2007. + + # Calculate the local similarity of an alignment based on + # an alignment chain. The similarity is calculated as + # the number of matching residues divided by the overall + # length of the alignment chain. This means that a short + # but "good" alignment will yield a high similarity, while + # a long "poor" alignment will yeild a low similarity. + + my ( $chain, # list of matches in alignment + ) = @_; + + # returns a float + + my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim ); + + return 0 if not @{ $chain }; + + $match_tot = 0; + $q_end = 0; + $s_end = 0; + $q_beg = 999999999; + $s_beg = 999999999; + + foreach $match ( @{ $chain } ) + { + $match_tot += $match->[ LEN ]; + + $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg; + $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg; + + $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end; + $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end; + } + + $q_diff = $q_end - $q_beg + 1; + $s_diff = $s_end - $s_beg + 1; + + $max = &Maasha::Calc::max( $q_diff, $s_diff ); + + $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 ); + + return $sim; +} + + +sub align_sim_global +{ + # Martin A. Hansen, June 2007. 
+ + # Calculate the global similarity of an alignment based on + # an alignment chain. The similarity is calculated as + # the number of matching residues divided by the + # length of the shortest sequence. + + my ( $chain, # list of matches in alignment + $q_seq, # ref to query sequence + $s_seq, # ref to subject sequence + ) = @_; + + # returns a float + + my ( $match_tot, $min, $sim ); + + return 0 if not @{ $chain }; + + $match_tot = 0; + + map { $match_tot += $_->[ LEN ] } @{ $chain }; + + $min = &Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) ); + + $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 ); + + return $sim; +} + + +sub align_consensus +{ + # Martin A. Hansen, June 2006. + + # Given an alignment as a list of FASTA entries, + # generates a consensus sequences based on the + # entropies for each column similar to the way + # a sequence logo i calculated. Returns the + # consensus sequence as a FASTA entry. + + my ( $entries, # list of aligned FASTA entries + $type, # residue type - OPTIONAL + $min_sim, # minimum similarity - OPTIONAL + ) = @_; + + # Returns tuple + + my ( $bit_max, $data, $pos, $char, $score, $entry ); + + $type ||= &Maasha::Seq::seq_guess_type( $entries->[ 0 ] ); + $min_sim ||= 50; + + if ( $type =~ /protein/ ) { + $bit_max = 4; + } else { + $bit_max = 2; + } + + $data = &Maasha::Seq::seqlogo_calc( $bit_max, $entries ); + + foreach $pos ( @{ $data } ) + { + ( $char, $score ) = @{ $pos->[ -1 ] }; + + if ( ( $score / $bit_max ) * 100 >= $min_sim ) { + $entry->[ SEQ ] .= $char; + } else { + $entry->[ SEQ ] .= "-"; + } + } + + $entry->[ HEAD ] = "Consensus: $min_sim%"; + + return wantarray ? 
@{ $entry } : $entry; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/Berkeley_DB.pm b/code_perl/Maasha/Berkeley_DB.pm new file mode 100644 index 0000000..be6639c --- /dev/null +++ b/code_perl/Maasha/Berkeley_DB.pm @@ -0,0 +1,69 @@ +package Maasha::Berkeley_DB; + + +# Copyright (C) 2007-2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines for Berkeley DB manipulation. + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; +use Data::Dumper; +use Maasha::Common; +use DB_File; + +use vars qw( @ISA @EXPORT_OK ); + +require Exporter; + +@ISA = qw( Exporter ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub db_init +{ + # Martin A. Hansen, May 2008. + + # Initializes a Berkeley DB tied to a Perl hash. + + my ( $path, # path to BDB file. + ) = @_; + + # Returns hashref + + my ( %hash ); + + tie %hash, "DB_File", $path or &Maasha::Common::error( "Could not tie-open DB file '$path': $!" ); + + return wantarray ? 
%hash : \%hash; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +1; diff --git a/code_perl/Maasha/Biotools.pm b/code_perl/Maasha/Biotools.pm new file mode 100644 index 0000000..50d7e9b --- /dev/null +++ b/code_perl/Maasha/Biotools.pm @@ -0,0 +1,5885 @@ +package Maasha::Biotools; + + +# Copyright (C) 2007-2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines for manipulation, parsing and emitting of human/machine readable biotool records. 
+ + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; +use Data::Dumper; +use Getopt::Long qw( :config bundling ); +use Time::HiRes qw( gettimeofday ); +use Storable qw( dclone ); +use Maasha::Config; +use Maasha::Common; +use Maasha::Fasta; +use Maasha::Align; +use Maasha::Matrix; +use Maasha::Match; +use Maasha::EMBL; +use Maasha::Stockholm; +use Maasha::Seq; +use Maasha::Patscan; +use Maasha::Plot; +use Maasha::Calc; +use Maasha::UCSC; +use Maasha::NCBI; +use Maasha::GFF; +use Maasha::TwoBit; +use Maasha::Solid; +use Maasha::SQL; + +use vars qw( @ISA @EXPORT_OK ); + +require Exporter; + +@ISA = qw( Exporter ); + +@EXPORT_OK = qw( + read_stream + write_stream + get_record + put_record +); + +use constant { + SEQ_NAME => 0, + SEQ => 1, +}; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SIGNAL HANDLER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +$SIG{ '__DIE__' } = \&sig_handler; +$SIG{ 'INT' } = \&sig_handler; +$SIG{ 'TERM' } = \&sig_handler; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $script, $TMP_DIR ); + +$script = &Maasha::Common::get_scriptname(); +$TMP_DIR = &Maasha::Common::get_tmpdir(); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> LOG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my $log_fh = &Maasha::Common::append_open( $ENV{ "LOG_DIR" } . "/biopieces.log" ); + +$log_fh->autoflush( 1 ); + +&log( $log_fh, $script, \@ARGV ); + +close $log_fh; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my $t0 = gettimeofday(); + +&run_script( $script ); + +my $t1 = gettimeofday(); + +print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub log +{ + # Martin A. 
Hansen, January 2008. + + # Log messages to logfile. + + my ( $fh, # filehandle to logfile + $script, # script name + $argv, # reference to @ARGV + ) = @_; + + # Returns nothing. + + my ( $time_stamp, $user ); + + $time_stamp = &Maasha::Common::time_stamp(); + + $user = $ENV{ "USER" }; + + $script = "biopieces" if $script eq "-e"; + + print $fh "$time_stamp\t$user\t$script ", join( " ", @{ $argv } ), "\n"; +} + + +sub run_script +{ + # Martin A. Hansen, August 2007. + + # Run a specific script. + + my ( $script, # script name + ) = @_; + + # Returns nothing. + + my ( $options, $in, $out ); + + &script_list_biotools( $ENV{ 'INST_DIR'} . "/biotools/usage/" ) if $script eq "list_biotools"; + + &script_print_usage( $ENV{ 'INST_DIR'} . "/biotools/usage/$script" ) if -t STDIN and not @ARGV; + + $options = &get_options( $script ); + + $in = &read_stream( $options->{ "stream_in" } ); + $out = &write_stream( $options->{ "stream_out" } ); + + if ( $script eq "read_fasta" ) { &script_read_fasta( $in, $out, $options ) } + elsif ( $script eq "read_align" ) { &script_read_align( $in, $out, $options ) } + elsif ( $script eq "read_tab" ) { &script_read_tab( $in, $out, $options ) } + elsif ( $script eq "read_psl" ) { &script_read_psl( $in, $out, $options ) } + elsif ( $script eq "read_bed" ) { &script_read_bed( $in, $out, $options ) } + elsif ( $script eq "read_blast_tab" ) { &script_read_blast_tab( $in, $out, $options ) } + elsif ( $script eq "read_embl" ) { &script_read_embl( $in, $out, $options ) } + elsif ( $script eq "read_stockholm" ) { &script_read_stockholm( $in, $out, $options ) } + elsif ( $script eq "read_phastcons" ) { &script_read_phastcons( $in, $out, $options ) } + elsif ( $script eq "read_soft" ) { &script_read_soft( $in, $out, $options ) } + elsif ( $script eq "read_gff" ) { &script_read_gff( $in, $out, $options ) } + elsif ( $script eq "read_2bit" ) { &script_read_2bit( $in, $out, $options ) } + elsif ( $script eq "read_solexa" ) { &script_read_solexa( $in, $out, 
$options ) } + elsif ( $script eq "read_solid" ) { &script_read_solid( $in, $out, $options ) } + elsif ( $script eq "read_mysql" ) { &script_read_mysql( $in, $out, $options ) } + elsif ( $script eq "count_seq" ) { &script_count_seq( $in, $out, $options ) } + elsif ( $script eq "length_seq" ) { &script_length_seq( $in, $out, $options ) } + elsif ( $script eq "uppercase_seq" ) { &script_uppercase_seq( $in, $out, $options ) } + elsif ( $script eq "shuffle_seq" ) { &script_shuffle_seq( $in, $out, $options ) } + elsif ( $script eq "analyze_seq" ) { &script_analyze_seq( $in, $out, $options ) } + elsif ( $script eq "analyze_tags" ) { &script_analyze_tags( $in, $out, $options ) } + elsif ( $script eq "complexity_seq" ) { &script_complexity_seq( $in, $out, $options ) } + elsif ( $script eq "oligo_freq" ) { &script_oligo_freq( $in, $out, $options ) } + elsif ( $script eq "create_weight_matrix" ) { &script_create_weight_matrix( $in, $out, $options ) } + elsif ( $script eq "calc_bit_scores" ) { &script_calc_bit_scores( $in, $out, $options ) } + elsif ( $script eq "reverse_seq" ) { &script_reverse_seq( $in, $out, $options ) } + elsif ( $script eq "complement_seq" ) { &script_complement_seq( $in, $out, $options ) } + elsif ( $script eq "remove_indels" ) { &script_remove_indels( $in, $out, $options ) } + elsif ( $script eq "transliterate_seq" ) { &script_transliterate_seq( $in, $out, $options ) } + elsif ( $script eq "transliterate_vals" ) { &script_transliterate_vals( $in, $out, $options ) } + elsif ( $script eq "translate_seq" ) { &script_translate_seq( $in, $out, $options ) } + elsif ( $script eq "extract_seq" ) { &script_extract_seq( $in, $out, $options ) } + elsif ( $script eq "get_genome_seq" ) { &script_get_genome_seq( $in, $out, $options ) } + elsif ( $script eq "get_genome_align" ) { &script_get_genome_align( $in, $out, $options ) } + elsif ( $script eq "get_genome_phastcons" ) { &script_get_genome_phastcons( $in, $out, $options ) } + elsif ( $script eq "fold_seq" ) { 
&script_fold_seq( $in, $out, $options ) } + elsif ( $script eq "split_seq" ) { &script_split_seq( $in, $out, $options ) } + elsif ( $script eq "split_bed" ) { &script_split_bed( $in, $out, $options ) } + elsif ( $script eq "align_seq" ) { &script_align_seq( $in, $out, $options ) } + elsif ( $script eq "tile_seq" ) { &script_tile_seq( $in, $out, $options ) } + elsif ( $script eq "invert_align" ) { &script_invert_align( $in, $out, $options ) } + elsif ( $script eq "patscan_seq" ) { &script_patscan_seq( $in, $out, $options ) } + elsif ( $script eq "create_blast_db" ) { &script_create_blast_db( $in, $out, $options ) } + elsif ( $script eq "blast_seq" ) { &script_blast_seq( $in, $out, $options ) } + elsif ( $script eq "blat_seq" ) { &script_blat_seq( $in, $out, $options ) } + elsif ( $script eq "match_seq" ) { &script_match_seq( $in, $out, $options ) } + elsif ( $script eq "create_vmatch_index" ) { &script_create_vmatch_index( $in, $out, $options ) } + elsif ( $script eq "vmatch_seq" ) { &script_vmatch_seq( $in, $out, $options ) } + elsif ( $script eq "write_fasta" ) { &script_write_fasta( $in, $out, $options, $options ) } + elsif ( $script eq "write_align" ) { &script_write_align( $in, $out, $options ) } + elsif ( $script eq "write_blast" ) { &script_write_blast( $in, $out, $options ) } + elsif ( $script eq "write_tab" ) { &script_write_tab( $in, $out, $options ) } + elsif ( $script eq "write_bed" ) { &script_write_bed( $in, $out, $options ) } + elsif ( $script eq "write_psl" ) { &script_write_psl( $in, $out, $options ) } + elsif ( $script eq "write_2bit" ) { &script_write_2bit( $in, $out, $options, $options ) } + elsif ( $script eq "write_solid" ) { &script_write_solid( $in, $out, $options, $options ) } + elsif ( $script eq "head_records" ) { &script_head_records( $in, $out, $options ) } + elsif ( $script eq "remove_keys" ) { &script_remove_keys( $in, $out, $options ) } + elsif ( $script eq "rename_keys" ) { &script_rename_keys( $in, $out, $options ) } + elsif ( 
$script eq "uniq_vals" ) { &script_uniq_vals( $in, $out, $options ) } + elsif ( $script eq "merge_vals" ) { &script_merge_vals( $in, $out, $options ) } + elsif ( $script eq "grab" ) { &script_grab( $in, $out, $options ) } + elsif ( $script eq "compute" ) { &script_compute( $in, $out, $options ) } + elsif ( $script eq "flip_tab" ) { &script_flip_tab( $in, $out, $options ) } + elsif ( $script eq "add_ident" ) { &script_add_ident( $in, $out, $options ) } + elsif ( $script eq "count_records" ) { &script_count_records( $in, $out, $options ) } + elsif ( $script eq "random_records" ) { &script_random_records( $in, $out, $options ) } + elsif ( $script eq "sort_records" ) { &script_sort_records( $in, $out, $options ) } + elsif ( $script eq "count_vals" ) { &script_count_vals( $in, $out, $options ) } + elsif ( $script eq "plot_histogram" ) { &script_plot_histogram( $in, $out, $options ) } + elsif ( $script eq "plot_lendist" ) { &script_plot_lendist( $in, $out, $options ) } + elsif ( $script eq "plot_chrdist" ) { &script_plot_chrdist( $in, $out, $options ) } + elsif ( $script eq "plot_karyogram" ) { &script_plot_karyogram( $in, $out, $options ) } + elsif ( $script eq "plot_matches" ) { &script_plot_matches( $in, $out, $options ) } + elsif ( $script eq "plot_seqlogo" ) { &script_plot_seqlogo( $in, $out, $options ) } + elsif ( $script eq "plot_phastcons_profiles" ) { &script_plot_phastcons_profiles( $in, $out, $options ) } + elsif ( $script eq "analyze_bed" ) { &script_analyze_bed( $in, $out, $options ) } + elsif ( $script eq "analyze_vals" ) { &script_analyze_vals( $in, $out, $options ) } + elsif ( $script eq "length_vals" ) { &script_length_vals( $in, $out, $options ) } + elsif ( $script eq "sum_vals" ) { &script_sum_vals( $in, $out, $options ) } + elsif ( $script eq "mean_vals" ) { &script_mean_vals( $in, $out, $options ) } + elsif ( $script eq "median_vals" ) { &script_median_vals( $in, $out, $options ) } + elsif ( $script eq "max_vals" ) { &script_max_vals( $in, $out, 
$options ) }
    elsif ( $script eq "min_vals" ) { &script_min_vals( $in, $out, $options ) }
    elsif ( $script eq "upload_to_ucsc" ) { &script_upload_to_ucsc( $in, $out, $options ) }

    close $in if defined $in;
    close $out;

    # unset status - missing
    # write log file - missing
}


sub get_options
{
    # Martin A. Hansen, February 2008.

    # Gets options from commandline and checks these vigorously.
    #
    # The Getopt::Long specifications of each script are looked up in a table;
    # a script without an entry only accepts the common stream options.
    # Comma separated list options (cols, keys, no_keys, save_keys, quals,
    # feats, frames) are split into array references, --data_in is expanded to
    # a file list stored under "files", and every value is validated before
    # the options are handed back. Validation failures are reported through
    # Maasha::Common::error.

    my ( $script,   # name of script
       ) = @_;

    # Returns hash in list context and hash reference in scalar context.

    my ( %script_options, %options, @options, $opt, $col, @genomes );

    %script_options = (
        "read_fasta"              => [ qw( data_in|i=s num|n=s ) ],
        "read_align"              => [ qw( data_in|i=s num|n=s ) ],
        "read_tab"                => [ qw( data_in|i=s delimit|d=s cols|c=s keys|k=s skip|s=s num|n=s ) ],
        "read_psl"                => [ qw( data_in|i=s num|n=s ) ],
        "read_bed"                => [ qw( data_in|i=s num|n=s ) ],
        "read_blast_tab"          => [ qw( data_in|i=s num|n=s ) ],
        "read_embl"               => [ qw( data_in|i=s num|n=s keys|k=s feats|f=s quals|q=s ) ],
        "read_stockholm"          => [ qw( data_in|i=s num|n=s ) ],
        "read_phastcons"          => [ qw( data_in|i=s num|n=s min|m=s dist|d=s threshold|t=f gap|g=s ) ],
        "read_soft"               => [ qw( data_in|i=s num|n=s ) ],
        "read_gff"                => [ qw( data_in|i=s num|n=s ) ],
        "read_2bit"               => [ qw( data_in|i=s num|n=s no_mask|N ) ],
        "read_solexa"             => [ qw( data_in|i=s num|n=s quality|q=s ) ],
        "read_solid"              => [ qw( data_in|i=s num|n=s quality|q=s ) ],
        "read_mysql"              => [ qw( database|d=s query|q=s user|u=s password|p=s ) ],
        "count_seq"               => [ qw( no_stream|x data_out|o=s ) ],
        "length_seq"              => [ qw( no_stream|x data_out|o=s ) ],
        "oligo_freq"              => [ qw( word_size|w=s all|a ) ],
        "create_weight_matrix"    => [ qw( percent|p ) ],
        "transliterate_seq"       => [ qw( search|s=s replace|r=s delete|d=s ) ],
        "transliterate_vals"      => [ qw( keys|k=s search|s=s replace|r=s delete|d=s ) ],
        "translate_seq"           => [ qw( frames|f=s ) ],
        "extract_seq"             => [ qw( beg|b=s end|e=s len|l=s ) ],
        "get_genome_seq"          => [ qw( genome|g=s chr|c=s beg|b=s end|e=s len|l=s flank|f=s mask|m ) ],
        "get_genome_align"        => [ qw( genome|g=s chr|c=s beg|b=s end|e=s len|l=s strand|s=s ) ],
        "get_genome_phastcons"    => [ qw( genome|g=s chr|c=s beg|b=s end|e=s len|l=s flank|f=s ) ],
        "split_seq"               => [ qw( word_size|w=s uniq|u ) ],
        "split_bed"               => [ qw( window_size|w=s step_size|s=s ) ],
        "tile_seq"                => [ qw( identity|i=s supress_indels|s ) ],
        "invert_align"            => [ qw( soft|s ) ],
        "patscan_seq"             => [ qw( patterns|p=s patterns_in|P=s comp|c max_hits|h=s max_misses|m=s genome|g=s ) ],
        "create_blast_db"         => [ qw( no_stream|x database|d=s ) ],
        "blast_seq"               => [ qw( database|d=s genome|g=s program|p=s e_val|e=f filter|f cpus|c=s no_filter|F ) ],
        "blat_seq"                => [ qw( genome|g=s tile_size|t=s step_size|s=s min_identity|m=s min_score|M=s one_off|o=s ooc|c ) ],
        "match_seq"               => [ qw( word_size|w=s direction|d=s ) ],
        "create_vmatch_index"     => [ qw( index_name|i=s prefix_length|p=s no_stream|x ) ],
        "vmatch_seq"              => [ qw( genome|g=s index_name|i=s count|c max_hits|m=s hamming_dist|h=s edit_dist|e=s ) ],
        "write_fasta"             => [ qw( wrap|w=s no_stream|x data_out|o=s compress|Z ) ],
        "write_align"             => [ qw( wrap|w=s no_stream|x no_ruler|R no_consensus|C data_out|o=s ) ],
        "write_blast"             => [ qw( no_stream|x data_out|o=s comment|c compress|Z ) ],
        "write_tab"               => [ qw( no_stream|x data_out|o=s delimit|d=s keys|k=s no_keys|K=s comment|c compress|Z ) ],
        "write_bed"               => [ qw( no_stream|x data_out|o=s compress|Z ) ],
        "write_psl"               => [ qw( no_stream|x data_out|o=s compress|Z ) ],
        "write_2bit"              => [ qw( no_stream|x data_out|o=s no_mask|N ) ],
        "write_solid"             => [ qw( wrap|w=s no_stream|x data_out|o=s compress|Z ) ],
        "plot_seqlogo"            => [ qw( no_stream|x data_out|o=s ) ],
        "plot_phastcons_profiles" => [ qw( no_stream|x data_out|o=s genome|g=s mean|m median|M flank|f=s terminal|t=s title|T=s xlabel|X=s ylabel|Y=s ) ],
        "analyze_vals"            => [ qw( no_stream|x keys|k=s ) ],
        "head_records"            => [ qw( num|n=s ) ],
        "remove_keys"             => [ qw( keys|k=s save_keys|K=s ) ],
        "rename_keys"             => [ qw( keys|k=s ) ],
        "uniq_vals"               => [ qw( key|k=s invert|i ) ],
        "merge_vals"              => [ qw( keys|k=s delimit|d=s ) ],
        "grab"                    => [ qw( patterns|p=s patterns_in|P=s regex|r=s eval|e=s exact_in|E=s invert|i case_insensitive|c keys|k=s keys_only|K vals_only|V ) ],
        "compute"                 => [ qw( eval|e=s ) ],
        "add_ident"               => [ qw( prefix|p=s key|k=s ) ],
        "count_records"           => [ qw( no_stream|x data_out|o=s ) ],
        "random_records"          => [ qw( num|n=s ) ],
        "sort_records"            => [ qw( reverse|r keys|k=s ) ],
        "count_vals"              => [ qw( keys|k=s ) ],
        "plot_histogram"          => [ qw( no_stream|x data_out|o=s terminal|t=s title|T=s xlabel|X=s ylabel|Y=s key|k=s sort|s=s ) ],
        "plot_lendist"            => [ qw( no_stream|x data_out|o=s terminal|t=s title|T=s xlabel|X=s ylabel|Y=s key|k=s ) ],
        "plot_chrdist"            => [ qw( no_stream|x data_out|o=s terminal|t=s title|T=s xlabel|X=s ylabel|Y=s ) ],
        "plot_karyogram"          => [ qw( no_stream|x data_out|o=s genome|g=s feat_color|f=s ) ],
        "plot_matches"            => [ qw( no_stream|x data_out|o=s terminal|t=s title|T=s xlabel|X=s ylabel|Y=s direction|d=s ) ],
        "length_vals"             => [ qw( keys|k=s ) ],
        "sum_vals"                => [ qw( no_stream|x data_out|o=s keys|k=s ) ],
        "mean_vals"               => [ qw( no_stream|x data_out|o=s keys|k=s ) ],
        "median_vals"             => [ qw( no_stream|x data_out|o=s keys|k=s ) ],
        "max_vals"                => [ qw( no_stream|x data_out|o=s keys|k=s ) ],
        "min_vals"                => [ qw( no_stream|x data_out|o=s keys|k=s ) ],
        "upload_to_ucsc"          => [ qw( no_stream|x database|d=s table|t=s short_label|s=s long_label|l=s group|g=s priority|p=f use_score|u visibility|v=s wiggle|w color|c=s chunk_size|C=s ) ],
    );

    @options = @{ $script_options{ $script } || [] };

    # Options shared by all scripts.
    push @options, qw(
        stream_in|I=s
        stream_out|O=s
        verbose
    );

    GetOptions(
        \%options,
        @options,
    );

    # Comma separated list options are turned into array references.
    foreach $opt ( qw( cols keys no_keys save_keys quals feats frames ) ) {
        $options{ $opt } = [ split ",", $options{ $opt } ] if defined $options{ $opt };
    }

    # ---- check arguments ----

    if ( $options{ 'data_in' } )
    {
        $options{ "files" } = &getopt_files( $options{ 'data_in' } );

        &Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0;
    }

    if ( $options{ "cols" } )
    {
        foreach $col ( @{ $options{ "cols" } } ) {
            &Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$col") ) if $col !~ /^\d+$/;
        }
    }

    # Option names are matched in full so that a check only applies to the options it lists.
    foreach $opt ( keys %options )
    {
        if ( $opt =~ /^(?:stream_in|patterns_in|exact_in)$/ and not -f $options{ $opt } )
        {
            &Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
        }
        elsif ( $opt =~ /^(?:beg|end|word_size|wrap|chunk_size|tile_size|len|prefix_length|num|skip|cpus|window_size|step_size)$/ and $options{ $opt } !~ /^\d+$/ )
        {
            &Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
        }
        elsif ( $opt =~ /^(?:max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority)$/ and $options{ $opt } !~ /^-?\d+$/ )
        {
            &Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") );
        }
        elsif ( $opt =~ /^(?:identity|min_identity|threshold)$/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ )
        {
            &Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") );
        }
        elsif ( $opt eq "e_val" and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ )
        {
            &Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") );
        }
        elsif ( $opt eq "strand" and $options{ $opt } !~ /^(\+|-)$/ )
        {
            &Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
        }
        elsif ( $opt eq "genome" )
        {
            @genomes = &Maasha::Config::genomes();

            # The genome must be one of the configured genomes.
            if ( not grep { $_ eq $options{ $opt } } @genomes ) {
                &Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'INST_DIR' }/conf/genomes.conf") );
            }
        }
        elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb)/ )
        {
            &Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") );
        }
        elsif ( $opt eq "table" and $options{ $opt } =~ /([-.])/ )
        {
            # Neither "-" nor "." may occur anywhere in a table name.
            &Maasha::Common::error( qq(Character '$1' is not allowed in table names) );
        }
    }

    # ---- check required and mutually exclusive options ----

    &Maasha::Common::error( qq(no --database specified) )                 if $script eq "create_blast_db"     and not $options{ "database" };
    &Maasha::Common::error( qq(no --index_name specified) )               if $script eq "create_vmatch_index" and not $options{ "index_name" };
    &Maasha::Common::error( qq(no --database or --genome specified) )     if $script eq "blast_seq"  and not $options{ "genome" } and not $options{ "database" };
    &Maasha::Common::error( qq(both --database and --genome specified) )  if $script eq "blast_seq"  and $options{ "genome" } and $options{ "database" };
    &Maasha::Common::error( qq(no --index_name or --genome specified) )   if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" };
    &Maasha::Common::error( qq(both --index_name and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index_name" };
    &Maasha::Common::error( qq(no --genome specified) )                   if $script =~ /get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
    &Maasha::Common::error( qq(no --key specified) )                      if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" };
    &Maasha::Common::error( qq(no --keys specified) )                     if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" };

    if ( $script eq "upload_to_ucsc" )
    {
        &Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" };
        &Maasha::Common::error( qq(no --table specified) )    if not $options{ "table" };
    }

    return wantarray ? %options : \%options;
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


sub script_print_usage
{
    # Martin A. Hansen, January 2008.

    # Retrieves usage information from file and
    # prints this nicely formatted.

    my ( $path,   # full path to usage file
       ) = @_;

    # Returns nothing.
+ + my ( $script, $fh, $line, @lines, @list, %hash, $key ); + + $script = ( split "/", $path )[ -1 ]; + + $fh = &Maasha::Common::read_open( $path ); + + push @list, "Program name"; + + $hash{ "Program name" } = [ $script ]; + + while ( $line = <$fh> ) + { + chomp $line; + + $line =~ s/\$script/$script/g; + + if ( $line =~ /^([^:]+):\s+(.+)$/ ) + { + push @list, $1 if not exists $hash{ $1 }; + push @{ $hash{ $1 } }, $2; + } + } + + close $fh; + + print "\n"; + + foreach $key ( @list ) + { + if ( scalar @{ $hash{ $key } } == 1 ) + { + @lines = &Maasha::Common::wrap_line( $hash{ $key }->[ 0 ], 80 ); + + printf( "%-15s%s\n", "$key:", shift @lines ); + + map { printf( "%-15s%s\n", "", $_ ) } @lines; + + print "\n"; + } + else + { + print "$key:\n"; + + map { print " $_\n" } @{ $hash{ $key } }; + + print "\n"; + } + } + + exit; +} + + +sub script_list_biotools +{ + # Martin A. Hansen, January 2008. + + # Prints the description from the usage for each of the biotools. + + my ( $path, # full path to usage directory + ) = @_; + + # Returns nothing. + + my ( @files, $file, $fh, $line, @lines, $program ); + + @files = &Maasha::Common::ls_files( $path ); + + foreach $file ( sort @files ) + { + $program = ( split "/", $file )[ -1 ]; + + $fh = &Maasha::Common::read_open( $file ); + + while ( $line = <$fh> ) + { + chomp $line; + + if ( $line =~ /^Description:\s+(.+)/ ) + { + @lines = &Maasha::Common::wrap_line( $1, 60 ); + + printf( "%-30s%s\n", $program, shift @lines ); + + map { printf( "%-30s%s\n", "", $_ ) } @lines; + } + } + + close $fh; + } + + exit; +} + + +sub script_read_fasta +{ + # Martin A. Hansen, August 2007. + + # Read sequences from FASTA file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $record, $file, $data_in, $entry, $num ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = &Maasha::Common::read_open( $file ); + + while ( $entry = &Maasha::Fasta::get_entry( $data_in ) ) + { + if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) + { + $record = { + SEQ_NAME => $entry->[ SEQ_NAME ], + SEQ => $entry->[ SEQ ], + SEQ_LEN => length $entry->[ SEQ ], + }; + + &put_record( $record, $out ); + } + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_align +{ + # Martin A. Hansen, August 2007. + + # Read aligned sequences from FASTA file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $entry, $record, $file, $data_in, $num ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = &Maasha::Common::read_open( $file ); + + while ( $entry = &Maasha::Fasta::get_entry( $data_in ) ) + { + if ( $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) + { + $record = { + ALIGN => 1, + SEQ_NAME => $entry->[ SEQ_NAME ], + SEQ => $entry->[ SEQ ], + ALIGN_LEN => length $entry->[ SEQ ], + }; + + &put_record( $record, $out ); + } + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_tab +{ + # Martin A. Hansen, August 2007. + + # Read table or table columns from stream or file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $file, $line, @fields, @fields2, $i, $record, $data_in, $skip, $num ); + + $options->{ 'delimit' } ||= '\s+'; + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $skip = $options->{ 'skip' } ||= 0; + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = &Maasha::Common::read_open( $file ); + + while ( $line = <$data_in> ) + { + if ( $skip ) + { + $skip--; + next; + } + + next if $line =~ /^#|^$/; + + chomp $line; + + undef $record; + undef @fields2; + + @fields = split /$options->{'delimit'}/, $line; + + if ( $options->{ "cols" } ) { + map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } }; + } else { + @fields2 = @fields; + } + + for ( $i = 0; $i < @fields2; $i++ ) + { + if ( $options->{ "keys" }->[ $i ] ) { + $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ]; + } else { + $record->{ "V" . $i } = $fields2[ $i ]; + } + } + + &put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_psl +{ + # Martin A. Hansen, August 2007. + + # Read psl table from stream or file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, @files, $file, $entries, $entry, $num ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $entries = &Maasha::UCSC::psl_get_entries( $file ); + + foreach $entry ( @{ $entries } ) + { + &put_record( $entry, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + } + + NUM: +} + + +sub script_read_bed +{ + # Martin A. Hansen, August 2007. + + # Read bed table from stream or file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $file, $record, $entry, $data_in, $num ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = &Maasha::Common::read_open( $file ); + + while ( $entry = &Maasha::UCSC::bed_get_entry( $data_in ) ) + { + &put_record( $entry, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_blast_tab +{ + # Martin A. Hansen, September 2007. + + # Read tabular BLAST output from NCBI blast run with -m8 or -m9. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $file, $line, @fields, $strand, $record, $data_in, $num ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = &Maasha::Common::read_open( $file ); + + while ( $line = <$data_in> ) + { + chomp $line; + + next if $line =~ /^#/; + + @fields = split /\t/, $line; + + $record->{ "REC_TYPE" } = "BLAST"; + $record->{ "Q_ID" } = $fields[ 0 ]; + $record->{ "S_ID" } = $fields[ 1 ]; + $record->{ "IDENT" } = $fields[ 2 ]; + $record->{ "ALIGN_LEN" } = $fields[ 3 ]; + $record->{ "MISMATCHES" } = $fields[ 4 ]; + $record->{ "GAPS" } = $fields[ 5 ]; + $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based + $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based + $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based + $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based + $record->{ "E_VAL" } = $fields[ 10 ]; + $record->{ "BIT_SCORE" } = $fields[ 11 ]; + + if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) + { + $record->{ "STRAND" } = '-'; + + ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); + } + else + { + $record->{ "STRAND" } = '+'; + } 
+ + &put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_embl +{ + # Martin A. Hansen, August 2007. + + # Read EMBL format. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( %options2, $file, $data_in, $num, $entry, $record ); + + map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } }; + map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } }; + map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } }; + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = &Maasha::Common::read_open( $file ); + + while ( $entry = &Maasha::EMBL::get_embl_entry( $data_in ) ) + { + $record = &Maasha::EMBL::parse_embl_entry( $entry, \%options2 ); + + my ( $feat, $feat2, $qual, $qual_val, $record_copy ); + + $record_copy = dclone $record; + + delete $record_copy->{ "FT" }; + + &put_record( $record_copy, $out ); + + delete $record_copy->{ "SEQ" }; + + foreach $feat ( keys %{ $record->{ "FT" } } ) + { + $record_copy->{ "FEAT_TYPE" } = $feat; + + foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } ) + { + foreach $qual ( keys %{ $feat2 } ) + { + $qual_val = join "; ", @{ $feat2->{ $qual } }; + + $qual =~ s/^_//; + $qual = uc $qual; + + $record_copy->{ $qual } = $qual_val; + } + + &put_record( $record_copy, $out ); + } + } + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_stockholm +{ + # Martin A. Hansen, August 2007. + + # Read Stockholm format. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = &Maasha::Common::read_open( $file ); + + while ( $entry = &Maasha::Stockholm::get_stockholm_entry( $data_in ) ) + { + $record = &Maasha::Stockholm::parse_stockholm_entry( $entry ); + + undef $record_anno; + + foreach $key ( keys %{ $record->{ "GF" } } ) { + $record_anno->{ $key } = $record->{ "GF" }->{ $key }; + } + + $record_anno->{ "ALIGN" } = $num; + + &put_record( $record_anno, $out ); + + foreach $seq ( @{ $record->{ "ALIGN" } } ) + { + undef $record_align; + + $record_align = { + ALIGN => $num, + SEQ_NAME => $seq->[ 0 ], + SEQ => $seq->[ 1 ], + }; + + &put_record( $record_align, $out ); + } + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_phastcons +{ + # Martin A. Hansen, December 2007. + + # Read PhastCons format. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $data_in, $file, $num, $entry, @records, $record ); + + $options->{ "min" } ||= 10; + $options->{ "dist" } ||= 25; + $options->{ "threshold" } ||= 0.8; + $options->{ "gap" } ||= 5; + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = &Maasha::Common::read_open( $file ); + + while ( $entry = &Maasha::UCSC::phastcons_get_entry( $data_in ) ) + { + @records = &Maasha::UCSC::phastcons_parse_entry( $entry, $options ); + + foreach $record ( @records ) + { + $record->{ "REC_TYPE" } = "BED"; + $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; + + &put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_soft +{ + # Martin A. Hansen, December 2007. + + # Read soft format. + # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $soft_index = &Maasha::NCBI::soft_index_file( $file ); + + $fh = &Maasha::Common::read_open( $file ); + + @platforms = grep { $_->[ 0 ] =~ /PLATFORM/ } @{ $soft_index }; + + $plat_table = &Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->[ 1 ], $platforms[ -1 ]->[ 2 ] ); + + @samples = grep { $_->[ 0 ] =~ /SAMPLE/ } @{ $soft_index }; + + $old_end = $platforms[ -1 ]->[ 2 ]; + + foreach $sample ( @samples ) + { + $records = &Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->[ 1 ] - $old_end - 1, $sample->[ 2 ] - $old_end - 1 ); + + foreach $record ( @{ $records } ) + { + &put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + $old_end = $sample->[ 2 ]; + } + + close $fh; + } + + NUM: + + close $data_in if $data_in; + close $fh if $fh; +} + + +sub script_read_gff +{ + # Martin A. Hansen, February 2008. + + # Read soft format. + # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $data_in, $file, $fh, $num, $record, $entry ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $fh = &Maasha::Common::read_open( $file ); + + while ( $entry = &Maasha::GFF::get_entry( $fh ) ) + { + &put_record( $entry, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $fh; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_2bit +{ + # Martin A. Hansen, March 2008. + + # Read sequences from 2bit file. 
sub script_read_solexa
{
    # Martin A. Hansen, March 2008.

    # Passes the incoming stream through, then emits one sequence record per
    # line of every file named in the "files" option. Each line is split on
    # ':'; field 5 holds the residues and the last field the space separated
    # scores. Residues scoring below the "quality" option (default 20) are
    # written in lowercase. The "num" option caps the total number of records.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $data_in, $num );

    $options->{ "quality" } ||= 20;

    &put_record( $record, $out ) while $record = &get_record( $in );

    $num = 1;

    FILE:
    foreach my $path ( @{ $options->{ "files" } } )
    {
        $data_in = &Maasha::Common::read_open( $path );

        # Sequence names are built from the file name up to its first dot.
        my $prefix = &Maasha::Common::get_basename( $path );
        $prefix =~ s/\..*//;

        my $serial = 0;

        while ( my $line = <$data_in> )
        {
            my @fields   = split /:/, $line;
            my @residues = split //, $fields[ 5 ];
            my @scores   = split / /, $fields[ -1 ];

            foreach my $pos ( 0 .. $#scores )
            {
                next unless $scores[ $pos ] < $options->{ "quality" };

                $residues[ $pos ] = lc $residues[ $pos ];
            }

            my $seq = join "", @residues;

            # The same record hash is refilled for every line.
            @{ $record }{ "SEQ_NAME", "SEQ", "SEQ_LEN", "SCORE_MEAN" } = (
                sprintf( "%s_ID%08d", $prefix, $serial ),
                $seq,
                length( $seq ),
                sprintf( "%.2f", &Maasha::Calc::mean( \@scores ) ),
            );

            &put_record( $record, $out );

            last FILE if $options->{ "num" } and $num == $options->{ "num" };

            $serial++;
            $num++;
        }

        close $data_in;
    }

    close $data_in if $data_in;
}
sub script_read_mysql
{
    # Martin A. Hansen, May 2008.

    # Passes the incoming stream through, then runs the statement in the
    # "query" option against the database in the "database" option and
    # emits every element of the result list as a record on the out stream.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $dbh, $results, $row );

    # Credentials default to whatever Maasha::UCSC supplies.
    $options->{ "user" }     ||= &Maasha::UCSC::ucsc_get_user();
    $options->{ "password" } ||= &Maasha::UCSC::ucsc_get_password();

    while ( $record = &get_record( $in ) ) {
        &put_record( $record, $out );
    }

    $dbh = &Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } );

    $results = &Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } );

    &Maasha::SQL::disconnect( $dbh );

    # Rows go to the same out stream as the passed-through records; the
    # handle used to be left out here, unlike every other put_record call.
    foreach $row ( @{ $results || [] } ) {
        &put_record( $row, $out );
    }
}
sub script_length_seq
{
    # Martin A. Hansen, August 2007.

    # Adds a SEQ_LEN key to every record holding a SEQ and, after the last
    # record, emits one extra record with the summed length under
    # TOTAL_SEQ_LEN. With the "no_stream" option only that final record
    # is written.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $total );

    # Start from zero so that a stream without sequences reports 0
    # instead of an undefined value.
    $total = 0;

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "SEQ" } )
        {
            $record->{ "SEQ_LEN" } = length $record->{ "SEQ" };
            $total += $record->{ "SEQ_LEN" };
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    &put_record( { TOTAL_SEQ_LEN => $total }, $out );
}
sub script_analyze_tags
{
    # Martin A. Hansen, August 2008.

    # Summarises sequence tags by length. A record counts as a tag when it
    # has SEQ_NAME and SEQ (length taken from SEQ), or else Q_ID and BED_LEN
    # (length taken from BED_LEN), and its name ends in "_<digits>"; the
    # digits are added up as the clone count. Input records are consumed.
    # One record with TAG_LEN, TAG_COUNT and TAG_CLONES is emitted per
    # length, in ascending numerical order.

    my ( $in,    # handle to in stream
         $out,   # handle to out stream
       ) = @_;

    # Returns nothing.

    my ( %count_of, %clones_of );

    while ( my $rec = &get_record( $in ) )
    {
        my ( $name, $tag_len );

        if ( $rec->{ "SEQ_NAME" } and $rec->{ "SEQ" } )
        {
            $name    = $rec->{ "SEQ_NAME" };
            $tag_len = length( $rec->{ "SEQ" } );
        }
        elsif ( $rec->{ "Q_ID" } and $rec->{ "BED_LEN" } )
        {
            $name    = $rec->{ "Q_ID" };
            $tag_len = $rec->{ "BED_LEN" };
        }
        else
        {
            next;
        }

        next unless $name =~ /_(\d+)$/;

        my $clones = $1;

        $count_of{ $tag_len }++;
        $clones_of{ $tag_len } += $clones;
    }

    foreach my $tag_len ( sort { $a <=> $b } keys %count_of )
    {
        my $summary = {};

        $summary->{ "TAG_LEN" }    = $tag_len;
        $summary->{ "TAG_COUNT" }  = $count_of{ $tag_len };
        $summary->{ "TAG_CLONES" } = $clones_of{ $tag_len };

        &put_record( $summary, $out );
    }
}
sub script_oligo_freq
{
    # Martin A. Hansen, August 2007.

    # Tallies the oligos that Maasha::Seq::seq2oligos returns for each
    # sequence in the stream, using the "word_size" option (default 7), and
    # emits the table built from the tally by Maasha::Seq::oligo_freq.
    # Without the "all" option a table is written ahead of each sequence
    # record and the tally is reset; with it, one table for the whole stream
    # is written at the end. All input records are passed through.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( %tally, $record );

    $options->{ "word_size" } ||= 7;

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "SEQ" } )
        {
            foreach my $oligo ( &Maasha::Seq::seq2oligos( \$record->{ "SEQ" }, $options->{ "word_size" } ) ) {
                $tally{ $oligo }++;
            }

            unless ( $options->{ "all" } )
            {
                foreach my $row ( &Maasha::Seq::oligo_freq( \%tally ) ) {
                    &put_record( $row, $out );
                }

                undef %tally;
            }
        }

        &put_record( $record, $out );
    }

    return unless $options->{ "all" };

    foreach my $row ( &Maasha::Seq::oligo_freq( \%tally ) ) {
        &put_record( $row, $out );
    }
}
sub script_calc_bit_scores
{
    # Martin A. Hansen, March 2007.

    # Calculates a bit score for each position of the alignment formed by
    # the sequences in the stream. Residue counts per position (ignoring
    # the indel characters - _ ~ .) are handed to
    # Maasha::Seq::seqlogo_calc_bit_height, and the result is subtracted
    # from 4 (protein) or 2 (anything else). All scores are emitted in one
    # record under the keys V0, V1, ... Records without SEQ are passed
    # through. Nothing is emitted when the stream holds no sequences.

    my ( $in,    # handle to in stream
         $out,   # handle to out stream
       ) = @_;

    # Returns nothing.

    my ( $record, $type, $count, $width, $i, $res, %freq_hash, $bit_max, $bit_height, $scores );

    $count = 0;
    $width = 0;

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "SEQ" } )
        {
            $type = &Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type;

            # Track the widest sequence: positions holding only indels never
            # get a %freq_hash entry, so the number of keys cannot serve as
            # the number of columns.
            $width = length( $record->{ "SEQ" } ) if length( $record->{ "SEQ" } ) > $width;

            for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ )
            {
                $res = substr $record->{ "SEQ" }, $i, 1;

                next if $res =~ /-|_|~|\./;

                $freq_hash{ $i }{ $res }++;
            }

            $count++;
        }
        else
        {
            &put_record( $record, $out );
        }
    }

    # Without sequences there is no type to test and no score to report.
    return if $count == 0;

    $bit_max = ( $type and $type eq "protein" ) ? 4 : 2;

    for ( $i = 0; $i < $width; $i++ )
    {
        # NOTE(review): an indel-only position is passed as an empty tally;
        # confirm seqlogo_calc_bit_height copes with that.
        $bit_height = &Maasha::Seq::seqlogo_calc_bit_height( $freq_hash{ $i } || {}, $count );

        $scores->{ "V" . $i } = sprintf( "%.2f", $bit_max - $bit_height );
    }

    &put_record( $scores, $out );
}
sub script_complement_seq
{
    # Martin A. Hansen, August 2007.

    # Complements the SEQ of every record in the stream in place. The
    # sequence type is guessed once, from the first sequence seen, and
    # selects between Maasha::Seq::rna_comp and Maasha::Seq::dna_comp;
    # for any other type the sequence is left as it is. All records are
    # passed on.

    my ( $in,    # handle to in stream
         $out,   # handle to out stream
       ) = @_;

    # Returns nothing.

    my %complementer_for = (
        "rna" => \&Maasha::Seq::rna_comp,
        "dna" => \&Maasha::Seq::dna_comp,
    );

    my ( $record, $type, $complement );

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "SEQ" } )
        {
            $type ||= &Maasha::Seq::seq_guess_type( $record->{ "SEQ" } );

            $complement = $complementer_for{ $type };

            $complement->( \$record->{ "SEQ" } ) if $complement;
        }

        &put_record( $record, $out );
    }
}
sub script_translate_seq
{
    # Martin A. Hansen, February 2008.

    # Translates DNA sequences in the stream with Maasha::Seq::translate.
    # For every record whose SEQ is guessed to be DNA, one copy of the
    # record is emitted per reading frame in the "frames" option (default
    # 1, 2, 3, -1, -2, -3) with SEQ replaced by the translation, SEQ_LEN
    # set to the length of the translation and FRAME set to the frame.
    # All other records are passed through unchanged.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $frame, %new_record );

    $options->{ "frames" } ||= [ 1, 2, 3, -1, -2, -3 ];

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "SEQ" } and &Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) eq "dna" )
        {
            foreach $frame ( @{ $options->{ "frames" } } )
            {
                %new_record = %{ $record };

                $new_record{ "SEQ" }     = &Maasha::Seq::translate( $record->{ "SEQ" }, $frame );
                $new_record{ "SEQ_LEN" } = length $new_record{ "SEQ" };   # length of the translation, not of the DNA
                $new_record{ "FRAME" }   = $frame;

                &put_record( \%new_record, $out );
            }
        }
        else
        {
            # Also covers sequences that are not DNA, which used to vanish
            # from the stream.
            &put_record( $record, $out );
        }
    }
}
&put_record( $record, $out ); + } +} + + +sub script_get_genome_seq +{ + # Martin A. Hansen, December 2007. + + # Gets a subsequence from a genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i ); + + $options->{ "flank" } ||= 0; + + if ( $options->{ "genome" } ) + { + $genome_file = &Maasha::Config::genome_fasta( $options->{ 'genome' } ); + $index_file = &Maasha::Config::genome_fasta_index( $options->{ 'genome' } ); + + $fh = &Maasha::Common::read_open( $genome_file ); + $index = &Maasha::Fasta::index_retrieve( $index_file ); + + shift @{ $index }; # Get rid of the file size info + + map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index }; + + if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } }; + + $beg = $index_beg + $options->{ "beg" } - 1; + + if ( $options->{ "len" } ) { + $len = $options->{ "len" }; + } elsif ( $options->{ "end" } ) { + $len = ( $options->{ "end" } - $options->{ "beg" } + 1 ); + } + + $beg -= $options->{ "flank" }; + $len += 2 * $options->{ "flank" }; + + if ( $beg <= $index_beg ) + { + $len -= $index_beg - $beg; + $beg = $index_beg; + } + + $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; + + next if $beg > $index_beg + $index_len; + + $record->{ "CHR" } = $options->{ "chr" }; + $record->{ "CHR_BEG" } = $beg - $index_beg; + $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; + + $record->{ "SEQ" } = &Maasha::Common::file_read( $fh, $beg, $len ); + $record->{ "SEQ_LEN" } = $len; + + &put_record( $record, $out ); + } + } + + while ( $record = &get_record( $in ) ) + { + if ( $options->{ "genome" } and 
not $record->{ "SEQ" } ) + { + if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } }; + + $beg = $record->{ "CHR_BEG" } + $index_beg; + $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; + + $beg = $record->{ "S_BEG" } + $index_beg; + $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; + + $beg = $record->{ "S_BEG" } + $index_beg; + $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; + } + + $beg -= $options->{ "flank" }; + $len += 2 * $options->{ "flank" }; + + if ( $beg <= $index_beg ) + { + $len -= $index_beg - $beg; + $beg = $index_beg; + } + + $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; + + next if $beg > $index_beg + $index_len; + + $record->{ "CHR_BEG" } = $beg - $index_beg; + $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; + + $record->{ "SEQ" } = &Maasha::Common::file_read( $fh, $beg, $len ); + + if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" ) + { + &Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); + $record->{ "SEQ" } = reverse $record->{ "SEQ" }; + } + + if ( $options->{ "mask" } ) + { + if ( $record->{ "BLOCKCOUNT" } > 1 ) # uppercase hit block segments and lowercase the rest. 
sub script_get_genome_align
{
    # Martin A. Hansen, April 2008.

    # Gets subalignments from a multiple genome alignment using
    # Maasha::UCSC::maf_extract on the MAF track configured for the
    # "genome" option. An alignment is fetched for the interval given by
    # the options "chr", "beg" and "end" or "len" (strand defaults to "+"),
    # and for every BED, PSL and BLAST record in the stream. Each entry of
    # an alignment is emitted as a record carrying the number of its
    # alignment in ALIGN plus the coordinates it was fetched for. Records of
    # any other type are passed through unchanged.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $rec_type, $maf_track, $align, $align_num, $beg, $end, $entry );

    $options->{ "strand" } ||= "+";

    $align_num = 1;

    $maf_track = &Maasha::Config::maf_track( $options->{ "genome" } );

    if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) )
    {
        $beg = $options->{ "beg" } - 1;

        if ( $options->{ "end" } ) {
            $end = $options->{ "end" };
        } elsif ( $options->{ "len" } ) {
            $end = $beg + $options->{ "len" };
        }

        $align = &Maasha::UCSC::maf_extract( $TMP_DIR, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } );

        foreach $entry ( @{ $align } )
        {
            # There is no stream record in this branch, so the annotation
            # comes from the request itself.
            # NOTE(review): CHR_BEG/CHR_END are the values passed to
            # maf_extract - confirm they follow the record convention.
            $entry->{ "ALIGN" }   = $align_num;
            $entry->{ "CHR" }     = $options->{ "chr" };
            $entry->{ "CHR_BEG" } = $beg;
            $entry->{ "CHR_END" } = $end;
            $entry->{ "STRAND" }  = $options->{ "strand" };

            &put_record( $entry, $out );
        }

        # Keep the alignment number unique for the stream records below.
        $align_num++;
    }

    while ( $record = &get_record( $in ) )
    {
        $rec_type = $record->{ "REC_TYPE" } || "";

        if ( $rec_type eq "BED" )
        {
            $align = &Maasha::UCSC::maf_extract( $TMP_DIR, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } );
        }
        elsif ( $rec_type eq "PSL" or $rec_type eq "BLAST" )
        {
            $align = &Maasha::UCSC::maf_extract( $TMP_DIR, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } );
        }
        else
        {
            # No interval to look up; without this branch the alignment of
            # the previous record would be emitted a second time.
            &put_record( $record, $out );

            next;
        }

        foreach $entry ( @{ $align } )
        {
            $entry->{ "ALIGN" }   = $align_num;
            $entry->{ "CHR" }     = $record->{ "CHR" };
            $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" };
            $entry->{ "CHR_END" } = $record->{ "CHR_END" };
            $entry->{ "STRAND" }  = $record->{ "STRAND" };
            $entry->{ "Q_ID" }    = $record->{ "Q_ID" };
            $entry->{ "SCORE" }   = $record->{ "SCORE" };

            &put_record( $entry, $out );
        }

        $align_num++;
    }
}
+ + my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record ); + + $options->{ "flank" } ||= 0; + + $phastcons_file = &Maasha::Config::genome_phastcons( $options->{ "genome" } ); + $phastcons_index = &Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); + + $index = &Maasha::UCSC::phastcons_index_retrieve( $phastcons_index ); + $fh_phastcons = &Maasha::Common::read_open( $phastcons_file ); + + if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) + { + $options->{ "beg" } -= 1; # request is 1-based + $options->{ "end" } -= 1; # request is 1-based + + if ( $options->{ "len" } ) { + $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1; + } + + $scores = &Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } ); + + $record->{ "CHR" } = $options->{ "chr" }; + $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" }; + $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" }; + + $record->{ "PHASTCONS" } = join ",", @{ $scores }; + $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG + + &put_record( $record, $out ); + } + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "REC_TYPE" } eq "BED" ) + { + $scores = &Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" ) + { + $scores = &Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) + { + $scores = &Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); + } + + $record->{ 
sub script_split_seq
{
    # Martin A. Hansen, August 2007.

    # Splits every sequence in the stream into overlapping words of
    # "word_size" residues (default 7), moving one residue at a time. Each
    # word is emitted as a record of type SPLIT named after the source
    # sequence with the 1-based word interval appended, e.g. name[1-7].
    # With the "uniq" option a word is emitted only the first time it is
    # seen anywhere in the stream. Sequence records themselves are not
    # passed on; records lacking SEQ_NAME or SEQ are.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $i, $subseq, %lookup );

    $options->{ "word_size" } ||= 7;

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
        {
            for ( $i = 0; $i < length( $record->{ "SEQ" } ) - $options->{ "word_size" } + 1; $i++ )
            {
                $subseq = substr $record->{ "SEQ" }, $i, $options->{ "word_size" };

                if ( $options->{ "uniq" } )
                {
                    # Repeated words used to fall through to an else branch
                    # that emitted them anyway.
                    next if $lookup{ $subseq };

                    $lookup{ $subseq } = 1;
                }

                &put_record(
                    {
                        "REC_TYPE" => "SPLIT",
                        "SEQ_NAME" => $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]",
                        "SEQ"      => $subseq,
                    },
                    $out
                );
            }
        }
        else
        {
            &put_record( $record, $out );
        }
    }
}
sub script_tile_seq
{
    # Martin A. Hansen, February 2008.

    # Takes the first sequence record in the stream as reference and hands
    # it, together with all later sequence records and the options, to
    # Maasha::Align::align_tile. The entries that come back are emitted as
    # records flagged with ALIGN => 1. Records lacking SEQ_NAME or SEQ are
    # passed through as they are read.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $reference, @queries, @tiled, $pair );

    while ( $record = &get_record( $in ) )
    {
        if ( not ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) )
        {
            &put_record( $record, $out );

            next;
        }

        $pair = [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ];

        if ( defined $reference ) {
            push @queries, $pair;
        } else {
            $reference = $pair;
        }
    }

    @tiled = &Maasha::Align::align_tile( $reference, \@queries, $options );

    # SEQ_NAME and SEQ inside the [ ] are index constants, presumably
    # declared elsewhere in this file.
    foreach $pair ( @tiled ) {
        &put_record( { SEQ_NAME => $pair->[ SEQ_NAME ], SEQ => $pair->[ SEQ ], ALIGN => 1 }, $out );
    }
}
sub script_patscan_seq
{
    # Martin A. Hansen, August 2007.

    # Locates patterns in sequences using the external scan_for_matches
    # program. Patterns are taken from the "patterns" option or read from
    # the file in "patterns_in". The target is the genome named by the
    # "genome" option when given, otherwise the sequences in the stream,
    # which are written to a temporary FASTA file. Stream records are
    # consumed, not passed through; one record is emitted per match. In
    # genome mode S_ID, S_BEG and S_END are renamed CHR, CHR_BEG and CHR_END.
    #
    # NOTE(review): option values end up on a shell command line below, so
    # they must come from a trusted source.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $genome_file, @args, $arg, $type, $seq_file, $pat_file, $out_file, $target_file, $fh_in, $fh_out, $record, $patterns, $pattern, $entry, $result, %head_hash, $i );

    if ( $options->{ "patterns" } ) {
        $patterns = &Maasha::Patscan::parse_patterns( $options->{ "patterns" } );
    } elsif ( $options->{ "patterns_in" } and -f $options->{ "patterns_in" } ) {
        $patterns = &Maasha::Patscan::read_patterns( $options->{ "patterns_in" } );
    }

    $genome_file = &Maasha::Config::genome_fasta( $options->{ "genome" } ) if $options->{ "genome" };

    push @args, "-c"                               if $options->{ "comp" };
    push @args, "-m " . $options->{ "max_hits" }   if $options->{ "max_hits" };
    push @args, "-n " . $options->{ "max_misses" } if $options->{ "max_misses" };   # was gated on max_hits

    $seq_file = "$TMP_DIR/patscan.seq";
    $pat_file = "$TMP_DIR/patscan.pat";
    $out_file = "$TMP_DIR/patscan.out";

    $fh_out = &Maasha::Common::write_open( $seq_file );

    $i = 0;

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } )
        {
            $type = &Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type;

            # Sequences are written under a running number; %head_hash maps
            # the number back to the real name when the hits are parsed.
            &Maasha::Fasta::put_entry( [ $i, $record->{ "SEQ" } ], $fh_out );

            $head_hash{ $i } = $record->{ "SEQ_NAME" };

            $i++;
        }
    }

    close $fh_out;

    $arg  = join " ", @args;
    $arg .= " -p" if $type and $type eq "protein";   # $type is unset when the stream held no sequences

    $target_file = $options->{ "genome" } ? $genome_file : $seq_file;

    foreach $pattern ( @{ $patterns } )
    {
        $fh_out = &Maasha::Common::write_open( $pat_file );

        print $fh_out "$pattern\n";

        close $fh_out;

        `scan_for_matches $arg $pat_file < $target_file > $out_file`;

        $fh_in = &Maasha::Common::read_open( $out_file );

        while ( $entry = &Maasha::Fasta::get_entry( $fh_in ) )
        {
            $result = &Maasha::Patscan::parse_scan_result( $entry, $pattern );

            if ( $options->{ "genome" } )
            {
                $result->{ "CHR" }     = delete $result->{ "S_ID" };
                $result->{ "CHR_BEG" } = delete $result->{ "S_BEG" };
                $result->{ "CHR_END" } = delete $result->{ "S_END" };
            }
            else
            {
                $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } };
            }

            &put_record( $result, $out );
        }

        close $fh_in;
    }

    unlink $pat_file;
    unlink $seq_file;
    unlink $out_file;
}
+ + my ( $fh, $seq_type, $path, $record ); + + $path = $options->{ "database" }; + + $fh = &Maasha::Common::write_open( $path ); + + while ( $record = &get_record( $in ) ) + { + &put_record( $record, $out ) if not $options->{ "no_stream" }; + + if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } ) + { + $seq_type = &Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $seq_type; + + &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh ); + } + } + + close $fh; + + if ( $seq_type eq "protein" ) { + &Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' }" ); + } else { + &Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' }" ); + } + + unlink $path; +} + + +sub script_blast_seq +{ + # Martin A. Hansen, September 2007. + + # BLASTs sequences in stream against a given database. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields ); + + $options->{ "e_val" } = 10 if not defined $options->{ "e_val" }; + $options->{ "filter" } = "F"; + $options->{ "filter" } = "T" if $options->{ "filter" }; + $options->{ "cpus" } ||= 1; + + $options->{ "database" } = &Maasha::Config::genome_blast( $options->{ 'genome' } ) if $options->{ 'genome' }; + + $tmp_in = "$TMP_DIR/blast_query.seq"; + $tmp_out = "$TMP_DIR/blast.result"; + + $fh_out = &Maasha::Common::write_open( $tmp_in ); + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + $q_type = &Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $q_type; + + &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out ); + } + + &put_record( $record, $out ); + } + + close $fh_out; + + if ( -f $options->{ 'database' } . 
".phr" ) { + $s_type = "protein"; + } else { + $s_type = "nucleotide"; + } + + if ( not $options->{ 'program' } ) + { + if ( $q_type ne "protein" and $s_type ne "protein" ) { + $options->{ 'program' } = "blastn"; + } elsif ( $q_type eq "protein" and $s_type eq "protein" ) { + $options->{ 'program' } = "blastp"; + } elsif ( $q_type ne "protein" and $s_type eq "protein" ) { + $options->{ 'program' } = "blastx"; + } elsif ( $q_type eq "protein" and $s_type ne "protein" ) { + $options->{ 'program' } = "tblastn"; + } + } + + &Maasha::Common::run( "blastall", "-p $options->{ 'program' } -e $options->{ 'e_val' } -a $options->{ 'cpus' } -m 8 -i $tmp_in -d $options->{ 'database' } -F $options->{ 'filter' } -o $tmp_out > /dev/null 2>&1", 1 ); + + unlink $tmp_in; + + $fh_out = &Maasha::Common::read_open( $tmp_out ); + + undef $record; + + while ( $line = <$fh_out> ) + { + chomp $line; + + next if $line =~ /^#/; + + @fields = split /\s+/, $line; + + $record->{ "REC_TYPE" } = "BLAST"; + $record->{ "Q_ID" } = $fields[ 0 ]; + $record->{ "S_ID" } = $fields[ 1 ]; + $record->{ "IDENT" } = $fields[ 2 ]; + $record->{ "ALIGN_LEN" } = $fields[ 3 ]; + $record->{ "MISMATCHES" } = $fields[ 4 ]; + $record->{ "GAPS" } = $fields[ 5 ]; + $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based + $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based + $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based + $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based + $record->{ "E_VAL" } = $fields[ 10 ]; + $record->{ "BIT_SCORE" } = $fields[ 11 ]; + + if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) + { + $record->{ "STRAND" } = '-'; + + ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); + } + else + { + $record->{ "STRAND" } = '+'; + } + + &put_record( $record, $out ); + } + + close $fh_out; + + unlink $tmp_out; +} + + +sub script_blat_seq +{ + # Martin A. Hansen, August 2007. + + # BLATs sequences in stream against a given genome. 
+ + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries ); + + $genome_file = &Maasha::Config::genome_fasta( $options->{ "genome" } ); + + $options->{ 'tile_size' } ||= 11; + $options->{ 'one_off' } ||= 0; + $options->{ 'min_identity' } ||= 90; + $options->{ 'min_score' } ||= 0; + $options->{ 'step_size' } ||= $options->{ 'tile_size' }; + + $blat_args .= " -tileSize=$options->{ 'tile_size' }"; + $blat_args .= " -oneOff=$options->{ 'one_off' }"; + $blat_args .= " -minIdentity=$options->{ 'min_identity' }"; + $blat_args .= " -minScore=$options->{ 'min_score' }"; + $blat_args .= " -stepSize=$options->{ 'step_size' }"; + $blat_args .= " -ooc=" . &Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' }; + + $query_file = "$TMP_DIR/blat.seq"; + + $fh_out = &Maasha::Common::write_open( $query_file ); + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out, 80 ); + $type = &Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; + } + + &put_record( $record, $out ); + } + + close $fh_out; + + $blat_args .= " -t=dnax" if $type eq "protein"; + $blat_args .= " -q=$type"; + + $result_file = "$TMP_DIR/blat.psl"; + + &Maasha::Common::run( "blat", "$genome_file $query_file $blat_args $result_file > /dev/null 2>&1" ); + + unlink $query_file; + + $entries = &Maasha::UCSC::psl_get_entries( $result_file ); + + map { &put_record( $_, $out ) } @{ $entries }; + + unlink $result_file; +} + + +sub script_match_seq +{ + # Martin A. Hansen, August 2007. + + # BLATs sequences in stream against a given genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $record, @entries, $results ); + + $options->{ "word_size" } ||= 20; + $options->{ "direction" } ||= "both"; + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + + &put_record( $record, $out ); + } + + if ( @entries == 1 ) + { + $results = &Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $TMP_DIR ); + + map { &put_record( $_, $out ) } @{ $results }; + } + elsif ( @entries == 2 ) + { + $results = &Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $TMP_DIR ); + + map { &put_record( $_, $out ) } @{ $results }; + } +} + + +sub script_create_vmatch_index +{ + # Martin A. Hansen, January 2008. + + # Create a vmatch index from sequences in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file_tmp, $fh_tmp, $type ); + + if ( $options->{ "index_name" } ) + { + $file_tmp = $options->{ 'index_name' }; + $fh_tmp = &Maasha::Common::write_open( $file_tmp ); + } + + while ( $record = &get_record( $in ) ) + { + if ( $options->{ "index_name" } and $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_tmp ); + + $type = &Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + if ( $options->{ "index_name" } ) + { + close $fh_tmp; + + if ( $type eq "protein" ) { + &Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); + } else { + &Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); + } + + unlink $file_tmp; + } +} + + +sub script_vmatch_seq +{ + # Martin A. 
Hansen, August 2007. + + # Vmatches sequences in stream against a given genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( @index_files, @records, $result_file, $fh_in, $record ); + + $options->{ 'count' } = 1 if $options->{ 'max_hits' }; + + if ( $options->{ "index_name" } ) { + @index_files = $options->{ "index_name" }; + } else { + @index_files = &Maasha::Config::genome_vmatch( $options->{ "genome" } ); + } + + while ( $record = &get_record( $in ) ) + { + push @records, $record; + + &put_record( $record, $out ); + } + + $result_file = &Maasha::Match::match_vmatch( $TMP_DIR, \@records, \@index_files, $options ); + + undef @records; + + $fh_in = &Maasha::Common::read_open( $result_file ); + + while ( $record = &Maasha::Match::vmatch_get_entry( $fh_in ) ) { + &put_record( $record, $out ); + } + + close $fh_in; + + unlink $result_file; +} + + +sub script_write_fasta +{ + # Martin A. Hansen, August 2007. + + # Write FASTA entries from sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $fh ); + + $fh = &write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh, $options->{ "wrap" } ); + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_align +{ + # Martin A. Hansen, August 2007. + + # Write pretty alignments aligned sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $fh, $record, $align, $old_align, @entries ); + + $fh = &write_stream( $options->{ "data_out" } ) ; + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "ALIGN" } and $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + $align = $record->{ "ALIGN" }; + + if ( not $old_align ) + { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + + $old_align = $align; + } + elsif ( $align == $old_align ) + { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + else + { + if ( scalar( @entries ) == 2 ) { + &Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } ); + } elsif ( scalar ( @entries ) > 2 ) { + &Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } ); + } + + undef @entries; + $old_align = $align; + } + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + if ( scalar( @entries ) == 2 ) { + &Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } ); + } elsif ( scalar ( @entries ) > 2 ) { + &Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } ); + } + + close $fh if $fh; +} + + +sub script_write_blast +{ + # Martin A. Hansen, November 2007. + + # Write data in blast table format (-m8 and 9). + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, $first ); + + $fh = &write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ; + + $first = 1; + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "REC_TYPE" } eq "BLAST" ) + { + if ( $options->{ "comment" } and $first ) + { + print "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. 
end, e-value, bit score\n"; + + $first = 0; + } + + if ( $record->{ "STRAND" } eq "-" ) { + ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); + } + + print $fh join( "\t", + $record->{ "Q_ID" }, + $record->{ "S_ID" }, + $record->{ "IDENT" }, + $record->{ "ALIGN_LEN" }, + $record->{ "MISMATCHES" }, + $record->{ "GAPS" }, + $record->{ "Q_BEG" } + 1, + $record->{ "Q_END" } + 1, + $record->{ "S_BEG" } + 1, + $record->{ "S_END" } + 1, + $record->{ "E_VAL" }, + $record->{ "BIT_SCORE" } + ), "\n"; + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_tab +{ + # Martin A. Hansen, August 2007. + + # Write data as table. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, $key, @keys, @vals, $ok, %no_keys, $A, $B ); + + $options->{ "delimit" } ||= "\t"; + + map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } }; + + $fh = &write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = &get_record( $in ) ) + { + undef @vals; + $ok = 1; + + if ( $options->{ "keys" } ) + { + map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } }; + + if ( $ok ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( exists $record->{ $key } ) + { + push @keys, $key if $options->{ "comment" }; + push @vals, $record->{ $key }; + } + } + } + } + else + { + foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) + { + next if exists $no_keys{ $key }; + + push @keys, $key if $options->{ "comment" }; + push @vals, $record->{ $key }; + } + } + + if ( @keys and $options->{ "comment" } ) + { + print $fh "#", join( $options->{ "delimit" }, @keys ), "\n"; + + delete $options->{ "comment" }; + } + + print $fh join( $options->{ "delimit" }, @vals ), "\n" if @vals; + + &put_record( $record, $out ) 
if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_bed +{ + # Martin A. Hansen, August 2007. + + # Write BED format for the UCSC genome browser using records in stream. + + # Crude - needs lots of work! + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, $new_record ); + + $fh = &write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "REC_TYPE" } eq "BED" ) # ---- Hits from BED ---- + { + &Maasha::UCSC::bed_put_entry( $record, $fh, $record->{ "BED_COLS" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAT (PSL) ---- + { + $new_record->{ "CHR" } = $record->{ "S_ID" }; + $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; + $new_record->{ "CHR_END" } = $record->{ "S_END" }; + $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; + $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; + $new_record->{ "STRAND" } = $record->{ "STRAND" }; + + &Maasha::UCSC::bed_put_entry( $new_record, $fh, 6 ); + } + elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) # ---- Hits from patscan_seq ---- + { + &Maasha::UCSC::bed_put_entry( $record, $fh, 6 ); + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from BLAST ---- + { + $new_record->{ "CHR" } = $record->{ "S_ID" }; + $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; + $new_record->{ "CHR_END" } = $record->{ "S_END" }; + $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; + $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow + $new_record->{ "STRAND" } = $record->{ "STRAND" }; + + &Maasha::UCSC::bed_put_entry( $new_record, $fh, 6 ); + } + elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) # ---- Hits from Vmatch ---- + { + $new_record->{ "CHR" } = $record->{ "S_ID" 
}; + $new_record->{ "CHR_BEG" } = $record->{ "S_BEG" }; + $new_record->{ "CHR_END" } = $record->{ "S_END" }; + $new_record->{ "Q_ID" } = $record->{ "Q_ID" }; + $new_record->{ "SCORE" } = $record->{ "SCORE" } || 999; # or use E_VAL somehow + $new_record->{ "STRAND" } = $record->{ "STRAND" }; + + &Maasha::UCSC::bed_put_entry( $new_record, $fh, 6 ); + } + elsif ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) # ---- Generic data from tables ---- + { + &Maasha::UCSC::bed_put_entry( $record, $fh ); + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_psl +{ + # Martin A. Hansen, August 2007. + + # Write PSL output from stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, @output, $first ); + + $first = 1; + + $fh = &write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = &get_record( $in ) ) + { + &put_record( $record, $out ) if not $options->{ "no_stream" }; + + if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" ) + { + &Maasha::UCSC::psl_put_header( $fh ) if $first; + &Maasha::UCSC::psl_put_entry( $record, $fh ); + $first = 0; + } + } + + close $fh; +} + + +sub script_write_2bit +{ + # Martin A. Hansen, March 2008. + + # Write sequence entries from stream in 2bit format. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out ); + + $mask = 1 if not $options->{ "no_mask" }; + + $tmp_file = "$TMP_DIR/write_2bit.fna"; + $fh_tmp = &Maasha::Common::write_open( $tmp_file ); + + $fh_out = &write_stream( $options->{ "data_out" } ); + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_tmp ); + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh_tmp; + + $fh_in = &Maasha::Common::read_open( $tmp_file ); + + &Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask ); + + close $fh_in; + close $fh_out; + + unlink $tmp_file; +} + + +sub script_write_solid +{ + # Martin A. Hansen, April 2008. + + # Write di-base encoded Solid sequence from entries in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $fh, $seq_cs ); + + $fh = &write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + $seq_cs = &Maasha::Solid::seq2color_space( $record->{ "SEQ" } ); + + &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $seq_cs ], $fh, $options->{ "wrap" } ); + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_plot_seqlogo +{ + # Martin A. Hansen, August 2007. + + # Calculates and writes a sequence logo for alignments. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. 
+ + my ( $record, @entries, $logo, $fh ); + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $logo = &Maasha::Plot::seq_logo( \@entries ); + + $fh = &write_stream( $options->{ "data_out" } ); + + print $fh $logo; + + close $fh; +} + + +sub script_plot_phastcons_profiles +{ + # Martin A. Hansen, January 2008. + + # Plots PhastCons profiles. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh ); + + $options->{ "title" } ||= "PhastCons Profiles"; + + $phastcons_file = &Maasha::Config::genome_phastcons( $options->{ "genome" } ); + $phastcons_index = &Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); + + $index = &Maasha::UCSC::phastcons_index_retrieve( $phastcons_index ); + $fh_phastcons = &Maasha::Common::read_open( $phastcons_file ); + + while ( $record = &get_record( $in ) ) + { + if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) + { + $scores = &Maasha::UCSC::phastcons_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); + + push @{ $AoA }, [ @{ $scores } ]; + } + + &put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + &Maasha::UCSC::phastcons_normalize( $AoA ); + + $AoA = [ [ &Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" }; + $AoA = [ [ &Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" }; + + $AoA = &Maasha::Matrix::matrix_flip( $AoA ); + + $plot = &Maasha::Plot::lineplot_simple( $AoA, $options, $TMP_DIR ); + + $fh = &write_stream( $options->{ "data_out" } ); + + print $fh "$_\n" foreach @{ $plot }; + + close $fh; +} + + 
sub script_analyze_bed
{
    # Martin A. Hansen, March 2008.

    # Analyze BED entries in stream.
    #
    # Records of type BED are replaced by the result of
    # Maasha::UCSC::bed_analyze; all other records pass through unchanged.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record );

    while ( $record = &get_record( $in ) )
    {
        $record = &Maasha::UCSC::bed_analyze( $record ) if defined $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "BED";

        &put_record( $record, $out );
    }
}


sub script_analyze_vals
{
    # Martin A. Hansen, August 2007.

    # Analyze values for given keys in stream.
    #
    # For every key (or only those listed in the "keys" option) the count,
    # sum, minimum, maximum and mean are collected. Numeric values are used
    # as they are; for other values the string length is used. The summary
    # is printed to the out stream as a plain text table after the records.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key, @keys, %key_hash, $analysis, $stat, $val );

    map { $key_hash{ $_ } = 1 } @{ $options->{ "keys" } || [] };

    while ( $record = &get_record( $in ) )
    {
        foreach $key ( keys %{ $record } )
        {
            next if $options->{ "keys" } and not exists $key_hash{ $key };

            $stat = $analysis->{ $key } ||= {};

            $stat->{ "COUNT" }++;

            if ( &Maasha::Calc::is_a_number( $record->{ $key } ) )
            {
                $stat->{ "TYPE" } = "num";

                $val = $record->{ $key };
            }
            else
            {
                $stat->{ "TYPE" } = "alph";

                $val = length $record->{ $key };
            }

            $stat->{ "SUM" } += $val;

            # Initialise on "not defined" instead of "not true", so that 0 and
            # negative values are handled. The string branch used to read the
            # mistyped key "MIM", which kept MIN from being updated.
            $stat->{ "MAX" } = $val if not defined $stat->{ "MAX" } or $val > $stat->{ "MAX" };
            $stat->{ "MIN" } = $val if not defined $stat->{ "MIN" } or $val < $stat->{ "MIN" };
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    foreach $key ( keys %{ $analysis } )
    {
        $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" };
        $analysis->{ $key }->{ "SUM" }  = sprintf "%.2f", $analysis->{ $key }->{ "SUM" };   # was read from the mistyped key "SUN"
    }

    my ( $keys, $types, $counts, $mins, $maxs, $sums, $means );

    # Row labels are padded to a common width so the columns line up.
    $keys   = sprintf "%-5s", "KEY";
    $types  = sprintf "%-5s", "TYPE";
    $counts = sprintf "%-5s", "COUNT";
    $mins   = sprintf "%-5s", "MIN";
    $maxs   = sprintf "%-5s", "MAX";
    $sums   = sprintf "%-5s", "SUM";
    $means  = sprintf "%-5s", "MEAN";

    if ( $options->{ "keys" } ) {
        @keys = @{ $options->{ "keys" } };
    } else {
        @keys = keys %{ $analysis };
    }

    foreach $key ( @keys )
    {
        $keys   .= sprintf "% 15s", $key;
        $types  .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" };
        $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" };
        $mins   .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" };
        $maxs   .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" };
        $sums   .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" };
        $means  .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" };
    }

    print $out "$keys\n";
    print $out "$types\n";
    print $out "$counts\n";
    print $out "$mins\n";
    print $out "$maxs\n";
    print $out "$sums\n";
    print $out "$means\n";
}


sub script_head_records
{
    # Martin A. Hansen, August 2007.

    # Display the first records in stream; the "num" option sets how many
    # (default 10).

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $count );

    $options->{ "num" } ||= 10;

    $count = 0;

    while ( $record = &get_record( $in ) )
    {
        $count++;

        &put_record( $record, $out );

        last if $count == $options->{ "num" };
    }
}


sub script_remove_keys
{
    # Martin A. Hansen, August 2007.

    # Remove keys from stream.
    #
    # With the "keys" option the listed keys are deleted from every record;
    # otherwise, with "save_keys", only the listed keys are kept. Records
    # left without any keys are not written.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $new_record, $key );

    while ( $record = &get_record( $in ) )
    {
        if ( $options->{ "keys" } )
        {
            delete $record->{ $_ } foreach @{ $options->{ "keys" } };
        }
        elsif ( $options->{ "save_keys" } )
        {
            # A fresh hash per record: the old shared hash let keys saved
            # from earlier records leak into later ones.
            $new_record = {};

            foreach $key ( @{ $options->{ "save_keys" } } ) {
                $new_record->{ $key } = $record->{ $key } if exists $record->{ $key };
            }

            $record = $new_record;
        }

        &put_record( $record, $out ) if keys %{ $record };
    }
}


sub script_rename_keys
{
    # Martin A. Hansen, August 2007.

    # Rename keys in stream: the first key in the "keys" option is renamed
    # to the second one in every record that holds it.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $old_key, $new_key, $value );

    ( $old_key, $new_key ) = @{ $options->{ "keys" } || [] };

    while ( $record = &get_record( $in ) )
    {
        if ( exists $record->{ $old_key } )
        {
            # Delete before assigning, so that renaming a key to itself
            # keeps the value instead of removing it.
            $value = delete $record->{ $old_key };

            $record->{ $new_key } = $value;
        }

        &put_record( $record, $out );
    }
}


sub script_uniq_vals
{
    # Martin A. Hansen, August 2007.

    # Find unique values in stream.
    #
    # Of the records holding the key named by the "key" option only the
    # first one per value is written; with "invert" only the repeats are
    # written. Records without the key pass through unchanged.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( %hash, $record, $val );

    while ( $record = &get_record( $in ) )
    {
        $val = $record->{ $options->{ "key" } };

        if ( not defined $val )   # defined rather than true, so a value of 0 is compared too
        {
            &put_record( $record, $out );
        }
        elsif ( not $hash{ $val } and not $options->{ "invert" } )
        {
            &put_record( $record, $out );

            $hash{ $val } = 1;
        }
        elsif ( $hash{ $val } and $options->{ "invert" } )
        {
            &put_record( $record, $out );
        }
        else
        {
            $hash{ $val } = 1;
        }
    }
}


sub script_merge_vals
{
    # Martin A. Hansen, August 2007.

    # Merge the values of the given keys into the first key.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.
+ + my ( $record, @join, $i ); + + $options->{ "delimit" } ||= '_'; + + while ( $record = &get_record( $in ) ) + { + if ( exists $record->{ $options->{ "keys" }->[ 0 ] } ) + { + @join = $record->{ $options->{ "keys" }->[ 0 ] }; + + for ( $i = 1; $i < @{ $options->{ "keys" } }; $i++ ) { + push @join, $record->{ $options->{ "keys" }->[ $i ] } if exists $record->{ $options->{ "keys" }->[ $i ] }; + } + + $record->{ $options->{ "keys" }->[ 0 ] } = join $options->{ "delimit" }, @join; + } + + &put_record( $record, $out ); + } +} + + +sub script_grab +{ + # Martin A. Hansen, August 2007. + + # Grab for records in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $patterns, $pattern, $record, $key, $pos, $op, $val, %lookup_hash ); + + if ( $options->{ "patterns" } ) + { + $patterns = [ split ",", $options->{ "patterns" } ]; + } + elsif ( -f $options->{ "patterns_in" } ) + { + $patterns = &Maasha::Patscan::read_patterns( $options->{ "patterns_in" } ); + } + elsif ( -f $options->{ "exact_in" } ) + { + $patterns = &Maasha::Patscan::read_patterns( $options->{ "exact_in" } ); + + map { $lookup_hash{ $_ } = 1 } @{ $patterns }; + + undef $patterns; + } + + if ( $options->{ "eval" } ) + { + if ( $options->{ "eval" } =~ /^([^><=! 
]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ ) + { + $key = $1; + $op = $2; + $val = $3; + } + } + + while ( $record = &get_record( $in ) ) + { + $pos = -1; + + if ( %lookup_hash ) + { + if ( $options->{ "keys" } ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( exists $lookup_hash{ $record->{ $key } } ) + { + $pos = 1; + goto FOUND; + } + } + } + else + { + foreach $key ( keys %{ $record } ) + { + if ( not $options->{ "vals_only" } ) + { + if ( exists $lookup_hash{ $key } ) + { + $pos = 1; + goto FOUND; + } + } + + if ( not $options->{ "keys_only" } ) + { + if ( exists $lookup_hash{ $record->{ $key } } ) + { + $pos = 1; + goto FOUND; + } + } + } + } + } + elsif ( $patterns ) + { + foreach $pattern ( @{ $patterns } ) + { + if ( $options->{ "keys" } ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + $pos = index $record->{ $key }, $pattern; + + goto FOUND if $pos >= 0; + } + } + else + { + foreach $key ( keys %{ $record } ) + { + if ( not $options->{ "vals_only" } ) + { + $pos = index $key, $pattern; + + goto FOUND if $pos >= 0; + } + + if ( not $options->{ "keys_only" } ) + { + $pos = index $record->{ $key }, $pattern; + + goto FOUND if $pos >= 0; + } + } + } + } + } + elsif ( $options->{ "regex" } ) + { + if ( $options->{ "keys" } ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( $options->{ "case_insensitive" } ) { + $pos = 1 if $record->{ $key } =~ /$options->{'regex'}/i; + } else { + $pos = 1 if $record->{ $key } =~ /$options->{'regex'}/; + } + + goto FOUND if $pos >= 0; + } + } + else + { + foreach $key ( keys %{ $record } ) + { + if ( not $options->{ "vals_only" } ) + { + if ( $options->{ "case_insensitive" } ) { + $pos = 1 if $key =~ /$options->{'regex'}/i; + } else { + $pos = 1 if $key =~ /$options->{'regex'}/; + } + + goto FOUND if $pos >= 0; + } + + if ( not $options->{ "keys_only" } ) + { + if ( $options->{ "case_insensitive" } ) { + $pos = 1 if $record->{ $key } =~ /$options->{'regex'}/i; + } else { + $pos = 1 if $record->{ $key } 
=~ /$options->{'regex'}/;
                        }

                        goto FOUND if $pos >= 0;
                    }
                }
            }
        }
        elsif ( $options->{ "eval" } )
        {
            if ( defined $record->{ $key } )
            {
                if ( $op eq "<" and $record->{ $key } < $val ) {
                    $pos = 1 and goto FOUND;
                } elsif ( $op eq ">" and $record->{ $key } > $val ) {
                    $pos = 1 and goto FOUND;
                } elsif ( $op eq ">=" and $record->{ $key } >= $val ) {
                    $pos = 1 and goto FOUND;
                } elsif ( $op eq "<=" and $record->{ $key } <= $val ) {
                    $pos = 1 and goto FOUND;
                } elsif ( $op eq "=" and $record->{ $key } == $val ) {
                    $pos = 1 and goto FOUND;
                } elsif ( $op eq "!=" and $record->{ $key } != $val ) {
                    $pos = 1 and goto FOUND;
                } elsif ( $op eq "eq" and $record->{ $key } eq $val ) {
                    $pos = 1 and goto FOUND;
                } elsif ( $op eq "ne" and $record->{ $key } ne $val ) {
                    $pos = 1 and goto FOUND;
                }
            }
        }

        FOUND:

        if ( $pos >= 0 and not $options->{ "invert" } ) {
            &put_record( $record, $out );
        } elsif ( $pos < 0 and $options->{ "invert" } ) {
            &put_record( $record, $out );
        }
    }
}


sub script_compute
{
    # Martin A. Hansen, August 2007.

    # Evaluate an expression for each record in the stream.
    # The "eval" option has the form "KEY = EXPRESSION". Word tokens in
    # EXPRESSION that name a key of the current record are replaced by
    # that record's value, the result is evaluated as Perl, and the
    # outcome is stored under KEY.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $eval_key, $eval_expr, @tokens, %subst, $expr, $result );

    if ( $options->{ "eval" } )
    {
        # Non-greedy key capture: the key must not keep the whitespace
        # in front of '=', and the split happens at the first '='.

        if ( $options->{ "eval" } =~ /^\s*(.+?)\s*=\s*(.+)$/ )
        {
            $eval_key  = $1;
            $eval_expr = $2;
        }
        else
        {
            &Maasha::Common::error( qq(eval failed -> bad expression "$options->{ 'eval' }") );
        }

        # Candidate key names: every word token that is not a plain number.

        @tokens = grep { length $_ and not /^\d+$/ } split /\W+/, $eval_expr;
    }

    while ( $record = &get_record( $in ) )
    {
        if ( defined $eval_key )
        {
            %subst = map { $_ => $record->{ $_ } } grep { defined $record->{ $_ } } @tokens;

            # Single pass over whole words, so a key is never replaced
            # inside a longer word or inside an already substituted value.

            $expr = $eval_expr;
            $expr =~ s/\b(\w+)\b/exists $subst{ $1 } ? $subst{ $1 } : $1/ge;

            # NB string eval of a user supplied expression.

            $result = eval $expr;

            # Test $@ and not the result: 0 and "" are legitimate results.

            &Maasha::Common::error( "eval failed -> $@" ) if $@;

            $record->{ $eval_key } = $result;
        }

        &put_record( $record, $out );
    }
}


sub script_flip_tab
{
    # Martin A. Hansen, June 2008.

    # Flip a table: the values of each record form one row (ordered by
    # the number in keys named V<number>) and the flipped rows are
    # emitted as new records with keys V0, V1, ...

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key, $A, $B, @rows, @matrix, $row, $i );

    while ( $record = &get_record( $in ) )
    {
        undef @rows;

        foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } )
        {
            push @rows, $record->{ $key };

        }

        push @matrix, [ @rows ];
    }

    undef $record;

    @matrix = &Maasha::Matrix::matrix_flip( \@matrix );

    foreach $row ( @matrix )
    {
        for ( $i = 0; $i < @{ $row }; $i++ ) {
            $record->{ "V$i" } = $row->[ $i ];
        }

        &put_record( $record, $out );
    }
}


sub script_add_ident
{
    # Martin A. Hansen, May 2008.

    # Add a unique identifier to each record in stream.
    # The identifier is the "prefix" option (default "ID") followed by
    # a zero padded 8 digit counter starting at 0, and is stored under
    # the key given by the "key" option (default "ID").

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key, $prefix, $i );

    $key    = $options->{ "key" }    || "ID";
    $prefix = $options->{ "prefix" } || "ID";

    $i = 0;

    while ( $record = &get_record( $in ) )
    {
        # The prefix is kept out of the format string so that a '%' in
        # the prefix is not read as a conversion.

        $record->{ $key } = $prefix . sprintf( "%08d", $i );

        &put_record( $record, $out );

        $i++;
    }
}


sub script_count_records
{
    # Martin A. Hansen, August 2007.

    # Count records in stream.
    # With the "no_stream" option the record separator lines are counted
    # without parsing or re-emitting the records.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $count, $result, $fh, $line );

    $count = 0;

    if ( $options->{ "no_stream" } )
    {
        while ( $line = <$in> )
        {
            chomp $line;

            $count++ if $line eq "---";
        }
    }
    else
    {
        while ( $record = &get_record( $in ) )
        {
            &put_record( $record, $out );

            $count++;
        }
    }

    $result = { "count_records" => $count };

    $fh = &write_stream( $options->{ "data_out" } );

    &put_record( $result, $fh );

    close $fh;
}


sub script_random_records
{
    # Martin A. Hansen, August 2007.
# Pick a number of random records from stream.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $tmp_file, $fh_out, $fh_in, $count, $i, %rand_hash, $rand, $max );

    $options->{ "num" } ||= 10;

    $tmp_file = "$TMP_DIR/random_records.tmp";

    $fh_out = &Maasha::Common::write_open( $tmp_file );

    $count = 0;

    while ( $record = &get_record( $in ) )
    {
        &put_record( $record, $fh_out );

        $count++;
    }

    close $fh_out;

    $max = 0;
    $i   = 0;

    &Maasha::Common::error( qq(Requested random records > records in stream) ) if $options->{ "num" } > $count;

    while ( $i < $options->{ "num" } )
    {
        $rand = int( rand( $count ) );

        if ( not exists $rand_hash{ $rand } )
        {
            $rand_hash{ $rand } = 1;

            $max = $rand if $rand > $max;

            $i++;
        }
    }

    $fh_in = &Maasha::Common::read_open( $tmp_file );

    $count = 0;

    while ( $record = &get_record( $fh_in ) )
    {
        &put_record( $record, $out ) if exists $rand_hash{ $count };

        last if $count == $max;

        $count++;
    }

    close $fh_in;

    unlink $tmp_file;
}


sub script_sort_records
{
    # Martin A. Hansen, August 2007.

    # Sort records according to the keys in the "keys" option.
    # A key with a trailing 'n' is compared numerically on the key name
    # without the 'n'; any other key is compared as a string. Keys are
    # applied in order, later keys breaking ties of earlier ones.
    # The "reverse" option emits the sorted records in reverse order.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $key, $name, @sort_specs, @records, $record );

    foreach $key ( @{ $options->{ "keys" } } )
    {
        # Work on a copy: the loop variable aliases the caller's list,
        # so stripping the suffix in place would alter the options.

        $name = $key;

        if ( $name =~ s/n$// ) {
            push @sort_specs, [ $name, 1 ];
        } else {
            push @sort_specs, [ $name, 0 ];
        }
    }

    while ( $record = &get_record( $in ) ) {
        push @records, $record;
    }

    # The comparison is driven by the key specifications directly, so no
    # key name is ever compiled as Perl code with a string eval.

    @records = sort {
        my ( $spec, $cmp );

        $cmp = 0;

        foreach $spec ( @sort_specs )
        {
            if ( $spec->[ 1 ] ) {
                $cmp = $a->{ $spec->[ 0 ] } <=> $b->{ $spec->[ 0 ] };
            } else {
                $cmp = $a->{ $spec->[ 0 ] } cmp $b->{ $spec->[ 0 ] };
            }

            last if $cmp;
        }

        $cmp;
    } @records;

    @records = reverse @records if $options->{ "reverse" };

    foreach $record ( @records ) {
        &put_record( $record, $out );
    }
}


sub script_count_vals
{
    # Martin A. Hansen, August 2007.

    # Count the occurrences of the values of the given keys and add the
    # count to each record under the key <KEY>_COUNT.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $num, $record, %count_hash, @records, $tmp_file, $fh_out, $fh_in, $cache );

    $tmp_file = "$TMP_DIR/count_cache.tmp";

    $fh_out = &Maasha::Common::write_open( $tmp_file );

    $num = 0;

    while ( $record = &get_record( $in ) )
    {
        map { $count_hash{ $_ }{ $record->{ $_ } }++ if exists $record->{ $_ } } @{ $options->{ "keys" } };

        push @records, $record;

        if ( scalar @records > 5_000_000 )   # too many records to hold in memory - use disk cache
        {
            map { &put_record( $_, $fh_out ) } @records;

            undef @records;

            $cache = 1;
        }

        print STDERR "verbose: records read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 );

        $num++;
    }

    close $fh_out;

    if ( $cache )
    {
        $num = 0;

        $fh_in = &Maasha::Common::read_open( $tmp_file );

        while ( $record = &get_record( $fh_in ) )
        {
            map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } };

            &put_record( $record, $out );

            print STDERR "verbose: cache read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 );

            $num++;
        }

        close $fh_in;
    }

    foreach $record ( @records )
    {
        map { $record->{ $_ .
"_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } };

        &put_record( $record, $out );
    }

    unlink $tmp_file;
}


sub script_plot_histogram
{
    # Martin A. Hansen, September 2007.

    # Plot a simple histogram for a given key using GNU plot.
    # The values of the key are the bins; bins are ordered numerically
    # unless the "sort" option is set to something other than "num".

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $val, %data_hash, @data_list, $result, $fh );

    $options->{ "title" } ||= "Histogram";
    $options->{ "sort" }  ||= "num";

    while ( $record = &get_record( $in ) )
    {
        $val = $record->{ $options->{ "key" } };

        # Test for a present, non-empty value: a plain truth test would
        # throw away the value 0.

        $data_hash{ $val }++ if defined $val and length $val;

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    if ( $options->{ "sort" } eq "num" ) {
        @data_list = map { [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash;
    } else {
        @data_list = map { [ $_, $data_hash{ $_ } ] } sort keys %data_hash;
    }

    $result = &Maasha::Plot::histogram_simple( \@data_list, $options );

    $fh = &write_stream( $options->{ "data_out" } );

    print $fh "$_\n" foreach @{ $result };

    close $fh;
}


sub script_plot_lendist
{
    # Martin A. Hansen, August 2007.

    # Plot length distribution using GNU plot.
    # Every length from 0 up to and including the largest one seen gets
    # a bin; lengths that were not seen get the count 0.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $val, %data_hash, $max, @data_list, $i, $result, $fh );

    $options->{ "title" } ||= "Length Distribution";

    while ( $record = &get_record( $in ) )
    {
        $val = $record->{ $options->{ "key" } };

        $data_hash{ $val }++ if defined $val and length $val;

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    if ( %data_hash )
    {
        $max = &Maasha::Calc::list_max( [ keys %data_hash ] );

        # '<=' so that the bin of the largest length is included.

        for ( $i = 0; $i <= $max; $i++ ) {
            push @data_list, [ $i, $data_hash{ $i } || 0 ];
        }
    }

    $result = &Maasha::Plot::histogram_lendist( \@data_list, $options );

    $fh = &write_stream( $options->{ "data_out" } );

    print $fh "$_\n" foreach @{ $result };

    close $fh;
}


sub script_plot_chrdist
{
    # Martin A. Hansen, August 2007.

    # Plot chromosome distribution using GNU plot.
    # Records are counted per chromosome name, taken from CHR or, for
    # PATSCAN, PSL and BLAST records, from an S_ID starting with "chr".

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, %data_hash, @data_list, $elem, $sort_key, $count, $result, $fh );

    $options->{ "title" } ||= "Chromosome Distribution";

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "CHR" } ) {                                                             # generic
            $data_hash{ $record->{ "CHR" } }++;
        } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "S_ID" } =~ /^chr/i ) {   # patscan
            $data_hash{ $record->{ "S_ID" } }++;
        } elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) {       # BLAT / PSL
            $data_hash{ $record->{ "S_ID" } }++;
        } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) {     # BLAST
            $data_hash{ $record->{ "S_ID" } }++;
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    foreach $elem ( keys %data_hash )
    {
        # Build a numeric sort key: the "chr" prefix is dropped and the
        # lettered chromosomes are mapped to large numbers so that they
        # sort after the numbered ones.

        $sort_key = $elem;

        $sort_key =~ s/chr//i;

        $sort_key =~ s/^X(.*)/99$1/;
        $sort_key =~ s/^Y(.*)/99$1/;
        $sort_key =~ s/^Z(.*)/999$1/;
        $sort_key =~ s/^M(.*)/9999$1/;
        $sort_key =~ s/^U(.*)/99999$1/;

        $count = $sort_key =~ tr/_//;

        $sort_key =~ s/_.*/"999999" x $count/ex;

        push @data_list, [ $elem, $data_hash{ $elem }, $sort_key ];
    }

    @data_list = sort { $a->[ 2 ] <=> $b->[ 2 ] } @data_list;

    $result = &Maasha::Plot::histogram_chrdist( \@data_list, $options );

    $fh = &write_stream( $options->{ "data_out" } );

    print $fh "$_\n" foreach @{ $result };

    close $fh;
}


sub script_plot_karyogram
{
    # Martin A. Hansen, August 2007.

    # Plot hits on karyogram.
    # Records with CHR, CHR_BEG and CHR_END are collected per chromosome
    # as [ begin, end, color ] and handed to Maasha::Plot::karyogram.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $fh, $result, %data_hash );

    $options->{ "genome" }     ||= "human";
    $options->{ "feat_color" } ||= "black";

    while ( $record = &get_record( $in ) )
    {
        if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
        {
            push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ];
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    # Pass the caller's options hash ref, which carries the defaults set
    # above, and not a reference to an empty local hash.

    $result = &Maasha::Plot::karyogram( \%data_hash, $options );

    $fh = &write_stream( $options->{ "data_out" } );

    print $fh $result;

    close $fh;
}


sub script_plot_matches
{
    # Martin A. Hansen, August 2007.

    # Plot matches in 2D generating a dotplot.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.
my ( $record, @data, $fh, $result, %data_hash );

    $options->{ "direction" } ||= "both";

    while ( $record = &get_record( $in ) )
    {
        if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) {
            push @data, $record;
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    $options->{ "title" }  ||= "plot_matches";
    $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" };
    $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" };

    $result = &Maasha::Plot::dotplot_matches( \@data, $options, $TMP_DIR );

    $fh = &write_stream( $options->{ "data_out" } );

    print $fh "$_\n" foreach @{ $result };

    close $fh;
}


sub script_length_vals
{
    # Martin A. Hansen, August 2007.

    # Determine the length of the value for given keys.
    # The length is stored in the record under the key <KEY>_LEN.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key );

    while ( $record = &get_record( $in ) )
    {
        foreach $key ( @{ $options->{ "keys" } } )
        {
            # defined and not a truth test: the value "0" has length 1.

            if ( defined $record->{ $key } ) {
                $record->{ $key . "_LEN" } = length $record->{ $key };
            }
        }

        &put_record( $record, $out );
    }
}


sub script_sum_vals
{
    # Martin A. Hansen, August 2007.

    # Calculates the sums for values of given keys.
    # One record per key, <KEY>_SUM, is written to the "data_out" stream.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key, %sum_hash, $fh );

    while ( $record = &get_record( $in ) )
    {
        foreach $key ( @{ $options->{ "keys" } } )
        {
            if ( $record->{ $key } ) {
                $sum_hash{ $key } += $record->{ $key };
            }
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    $fh = &write_stream( $options->{ "data_out" } );

    foreach $key ( @{ $options->{ "keys" } } ) {
        &put_record( { $key . "_SUM" => $sum_hash{ $key } || 0 } , $fh );
    }

    close $fh;
}


sub script_mean_vals
{
    # Martin A. Hansen, August 2007.

    # Calculate the mean of values of given keys.
    # One record per key, <KEY>_MEAN, is written to the "data_out"
    # stream; the mean is "N/A" when no record has a value for the key.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key, %sum_hash, %count_hash, $mean, $fh );

    while ( $record = &get_record( $in ) )
    {
        foreach $key ( @{ $options->{ "keys" } } )
        {
            # A value of 0 must be counted, otherwise the mean is
            # taken over the non-zero values only.

            if ( defined $record->{ $key } and length $record->{ $key } )
            {
                $sum_hash{ $key } += $record->{ $key };
                $count_hash{ $key }++;
            }
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    $fh = &write_stream( $options->{ "data_out" } );

    foreach $key ( @{ $options->{ "keys" } } )
    {
        if ( $count_hash{ $key } ) {
            $mean = sprintf( "%.2f", ( $sum_hash{ $key } / $count_hash{ $key } ) );
        } else {
            $mean = "N/A";
        }

        &put_record( { $key . "_MEAN" => $mean } , $fh );
    }

    close $fh;
}


sub script_median_vals
{
    # Martin A. Hansen, March 2008.

    # Calculate the median values of given keys.
    # One record per key, <KEY>_MEDIAN, is written to the "data_out"
    # stream; the median is "N/A" when no record has the key.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key, %median_hash, $median, $fh );

    while ( $record = &get_record( $in ) )
    {
        foreach $key ( @{ $options->{ "keys" } } ) {
            push @{ $median_hash{ $key } }, $record->{ $key } if defined $record->{ $key };
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    $fh = &write_stream( $options->{ "data_out" } );

    foreach $key ( @{ $options->{ "keys" } } )
    {
        if ( $median_hash{ $key } ) {
            $median = &Maasha::Calc::median( $median_hash{ $key } );
        } else {
            $median = "N/A";
        }

        &put_record( { $key . "_MEDIAN" => $median } , $fh );
    }

    close $fh;
}


sub script_max_vals
{
    # Martin A. Hansen, February 2008.

    # Determine the maximum values of given keys.
    # A single record holding <KEY>_MAX for every key is written to the
    # "data_out" stream.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key, $fh, %max_hash, $max_record );

    while ( $record = &get_record( $in ) )
    {
        foreach $key ( @{ $options->{ "keys" } } )
        {
            if ( defined $record->{ $key } )
            {
                # The first value seen seeds the maximum. Comparing
                # against an unset maximum would treat it as 0 and lose
                # every negative value.

                if ( exists $max_hash{ $key } ) {
                    $max_hash{ $key } = $record->{ $key } if $record->{ $key } > $max_hash{ $key };
                } else {
                    $max_hash{ $key } = $record->{ $key };
                }
            }
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    $fh = &write_stream( $options->{ "data_out" } );

    foreach $key ( @{ $options->{ "keys" } } )
    {
        $max_record->{ $key . "_MAX" } = $max_hash{ $key };
    }

    &put_record( $max_record, $fh );

    close $fh;
}


sub script_min_vals
{
    # Martin A. Hansen, February 2008.

    # Determine the minimum values of given keys.
    # A single record holding <KEY>_MIN for every key is written to the
    # "data_out" stream.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.

    my ( $record, $key, $fh, %min_hash, $min_record );

    while ( $record = &get_record( $in ) )
    {
        foreach $key ( @{ $options->{ "keys" } } )
        {
            if ( defined $record->{ $key } )
            {
                if ( exists $min_hash{ $key } ) {
                    $min_hash{ $key } = $record->{ $key } if $record->{ $key } < $min_hash{ $key };
                } else {
                    $min_hash{ $key } = $record->{ $key };
                }
            }
        }

        &put_record( $record, $out ) if not $options->{ "no_stream" };
    }

    $fh = &write_stream( $options->{ "data_out" } );

    foreach $key ( @{ $options->{ "keys" } } )
    {
        $min_record->{ $key . "_MIN" } = $min_hash{ $key };
    }

    &put_record( $min_record, $fh );

    close $fh;
}


sub script_upload_to_ucsc
{
    # Martin A. Hansen, August 2007.

    # Upload the records in the stream to a table in a UCSC database
    # in BED, PSL or wiggle format.

    my ( $in,        # handle to in stream
         $out,       # handle to out stream
         $options,   # options hash
       ) = @_;

    # Returns nothing.
my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_in, $fh_out, $i, $first, $format, $args, $type, $columns, $append, %fh_hash,
         $chr, $beg, $end, $block, $line, $max, $beg_block, $entry, $q_id, $clones );

    $options->{ "short_label" } ||= $options->{ 'table' };
    $options->{ "long_label" }  ||= $options->{ 'table' };
    $options->{ "group" }       ||= $ENV{ "LOGNAME" };
    $options->{ "priority" }    ||= 1;
    $options->{ "visibility" }  ||= "pack";
    $options->{ "color" }       ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) );
    $options->{ "chunk_size" }  ||= 10_000_000_000;    # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go.

    $file = "$TMP_DIR/ucsc_upload.tmp";

    $append = 0;

    $first = 1;

    $i = 0;

    if ( $options->{ 'wiggle' } )
    {
        $options->{ "visibility" } = "full";

        # Pass 1: write the records as 5 column BED entries to one
        # temporary file per chromosome.

        while ( $record = &get_record( $in ) )
        {
            &put_record( $record, $out ) if not $options->{ "no_stream" };

            $record->{ "CHR" }     = $record->{ "S_ID" }  if not defined $record->{ "CHR" };
            $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" };
            $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" };

            $fh_hash{ $record->{ "CHR" } } = &Maasha::Common::write_open( "$TMP_DIR/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } };

            $fh_out = $fh_hash{ $record->{ "CHR" } };

            &Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 );
        }

        # Close the file handles, which are the values of the hash (the
        # keys are the chromosome names), so that the files are closed
        # before bedSort reads them.

        close $_ foreach values %fh_hash;

        $fh_out = &Maasha::Common::write_open( $file );

        # Pass 2: per chromosome, sort the entries and merge overlapping
        # ones into blocks of per-position counts written as fixedStep.

        foreach $chr ( sort keys %fh_hash )
        {
            &Maasha::Common::run( "bedSort", "$TMP_DIR/$chr $TMP_DIR/$chr" );

            $fh_in = &Maasha::Common::read_open( "$TMP_DIR/$chr" );

            undef $block;

            while ( $entry = &Maasha::UCSC::bed_get_entry( $fh_in, 5 ) )
            {
                $chr  = $entry->{ 'CHR' };
                $beg  = $entry->{ 'CHR_BEG' };
                $end  = $entry->{ 'CHR_END' };
                $q_id = $entry->{ 'Q_ID' };

                # A numeric suffix _<number> on Q_ID is used as the
                # weight of the entry; without it the weight is 1.

                if ( $q_id =~ /_(\d+)$/ ) {
                    $clones = $1;
                } else {
                    $clones = 1;
                }

                if ( $block )
                {
                    if ( $beg > $max )
                    {
                        &Maasha::UCSC::fixedstep_put_entry( $chr, $beg_block, $block, $fh_out );
                        undef $block;
                    }
                    else
                    {
                        for ( $i = $beg - $beg_block; $i < ( $beg - $beg_block ) + ( $end - $beg ); $i++ ) {
                            $block->[ $i ] += $clones;
                        }

                        $max = &Maasha::Calc::max( $max, $end );
                    }
                }

                if ( not $block )
                {
                    $beg_block = $beg;
                    $max       = $end;

                    for ( $i = 0; $i < ( $end - $beg ); $i++ ) {
                        $block->[ $i ] += $clones;
                    }
                }
            }

            close $fh_in;

            &Maasha::UCSC::fixedstep_put_entry( $chr, $beg_block, $block, $fh_out );

            unlink "$TMP_DIR/$chr";
        }

        close $fh_out;

        $wig_file = "$options->{ 'table' }.wig";
        $wib_file = "$options->{ 'table' }.wib";

        $wib_dir  = "$ENV{ 'DATA_DIR' }/genomes/$options->{ 'database' }/wib";

        &Maasha::Common::dir_create_if_not_exists( $wib_dir );

        # &Maasha::Common::run( "wigEncode", "$file $wig_file $wib_file > /dev/null 2>&1" );

        `cd $TMP_DIR && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`;
        &Maasha::Common::run( "mv", "$TMP_DIR/$wib_file $wib_dir" );

        unlink $file;

        $file = $wig_file;

        $format = "WIGGLE";
    }
    else
    {
        $fh_out = &Maasha::Common::write_open( $file );

        while ( $record = &get_record( $in ) )
        {
            &put_record( $record, $out ) if not $options->{ "no_stream" };

            if ( $record->{ "REC_TYPE" } eq "PSL" )
            {
                &Maasha::UCSC::psl_put_header( $fh_out ) if $first;
                &Maasha::UCSC::psl_put_entry( $record, $fh_out );

                $first = 0;

                $format = "PSL" if not $format;
            }
            elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } )
            {
                # chrom chromStart chromEnd name score strand size secStr conf

                print $fh_out join ( "\t",
                    $record->{ "CHR" },
                    $record->{ "CHR_BEG" },
                    $record->{ "CHR_END" } + 1,
                    $record->{ "Q_ID" },
                    $record->{ "SCORE" },
                    $record->{ "STRAND" },
                    $record->{ "SIZE" },
                    $record->{ "SEC_STRUCT" },
                    $record->{ "CONF" },
                ), "\n";

                $format = "BED_SS" if not $format;
            }
            elsif ( $record->{ "REC_TYPE" } eq "BED" )
            {
                &Maasha::UCSC::bed_put_entry( $record, $fh_out, $record->{ "BED_COLS" } );

                $format  = "BED"                     if not $format;
                $columns = $record->{ "BED_COLS" }   if not $columns;
            }
            elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } )
            {
                &Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 );

                $format  = "BED" if not $format;
                $columns = 6     if not $columns;
            }
            elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ )
            {
                $record->{ "CHR" }     = $record->{ "S_ID" };
                $record->{ "CHR_BEG" } = $record->{ "S_BEG" };
                $record->{ "CHR_END" } = $record->{ "S_END" };
                $record->{ "SCORE" }   = $record->{ "BIT_SCORE" } * 1000;

                $format  = "BED" if not $format;
                $columns = 6     if not $columns;

                &Maasha::UCSC::bed_put_entry( $record, $fh_out );
            }
            elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i )
            {
                $record->{ "CHR" }     = $record->{ "S_ID" };
                $record->{ "CHR_BEG" } = $record->{ "S_BEG" };
                $record->{ "CHR_END" } = $record->{ "S_END" };
                $record->{ "SCORE" }   = $record->{ "SCORE" } || 999;
                $record->{ "SCORE" }   = int( $record->{ "SCORE" } );

                $format  = "BED" if not $format;
                $columns = 6     if not $columns;

                &Maasha::UCSC::bed_put_entry( $record, $fh_out, 6 );
            }

            # After chunk_size records the file is uploaded and a new
            # one is started; later uploads append to the table.

            if ( $i == $options->{ "chunk_size" } )
            {
                close $fh_out;

                if ( $format eq "BED" ) {
                    &Maasha::UCSC::bed_upload_to_ucsc( $TMP_DIR, $file, $options, $append );
                } elsif ( $format eq "PSL" ) {
                    &Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
                }

                unlink $file;

                $first = 1;

                $append = 1;

                $fh_out = &Maasha::Common::write_open( $file );
            }

            $i++;
        }
    }

    close $fh_out;

    if ( exists $options->{ "database" } and $options->{ "table" } )
    {
        if ( $format eq "BED" )
        {
            $type = "bed $columns";

            &Maasha::UCSC::bed_upload_to_ucsc( $TMP_DIR, $file, $options, $append );
        }
        elsif ( $format eq "BED_SS" )
        {
            $options->{ "sec_struct" } = 1;

$type = "sec_struct";

            &Maasha::UCSC::bed_upload_to_ucsc( $TMP_DIR, $file, $options, $append );
        }
        elsif ( $format eq "PSL" )
        {
            $type = "psl";

            &Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append );
        }
        elsif ( $format eq "WIGGLE" )
        {
            $type = "wig 0";

            &Maasha::UCSC::wiggle_upload_to_ucsc( $TMP_DIR, $wib_dir, $file, $options );
        }

        unlink $file;

        &Maasha::UCSC::update_my_tracks( $options, $type );
    }
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


sub read_stream
{
    # Martin A. Hansen, July 2007.

    # Opens a stream to STDIN or a file.
    # STDIN is used whenever it is not a terminal; otherwise the given
    # path is opened. Without either, no handle is opened.

    my ( $path,   # path - OPTIONAL
       ) = @_;

    # Returns filehandle.

    my ( $fh );

    if ( not -t STDIN ) {
        $fh = &Maasha::Common::read_stdin();
    } elsif ( not $path ) {
#        &Maasha::Common::error( qq(no data stream) );
    } else {
        $fh = &Maasha::Common::read_open( $path );
    }

#    $fh->autoflush(1) if $fh;

    return $fh;
}


sub write_stream
{
    # Martin A. Hansen, August 2007.

    # Opens a stream to STDOUT or a file.

    my ( $path,   # path          - OPTIONAL
         $gzip,   # compress data - OPTIONAL
       ) = @_;

    # Returns filehandle.

    my ( $fh );

    if ( $path ) {
        $fh = &Maasha::Common::write_open( $path, $gzip );
    } else {
        $fh = &Maasha::Common::write_stdout();
    }

    return $fh;
}


sub get_record
{
    # Martin A. Hansen, July 2007.

    # Reads one record at a time and converts that record
    # to a Perl data structure (a hash) which is returned.
    # A record is a block of "key: value" lines terminated by a line
    # holding "---".

    my ( $fh,
       ) = @_;

    # Returns data structure, or nothing at the end of the stream.

    my ( $block, @lines, $line, $key, $value, %record );

    local $/ = "\n---\n";

    $block = <$fh>;

    # Test for end of stream before touching the block.

    return if not defined $block;

    chomp $block;

    @lines = split "\n", $block;

    foreach $line ( @lines )
    {
        # Split on the first ": " only, so that a value which itself
        # contains ": " is kept whole.

        ( $key, $value ) = split ": ", $line, 2;

        $record{ $key } = $value;
    }

    return wantarray ? %record : \%record;
}


sub put_record
{
    # Martin A. Hansen, July 2007.

    # Given a Perl datastructure (a hash ref) emits this to STDOUT or a filehandle.
    # Each key/value pair is written as a "key: value" line followed by
    # a terminating "---" line. An empty hash emits nothing.

    my ( $data,   # data structure
         $fh,     # file handle - OPTIONAL
       ) = @_;

    # Returns nothing.

    if ( scalar keys %{ $data } )
    {
        if ( $fh )
        {
            map { print $fh "$_: $data->{ $_ }\n" } keys %{ $data };
            print $fh "---\n";
        }
        else
        {
            map { print "$_: $data->{ $_ }\n" } keys %{ $data };
            print "---\n";
        }
    }

    undef $data;
}


sub getopt_files
{
    # Martin A. Hansen, November 2007.

    # Extracts files from an explicit GetOpt::Long argument
    # allowing for the use of glob. E.g.
    # --data_in=test.fna
    # --data_in=test.fna,test2.fna
    # --data_in=*.fna
    # --data_in=test.fna,/dir/*.fna

    my ( $option,   # option from GetOpt::Long
       ) = @_;

    # Returns a list.

    my ( $elem, @files );

    foreach $elem ( split ",", $option )
    {
        if ( -f $elem ) {
            push @files, $elem;
        } elsif ( $elem =~ /\*/ ) {
            push @files, glob( $elem );
        }
    }

    return wantarray ? @files : \@files;
}


sub sig_handler
{
    # Martin A. Hansen, April 2008.

    # Removes temporary directory and exits gracefully.
    # This subroutine is meant to be run always as the last
    # thing even if a script dies or is interrupted
    # or killed.

    my ( $sig,   # signal from the %SIG
       ) = @_;

    # print STDERR "signal->$sig<-\n";

    chomp $sig;

    sleep 1;

    if ( -d $TMP_DIR )
    {
        if ( $sig =~ /MAASHA_ERROR/ ) {
            print STDERR "\nProgram '$script' had an error" . " - Please wait for temporary data to be removed\n";
        } elsif ( $sig eq "INT" ) {
            print STDERR "\nProgram '$script' interrupted (ctrl-c was pressed)" . " - Please wait for temporary data to be removed\n";
        } elsif ( $sig eq "TERM" ) {
            print STDERR "\nProgram '$script' terminated (someone used kill?)" . " - Please wait for temporary data to be removed\n";
        } else {
            print STDERR "\nProgram '$script' died->$sig" .
" - Please wait for temporary data to be removed\n"; + } + + # This is a really bad solution, potentially, anyone can include this module and set + # the TMP_DIR to point at any dir and thus take out the machine !!! + + &Maasha::Common::dir_remove( $TMP_DIR ); + } + + exit( 0 ); +} + + +END +{ + # This is a really bad solution, potentially, anyone can include this module and set + # the TMP_DIR to point at any dir and thus take out the machine !!! + + &Maasha::Common::dir_remove( $TMP_DIR ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +1; + +__END__ + + +sub script_read_soft +{ + # Martin A. Hansen, December 2007. + + # Read soft format. + # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $data_in, $file, $num, $records, $record ); + + while ( $record = &get_record( $in ) ) { + &put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $records = &Maasha::NCBI::soft_parse( $file ); + + foreach $record ( @{ $records } ) + { + &put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + } + + NUM: + + close $data_in if $data_in; +} diff --git a/code_perl/Maasha/Blast.pm b/code_perl/Maasha/Blast.pm new file mode 100644 index 0000000..5b8647b --- /dev/null +++ b/code_perl/Maasha/Blast.pm @@ -0,0 +1,370 @@ +package Maasha::Blast; + +# Copyright (C) 2007 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines to run BLAST and parse results. + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; +use Storable qw( dclone ); +use Data::Dumper; +use vars qw( @ISA @EXPORT ); + +@ISA = qw( Exporter ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# The blast report structured like this: +# +# for the first entry: +# +# 1 - blast program name +# 2 - blast reference +# 3 - query sequence name and length +# 4 - subject database +# 5 - sequences producing significant alignments +# 6 - one or more HSP for each subject sequence +# 7 - blast statistics +# +# for subsequent entries: +# +# 3 - query sequence name and length +# 5 - sequences producing significant alignments +# 6 - one or more HSP for each subject sequence +# 7 - blast statistics +# +# ________________________ +# +# info +# query +# parems +# subject +# hit1 +# hsp1 +# hsp2 +# hsp3 +# hit2 +# hsp1 +# hit3 +# hsp1 +# hsp2 +# stats +# ________________________ + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub xml_parse_blast +{ + # Martin A. Hansen, March 2007. + + # determines if the results is from ncbi blast or blastcl3 + # and parses the results in accordance. 
+ + my ( $fh, + ) = @_; + + # returns list + + my ( $results, $line,$doctype ); + + while ( $line = <$fh> ) + { + chomp $line; + + if ( $line =~ /^' ) + { + print STDERR qq(Parsing blastcl3 results ...\n); + $results = &xml_parse_blast_blastcl3( $fh ); + } + elsif ( $doctype eq '' ) + { + print STDERR qq(Parsing NCBI blast results ...\n); + $results = &xml_parse_blast_ncbi( $fh ); + } + else + { + die qq(ERROR: Could not determine doctype\n); + } + + return wantarray ? @{ $results } : $results; +} + + +sub xml_parse_blast_ncbi +{ + # Martin A. Hansen, February 2007. + + my ( $fh, + ) = @_; + + my ( $blast_record, $line, @blast_query, @blast_subject, $query, $subject, @results ); + + while ( $blast_record = &xml_get_blast_record( $fh ) and scalar @{ $blast_record } > 0 ) + { + foreach $line ( @{ $blast_record } ) + { + if ( $line =~ /||/ ) + { + push @blast_query, $line; + } + elsif ( @blast_query ) + { + push @blast_subject, $line; + + if ( $line =~ /<\/Iteration_hits>/ ) + { + $query = &xml_parse_blast_query( \@blast_query ); + $subject = &xml_parse_blast_subject( \@blast_subject ); + + push @results, { + "QUERY" => $query, + "SUBJECT" => $subject, + }; + + undef @blast_query; + undef @blast_subject; + } + } + } + } + + return wantarray ? @results : \@results; +} + + +sub xml_parse_blast_blastcl3 +{ + # Martin A. Hansen, February 2007. 
+
+    my ( $fh,
+       ) = @_;
+
+    my ( $blast_record, $line, @blast_query, @blast_subject, $query, $subject, @results );
+
+    while ( $blast_record = &xml_get_blast_record( $fh ) and scalar @{ $blast_record } > 0 )
+    {
+        foreach $line ( @{ $blast_record } )
+        {
+            if ( $line =~ /||/ )
+            {
+                push @blast_query, $line;
+            }
+            elsif ( @blast_query )
+            {
+                push @blast_subject, $line;
+
+                if ( $line =~ /<\/Iteration_hits>/ )
+                {
+                    $query   = &xml_parse_blast_query( \@blast_query );
+                    $subject = &xml_parse_blast_subject( \@blast_subject );
+
+                    push @results, {
+                        "QUERY"   => $query,
+                        "SUBJECT" => $subject,
+                    };
+
+                    undef @blast_query;
+                    undef @blast_subject;
+                }
+            }
+        }
+    }
+
+    return wantarray ? @results : \@results;
+}
+
+
+sub xml_get_blast_record
+{
+    # Martin A. Hansen, March 2007.
+
+    # Reads lines from the file handle up to and including the first line
+    # holding a closing BlastOutput tag, or to end of file.
+
+    my ( $fh,   # file handle to BLAST file in XML format
+       ) = @_;
+
+    # returns list of lines (array reference in scalar context)
+
+    my ( $line, @blast_record );
+
+    while ( $line = <$fh> )
+    {
+        chomp $line;
+
+        push @blast_record, $line;
+
+        last if $line =~ /<\/BlastOutput>/;
+    }
+
+    return wantarray ? @blast_record : \@blast_record;
+}
+
+
+sub xml_parse_blast_query
+{
+    # Collects the query fields Q_ID, Q_DEF and Q_LEN from a list of lines.
+
+    # NOTE(review): all three patterns below are identical in this copy, so
+    # only the first branch can ever match. The tag names that presumably
+    # told them apart look stripped - restore them from the repository.
+
+    my ( $lines,   # list of lines
+       ) = @_;
+
+    # returns hash (hash reference in scalar context)
+
+    my ( $line, %hash );
+
+    foreach $line ( @{ $lines } )
+    {
+        if ( $line =~ /([^<]+)/ ) {
+            $hash{ "Q_ID" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "Q_DEF" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "Q_LEN" } = $1;
+        }
+    }
+
+    return wantarray ? %hash : \%hash;
+}
+
+
+sub xml_parse_blast_subject
+{
+    # Martin A. Hansen, March 2007.
+
+    # Splits a list of lines into hit lines and HSP lines, and parses each
+    # hit together with its HSPS when a closing Hit_hsps tag is seen.
+
+    # NOTE(review): the alternation below is made of empty branches in this
+    # copy and therefore matches every line, which leaves the elsif branch
+    # unreachable. The tag names look stripped - restore from the repository.
+
+    my ( $lines,   # list of lines
+       ) = @_;
+
+    # returns list of hit hashes (array reference in scalar context)
+
+    my ( $line, @blast_hit, @blast_hsps, $hit, $hsps, @hits );
+
+    foreach $line ( @{ $lines } )
+    {
+        if ( $line =~ /||||/ )
+        {
+            push @blast_hit, $line;
+        }
+        elsif ( @blast_hit )
+        {
+            push @blast_hsps, $line;
+
+            if ( $line =~ /<\/Hit_hsps>/ )
+            {
+                $hit  = &xml_parse_blast_hit( \@blast_hit );
+                $hsps = &xml_parse_blast_hsps( \@blast_hsps );
+
+                $hit->{ "HSPS" } = $hsps;
+
+                push @hits, $hit;
+
+                undef @blast_hit;
+                undef @blast_hsps;
+            }
+        }
+    }
+
+    return wantarray ? @hits : \@hits;
+}
+
+
+sub xml_parse_blast_hit
+{
+    # Collects the hit fields S_NUM, S_ID, S_DEF, S_ACC and S_LEN from a
+    # list of lines.
+
+    # NOTE(review): the five patterns below are identical in this copy, so
+    # only S_NUM can ever be set - restore the patterns from the repository.
+
+    my ( $lines
+       ) = @_;
+
+    # returns hash (hash reference in scalar context)
+
+    my ( $line, %hash );
+
+    foreach $line ( @{ $lines } )
+    {
+        if ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_NUM" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_ID" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_DEF" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_ACC" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_LEN" } = $1;
+        }
+    }
+
+    return wantarray ? %hash : \%hash;
+}
+
+
+sub xml_parse_blast_hsps
+{
+    # Martin A. Hansen, March 2007.
+
+    # Collects HSP fields from a list of lines; a deep copy of the fields
+    # gathered so far is stored each time a closing Hsp tag is seen.
+    # The four begin/end coordinates are stored minus one.
+
+    # NOTE(review): the eleven field patterns below are identical in this
+    # copy, so only NUM can ever be set, and a closing Hsp line also matches
+    # that first pattern - restore the patterns from the repository.
+
+    my ( $blast_hits,   # list of lines
+       ) = @_;
+
+    # returns list of HSP hashes (array reference in scalar context)
+
+    my ( $line, %hash, @hsps );
+
+    foreach $line ( @{ $blast_hits } )
+    {
+        if ( $line =~ /([^<]+)/ ) {
+            $hash{ "NUM" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "E_VAL" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "Q_BEG" } = $1 - 1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "Q_END" } = $1 - 1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_BEG" } = $1 - 1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_END" } = $1 - 1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "Q_FRAME" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_FRAME" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "Q_ALIGN" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "S_ALIGN" } = $1;
+        } elsif ( $line =~ /([^<]+)/ ) {
+            $hash{ "MIDLINE" } = $1;
+        } elsif ( $line =~ /<\/Hsp>/ ) {
+            push @hsps, dclone \%hash;
+        }
+    }
+
+    return wantarray ? @hsps : \@hsps;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
diff --git a/code_perl/Maasha/Calc.pm b/code_perl/Maasha/Calc.pm
new file mode 100644
index 0000000..7906b52
--- /dev/null
+++ b/code_perl/Maasha/Calc.pm
@@ -0,0 +1,348 @@
+package Maasha::Calc;
+
+# Copyright (C) 2007 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# This module contains subroutines for simple algebra.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Data::Dumper;
+use Storable qw( dclone );
+use vars qw ( @ISA @EXPORT );
+use Exporter;
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub is_a_number
+{
+    # Identify if a string is a number or not.
+    # Taken from perldoc -q 'is a number'.
+
+    my ( $str,   # string to test
+       ) = @_;
+
+    # Returns boolean.
+
+    if ( $str =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ ) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+
+sub dist_point2line
+{
+    # Martin A. Hansen, June 2004.
+
+    # calculates the distance from a point to a line.
+    # the line is represented by a beg/end set of coordinates
+    # and is treated as infinite (slope/intercept form).
+
+    my ( $px,   # point x coordinate
+         $py,   # point y coordinate
+         $x1,   # line 1 x coordinate
+         $y1,   # line 1 y coordinate
+         $x2,   # line 2 x coordinate
+         $y2,   # line 2 y coordinate
+       ) = @_;
+
+    # returns float
+
+    my ( $dist, $a, $b );
+
+    if ( $x1 == $x2 )
+    {
+        # A vertical line has no finite slope, so the slope/intercept form
+        # below would divide by zero. If both end points coincide the
+        # "line" is a single point.
+
+        return &dist_point2point( $px, $py, $x1, $y1 ) if $y1 == $y2;
+
+        return abs( $px - $x1 );
+    }
+
+    $a = ( $y2 - $y1 ) / ( $x2 - $x1 );
+
+    $b = $y1 - $a * $x1;
+
+    $dist = abs( $a * $px + $b - $py ) / sqrt( $a ** 2 + 1 );
+
+    return $dist;
+}
+
+
+sub dist_point2point
+{
+    # Martin A. Hansen, April 2004.
+
+    # calculates the distance between two sets of coordinates
+
+    my ( $x1,
+         $y1,
+         $x2,
+         $y2,
+       ) = @_;
+
+    # returns float
+
+    my $dist;
+
+    $dist = sqrt( ( $x2 - $x1 ) ** 2 + ( $y2 - $y1 ) ** 2 );
+
+    return $dist;
+}
+
+
+sub dist_interval
+{
+    # Martin A. Hansen, February 2008.
+
+    # Returns the distance between two given intervals.
+    # 0 indicates that the intervals are overlapping.
+
+    my ( $beg1,
+         $end1,
+         $beg2,
+         $end2,
+       ) = @_;
+
+    # Returns number
+
+    if ( $beg2 > $end1 ) {
+        return $beg2 - $end1;
+    } elsif ( $beg1 > $end2 ) {
+        return $beg1 - $end2;
+    } else {
+        return 0;
+    }
+}
+
+
+sub mean
+{
+    # Martin A. Hansen, April 2007
+
+    # Given a list of numbers, calculates and returns the mean.
+    # Returns nothing (undef in scalar context) for an empty list
+    # instead of dying with a division by zero.
+
+    my ( $numbers,   # list of numbers
+       ) = @_;
+
+    # returns decimal number
+
+    my ( $num, $sum, $mean );
+
+    return if not @{ $numbers };
+
+    $sum = 0;
+
+    foreach $num ( @{ $numbers } ) {
+        $sum += $num;
+    }
+
+    $mean = $sum / @{ $numbers };
+
+    return $mean;
+}
+
+
+sub median
+{
+    # Martin A. Hansen, January 2008
+
+    # Given a list of numbers, calculates and returns the median.
+    # The given list is left untouched. Returns nothing (undef in
+    # scalar context) for an empty list.
+
+    my ( $numbers,   # list of numbers
+       ) = @_;
+
+    # returns decimal number
+
+    my ( @sorted, $num, $median );
+
+    # Sort a copy so the caller's list keeps its order.
+
+    @sorted = sort { $a <=> $b } @{ $numbers };
+
+    $num = scalar @sorted;
+
+    return if $num == 0;
+
+    if ( $num % 2 == 0 ) {
+        # With 0-based indices the two middle elements are num/2 - 1 and num/2.
+        $median = &mean( [ $sorted[ $num / 2 - 1 ], $sorted[ $num / 2 ] ] );
+    } else {
+        $median = $sorted[ int( $num / 2 ) ];
+    }
+
+    return $median;
+}
+
+
+sub min
+{
+    # Martin A. Hansen, August 2006.
+
+    # Return the smallest of two given numbers.
+
+    my ( $x,   # first number
+         $y,   # second number
+       ) = @_;
+
+    # Returns number
+
+    if ( $x <= $y ) {
+        return $x;
+    } else {
+        return $y;
+    }
+}
+
+sub max
+{
+    # Martin A. Hansen, November 2006.
+
+    # Return the largest of two given numbers.
+
+    my ( $x,   # first number
+         $y,   # second number
+       ) = @_;
+
+    # Returns number
+
+    if ( $x > $y ) {
+        return $x;
+    } else {
+        return $y;
+    }
+}
+
+
+sub minmax
+{
+    # Martin A. Hansen, April 2007.
+
+    # given a list of numbers returns a tuple with min and max
+
+    my ( $list,   # list of numbers
+       ) = @_;
+
+    # returns a tuple
+
+    my ( $num, $min, $max );
+
+    $min = $max = $list->[ 0 ];
+
+    foreach $num ( @{ $list } )
+    {
+        $min = $num if $num < $min;
+        $max = $num if $num > $max;
+    }
+
+    return wantarray ? ( $min, $max ) : [ $min, $max ];
+}
+
+
+sub list_max
+{
+    # Martin A. Hansen, August 2007.
+
+    # Returns the maximum number in a given list.
+
+    my ( $list,   # list of numbers
+       ) = @_;
+
+    # Returns float
+
+    my ( $max, $num );
+
+    $max = $list->[ 0 ];
+
+    foreach $num ( @{ $list } ) {
+        $max = $num if $num > $max;
+    }
+
+    return $max;
+}
+
+
+sub list_min
+{
+    # Martin A. Hansen, August 2007.
+
+    # Returns the minimum number in a given list.
+
+    my ( $list,   # list of numbers
+       ) = @_;
+
+    # Returns float
+
+    my ( $min, $num );
+
+    $min = $list->[ 0 ];
+
+    foreach $num ( @{ $list } ) {
+        $min = $num if $num < $min;
+    }
+
+    return $min;
+}
+
+
+sub sum
+{
+    # Martin A. Hansen, April 2007.
+
+    # Sums a list of given numbers and
+    # returns the sum (0 for an empty list).
+
+    my ( $list,   # list of numbers
+       ) = @_;
+
+    # returns float
+
+    my ( $num, $sum );
+
+    $sum = 0;
+
+    foreach $num ( @{ $list } ) {
+        $sum += $num;
+    }
+
+    return $sum;
+}
+
+
+sub overlap
+{
+    # Martin A. Hansen, November 2003.
+
+    # Tests if two intervals overlap
+    # returns 1 if overlapping else 0.
+    # Reversed intervals (beg > end) are normalized first.
+
+    my ( $beg1,
+         $end1,
+         $beg2,
+         $end2,
+       ) = @_;
+
+    # returns integer
+
+    if ( $beg1 > $end1 ) { ( $beg1, $end1 ) = ( $end1, $beg1 ) };
+    if ( $beg2 > $end2 ) { ( $beg2, $end2 ) = ( $end2, $beg2 ) };
+
+    if ( $end1 < $beg2 or $beg1 > $end2 ) {
+        return 0;
+    } else {
+        return 1;
+    }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
diff --git a/code_perl/Maasha/Common.pm b/code_perl/Maasha/Common.pm
new file mode 100644
index 0000000..b430484
--- /dev/null
+++ b/code_perl/Maasha/Common.pm
@@ -0,0 +1,705 @@
+package Maasha::Common;
+
+
+# Copyright (C) 2006-2007 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# This module contains commonly used routines
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Carp;
+use Data::Dumper;
+use Storable;
+use IO::File;
+use Maasha::Config;
+
+use Exporter;
+
+use vars qw( @ISA @EXPORT @EXPORT_OK );
+
+@ISA = qw( Exporter ) ;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub error
+{
+    # Martin A. Hansen, February 2008.
+
+    # Print error message and exit with stack trace.
+    # The trace is built from the output of Carp::longmess and printed
+    # to STDERR as a Routine/File/Line table before dying.
+
+    my ( $msg,        # Error message.
+         $no_stack,   # disable stack trace - OPTIONAL
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $script, $error, @lines, $line, $routine, $file, $line_no, @table, $routine_max, $file_max, $line_max );
+
+    chomp $msg;
+
+    $script = &get_scriptname();
+
+    $error  = &Carp::longmess();
+
+    @lines  = split "\n", $error;
+
+    $line   = shift @lines;
+
+    push @table, [ "Routine", "File", "Line" ];
+    push @table, [ "-------", "----", "----" ];
+
+    $routine_max = length "Routine";
+    $file_max    = length "File";
+    $line_max    = length "Line";
+
+    # Newer Carp releases end each message line with a period, older
+    # ones do not - accept both forms.
+
+    if ( $line =~ /^ at (.+) line (\d+)\.?$/ )
+    {
+        $file    = $1;
+        $line_no = $2;
+
+        $file_max = length $file    if length $file    > $file_max;
+        $line_max = length $line_no if length $line_no > $line_max;
+
+        push @table, [ "", $file, $line_no ];
+    }
+    else
+    {
+        die qq(ERROR: Unrecognized error line "$line"\n);
+    }
+
+    foreach $line ( @lines )
+    {
+        if ( $line =~ /^\s*(.+) called at (.+) line (\d+)\.?\s*$/ )
+        {
+            $routine = $1;
+            $file    = $2;
+            $line_no = $3;
+
+            $routine_max = length $routine if length $routine > $routine_max;
+            $file_max    = length $file    if length $file    > $file_max;
+            $line_max    = length $line_no if length $line_no > $line_max;
+
+            push @table, [ $routine, $file, $line_no ];
+        }
+        else
+        {
+            die qq(ERROR: Unrecognized error line "$line"\n);
+        }
+    }
+
+    $msg =~ s/\.$//;
+
+    print STDERR qq(\nERROR!\n\nProgram \'$script\' failed: $msg.\n\n);
+
+    die( "MAASHA_ERROR" ) if $no_stack;
+
+    $routine_max += 3;
+    $file_max    += 3;
+    $line_max    += 3;
+
+    foreach $line ( @table ) {
+        printf( STDERR "%-${routine_max}s%-${file_max}s%s\n", @{ $line } );
+    }
+
+    print STDERR "\n";
+
+    die( "MAASHA_ERROR" );
+}
+
+
+sub read_open
+{
+    # Martin A. Hansen, January 2004.
+
+    # read opens a file and returns a filehandle.
+    # Files starting with the gzip magic number are read through zcat.
+
+    my ( $path,   # full path to file
+       ) = @_;
+
+    # returns filehandle
+
+    my ( $fh, $probe, $magic, $is_gzip );
+
+    $path = "" if not defined $path;
+
+    # Look at the first two bytes instead of running file(1) through the
+    # shell, so paths holding spaces or shell metacharacters are safe.
+
+    if ( -f $path and open( $probe, "<", $path ) )
+    {
+        binmode $probe;
+
+        $is_gzip = 1 if ( read( $probe, $magic, 2 ) || 0 ) == 2 and $magic eq "\x1f\x8b";
+
+        close $probe;
+    }
+
+    if ( $is_gzip )
+    {
+        # List form of pipe open: zcat is started without a shell.
+
+        $fh = IO::File->new();
+
+        open( $fh, "-|", "zcat", $path ) or &Maasha::Common::error( qq(Could not read-open file "$path": $!) );
+    }
+    else
+    {
+        $fh = IO::File->new( $path, "r" ) or &Maasha::Common::error( qq(Could not read-open file "$path": $!) );
+    }
+
+    return $fh;
+}
+
+
+sub write_open
+{
+    # Martin A. Hansen, January 2004.
+
+    # write opens a file and returns a filehandle
+
+    my ( $path,   # full path to file
+         $gzip,   # flag if data is to be gzipped - OPTIONAL
+       ) = @_;
+
+    # returns filehandle
+
+    my ( $fh );
+
+    # NOTE(review): the gzip branch hands $path to the shell unquoted.
+
+    if ( $gzip ) {
+        $fh = IO::File->new( "|gzip -f>$path" ) or &Maasha::Common::error( qq(Could not write-open file "$path": $!) );
+    } else {
+        $fh = IO::File->new( $path, "w" ) or &Maasha::Common::error( qq(Could not write-open file "$path": $!) );
+    }
+
+    return $fh;
+}
+
+
+sub append_open
+{
+    # Martin A. Hansen, February 2006.
+
+    # append opens file and returns a filehandle
+
+    my ( $path,   # path to file
+       ) = @_;
+
+    # returns filehandle
+
+    my ( $fh );
+
+    $fh = IO::File->new( $path, "a" ) or &Maasha::Common::error( qq(Could not append-open file "$path": $!) );
+
+    return $fh;
+}
+
+
+sub pipe_open
+{
+    # Martin A. Hansen, January 2007.
+
+    # opens a pipe and returns a filehandle
+
+    my ( $fh );
+
+    $fh = IO::File->new( "-" ) or &Maasha::Common::error( qq(Could not open pipe: $!) );
+
+    return $fh;
+}
+
+
+sub read_stdin
+{
+    # Martin A. Hansen, July 2007.
+
+    # Returns a filehandle to STDIN
+
+    my ( $fh );
+
+    $fh = IO::File->new( "<&STDIN" ) or &Maasha::Common::error( qq(Could not read from STDIN: $!) );
+
+    return $fh;
+}
+
+
+sub write_stdout
+{
+    # Martin A. Hansen, July 2007.
+
+    # Returns a filehandle to STDOUT
+
+    my ( $fh );
+
+    $fh = IO::File->new( ">&STDOUT" ) or &Maasha::Common::error( qq(Could not write to STDOUT: $!) );
+
+    return $fh;
+}
+
+
+sub file_store
+{
+    # Martin A. Hansen, December 2004.
+
+    # writes a data structure to file.
+
+    my ( $path,   # full path to file
+         $data,   # data structure
+       ) = @_;
+
+    &Storable::store( $data, $path ) or &Maasha::Common::error( qq(Could not write-open file "$path": $!) );
+}
+
+
+sub file_retrieve
+{
+    # Martin A. Hansen, December 2004.
+
+    # retrieves a stored data structure.
+    # In scalar context the stored reference is returned as is. In list
+    # context a hash or an array is flattened; anything else is returned
+    # as the reference itself.
+
+    my ( $path,   # full path to data file
+       ) = @_;
+
+    my ( $data );
+
+    $data = &Storable::retrieve( $path ) or &Maasha::Common::error( qq(Could not read-open file "$path": $!) );
+
+    return $data if not wantarray;
+
+    if ( ref $data eq "HASH" ) {
+        return %{ $data };
+    } elsif ( ref $data eq "ARRAY" ) {
+        return @{ $data };
+    } else {
+        return $data;
+    }
+}
+
+
+sub dir_create
+{
+    # Martin A. Hansen, July 2007.
+
+    # Creates a directory. It is an error if the directory exists.
+
+    my ( $path,   # full path to dir
+       ) = @_;
+
+    # Returns nothing.
+
+    if ( -d $path ) {
+        &Maasha::Common::error( qq(Directory already exists "$path": $!) );
+    } else {
+        mkdir $path or &Maasha::Common::error( qq(Could not create directory "$path": $!) );
+    }
+}
+
+
+sub dir_create_if_not_exists
+{
+    # Martin A. Hansen, May 2008.
+
+    # Creates a directory if it does not already exist.
+
+    my ( $path,   # full path to dir
+       ) = @_;
+
+    # Returns nothing.
+
+    if ( not -d $path ) {
+        mkdir $path or &Maasha::Common::error( qq(Could not create directory "$path": $!) );
+    }
+}
+
+
+sub dir_remove
+{
+    # Martin A. Hansen, April 2008.
+
+    # Removes a directory recursively.
+
+    # NOTE(review): $path is interpolated into the argument string that
+    # run() passes to the shell - do not call with untrusted paths.
+
+    my ( $path,   # directory
+       ) = @_;
+
+    &Maasha::Common::run( "rm", "-rf $path" ) if -d $path;
+}
+
+
+sub ls_dirs
+{
+    # Martin A. Hansen, June 2007.
+
+    # returns all dirs in a given directory.
+
+    my ( $path,   # full path to directory
+       ) = @_;
+
+    # returns a list of directory names prefixed with the given path.
+
+    my ( $dh, @dirs );
+
+    $dh = &open_dir( $path );
+
+    @dirs = &read_dir( $dh );
+    @dirs = grep { -d "$path/$_" } @dirs;
+    @dirs = map { "$path/$_" } @dirs;
+
+    # $dh is an IO::Dir object, so it is closed with its own method;
+    # the builtin close only acts on file handles.
+
+    $dh->close;
+
+    return wantarray ? @dirs : \@dirs;
+}
+
+
+sub ls_files
+{
+    # Martin A. Hansen, June 2007.
+
+    # returns all files in a given directory.
+
+    my ( $path,   # full path to directory
+       ) = @_;
+
+    # returns a list of filenames prefixed with the given path.
+
+    my ( $dh, @files );
+
+    $dh = &open_dir( $path );
+
+    @files = &read_dir( $dh );
+    @files = grep { -f "$path/$_" } @files;
+    @files = map { "$path/$_" } @files;
+
+    # $dh is an IO::Dir object, so it is closed with its own method;
+    # the builtin close only acts on file handles.
+
+    $dh->close;
+
+    return wantarray ? @files : \@files;
+}
+
+
+sub open_dir
+{
+    # Martin A. Hansen, June 2007.
+
+    # open a directory and returns a directory handle
+
+    use IO::Dir;
+
+    my ( $path,   # full path to directory
+       ) = @_;
+
+    # returns object
+
+    my $dh;
+
+    $dh = IO::Dir->new( $path ) or &Maasha::Common::error( qq(Could not open dir "$path": $!) );
+
+    return $dh;
+}
+
+
+sub read_dir
+{
+    # Martin A. Hansen, June 2007.
+
+    # read all files and directories from a directory.
+
+    my ( $dh,   # directory handle object
+       ) = @_;
+
+    # returns list
+
+    my ( $elem, @elems );
+
+    while ( defined( $elem = $dh->read ) ) {
+        push @elems, $elem;
+    }
+
+    return wantarray ? @elems : \@elems;
+}
+
+
+sub read_args
+{
+    # Martin A. Hansen, December 2006
+
+    # reads arguments from @ARGV which is strictly formatted.
+    # three kinds of arguments are accepted:
+    # 1) file names           [filename]
+    # 2) options with value   [--option=value]
+    # 3) option without value [--option]
+
+    my ( $args,      # list of arguments
+         $ok_args,   # list of accepted arguments - OPTIONAL
+       ) = @_;
+
+    # returns a hashref
+
+    my ( %ok_hash, $arg, $key, @dirs, @files, %hash );
+
+    foreach $arg ( @{ $args } )
+    {
+        if ( $arg =~ /^--([^=]+)=(.+)$/ ) {
+            $hash{ $1 } = $2;
+        } elsif ( $arg =~ /^--(.+)$/ ) {
+            $hash{ $1 } = 1;
+        } elsif ( -d $arg ) {
+            push @dirs, $arg;
+        } elsif ( -f $arg ) {
+            push @files, $arg;
+        } else {
+            &Maasha::Common::error( qq(Bad syntax in argument->"$arg") );
+        }
+    }
+
+    $hash{ "DIRS" }  = \@dirs;
+    $hash{ "FILES" } = \@files;
+
+    if ( $ok_args )
+    {
+        foreach $key ( @{ $ok_args } ) {
+            $ok_hash{ $key } = 1;
+        }
+
+        $ok_hash{ "DIRS" }  = 1;
+        $ok_hash{ "FILES" } = 1;
+
+        foreach $key ( keys %hash ) {
+            &Maasha::Common::error( qq(Unknown argument->"$key") ) if not exists $ok_hash{ $key };
+        }
+    }
+
+    return wantarray ? %hash : \%hash;
+}
+
+
+sub get_sessionid
+{
+    # Martin A. Hansen, April 2008.
+
+    # Create a session id based on time and pid.
+
+    # Returns a number
+
+    return time . $$;
+}
+
+
+sub get_tmpdir
+{
+    # Martin A. Hansen, April 2008.
+
+    # Create a temporary directory based on
+    # $ENV{ 'TMP_DIR' } and sessionid.
+
+    # Returns a path.
+
+    my ( $user, $sid, $path );
+
+    &Maasha::Common::error( qq(no TMP_DIR set in %ENV) ) if not -d $ENV{ 'TMP_DIR' };
+
+    $user = $ENV{ 'USER' };
+    $user =~ s/\.//g;
+
+    $sid = &Maasha::Common::get_sessionid();
+
+    $path = "$ENV{ 'TMP_DIR' }/$user\_$sid";
+
+    &Maasha::Common::dir_create( $path );
+
+    return $path;
+}
+
+
+sub get_scriptname
+{
+    # Martin A. Hansen, February 2007
+
+    # returns the script name
+
+    return ( split "/", $0 )[ -1 ];
+}
+
+
+sub get_basename
+{
+    # Martin A. Hansen, February 2007
+
+    # Given a full path to a file returns the basename,
+    # which is the part of the name before the last '.'.
+    # Names without a '.', and names whose only '.' is the
+    # first character, are returned unchanged.
+
+    my ( $path,   # full path to filename
+       ) = @_;
+
+    my ( $basename );
+
+    $basename = ( split "/", $path )[ -1 ];
+
+    # Strip the last '.' and what follows it. The previous pattern
+    # (.+)\.?.* let the greedy .+ swallow the whole name, so nothing
+    # was ever removed.
+
+    $basename =~ s/(.+)\.[^.]*$/$1/;
+
+    return $basename;
+}
+
+
+sub file_read
+{
+    # Martin A. Hansen, December 2004.
+
+    # given a file, a seek beg position and
+    # length, returns the corresponding string.
+
+    my ( $fh,    # file handle to file
+         $beg,   # read start in file
+         $len,   # read length of block
+       ) = @_;
+
+    # returns string
+
+    my ( $string );
+
+    &Maasha::Common::error( qq(Negative length: $len) ) if $len < 0;
+
+    sysseek $fh, $beg, 0;
+    sysread $fh, $string, $len;
+
+    return $string;
+}
+
+
+sub file_size
+{
+    # Martin A. Hansen, March 2007
+
+    # returns the file size for a given file
+
+    my ( $path,   # full path to file
+       ) = @_;
+
+    # returns integer
+
+    my $file_size = ( stat ( $path ) )[ 7 ];
+
+    return $file_size;
+}
+
+
+sub run
+{
+    # Martin A. Hansen, April 2007.
+
+    # Run an executable with optional arguments.
+    # The executable is located with Maasha::Config::get_exe and the
+    # command line is run through system().
+
+    my ( $exe,    # executable to run
+         $args,   # argument string
+         $nice,   # nice flag
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $command_line );
+
+    $command_line  = &Maasha::Config::get_exe( $exe );
+    $command_line .= " " . $args if $args;
+    $command_line  = "nice -n19 " . $command_line if $nice;
+
+    system( $command_line ) == 0 or &Maasha::Common::error( qq(Could not execute "$command_line": $?) );
+}
+
+
+sub run_and_return
+{
+    # Martin A. Hansen, April 2008.
+
+    # Run an executable with optional arguments returning the output
+    # as a list.
+
+    my ( $exe,    # executable to run
+         $args,   # argument string
+         $nice,   # nice flag
+       ) = @_;
+
+    # Returns a list.
+
+    my ( $command_line, @result );
+
+    $command_line  = &Maasha::Config::get_exe( $exe );
+    $command_line .= " " . $args if $args;
+    $command_line  = "nice -n19 " . $command_line if $nice;
+
+    @result = `$command_line`;
+
+    chomp @result;
+
+    return wantarray ? @result : \@result;
+}
+
+
+sub time_stamp
+{
+    # Martin A. Hansen, February 2006.
+
+    # returns timestamp for use in log file.
+    # format: YYYY-MM-DD HH:MM:SS
+
+    # returns string
+
+    my ( $sec, $min, $hour, $day, $mon, $year );
+
+    # Date and time of day are both taken from one localtime call. The
+    # date used to come from gmtime and the time of day from localtime,
+    # which disagree whenever the local date differs from the UTC date.
+
+    ( $sec, $min, $hour, $day, $mon, $year ) = localtime( time );
+
+    $mon  += 1;      # first month is 0, so we correct accordingly
+    $year += 1900;
+
+    return sprintf "%04d-%02d-%02d %02d:%02d:%02d", $year, $mon, $day, $hour, $min, $sec;
+}
+
+
+sub wrap_line
+{
+    # Martin A. Hansen, May 2005
+
+    # Takes a given line and wraps it to a given width,
+    # without breaking any words.
+
+    my ( $line,    # line to wrap
+         $width,   # wrap width
+       ) = @_;
+
+    # Returns a list of lines.
+
+    my ( @lines, $substr, $wrap_pos, $pos, $new_line );
+
+    $pos = 0;
+
+    while ( $pos < length $line )
+    {
+        $substr = substr $line, $pos, $width;
+
+        if ( length $substr == $width )
+        {
+            # Searching the reversed chunk for a space gives the distance
+            # from the end of the chunk to the last space in it.
+
+            $substr   = reverse $substr;
+            $wrap_pos = index $substr, " ";
+
+            $new_line = substr $line, $pos, $width - $wrap_pos;
+            $new_line =~ s/ $//;
+
+            $pos += $width - $wrap_pos;
+        }
+        else
+        {
+            $new_line = $substr;
+
+            $pos += $width;
+        }
+
+        push @lines, $new_line;
+    }
+
+    return wantarray ? @lines : \@lines;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+1;
diff --git a/code_perl/Maasha/Config.pm b/code_perl/Maasha/Config.pm
new file mode 100644
index 0000000..e25f3f2
--- /dev/null
+++ b/code_perl/Maasha/Config.pm
@@ -0,0 +1,331 @@
+package Maasha::Config;
+
+# Copyright (C) 2006 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# This module contains configuration details for the usual system setup, etc.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Data::Dumper;
+use Exporter;
+use vars qw( @ISA @EXPORT );
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+my ( $HOME, $PATH, $DATA_DIR, $TMP_DIR, $INST_DIR );
+
+$HOME     = $ENV{ "HOME" };
+$PATH     = $ENV{ "PATH" };
+$DATA_DIR = $ENV{ "DATA_DIR" };
+$TMP_DIR  = $ENV{ "TMP_DIR" };
+$INST_DIR = $ENV{ "INST_DIR" };
+
+warn qq(WARNING: HOME not set in env\n)     if not defined $HOME;
+warn qq(WARNING: PATH not set in env\n)     if not defined $PATH;
+warn qq(WARNING: DATA_DIR not set in env\n) if not defined $DATA_DIR;
+warn qq(WARNING: TMP_DIR not set in env\n)  if not defined $TMP_DIR;
+warn qq(WARNING: INST_DIR not set in env\n) if not defined $INST_DIR;
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub get_exe
+{
+    # Martin A. Hansen, April 2007.
+
+    # finds a given exe in path and returns
+    # the full path to the exe.
+    # Directories in PATH are searched in order; the first entry that is
+    # executable and not a directory wins. It is an error if none is found.
+
+    my ( $exe,
+       ) = @_;
+
+    # returns string
+
+    my ( $dir );
+
+    # An unset PATH was already warned about at load time; treat it as empty.
+
+    foreach $dir ( split /:/, defined $PATH ? $PATH : "" ) {
+        return "$dir/$exe" if -x "$dir/$exe" and not -d "$dir/$exe";
+    }
+
+    &Maasha::Common::error( qq(Could not find executable \'$exe\') );
+}
+
+
+sub genome_fasta
+{
+    # Martin A. Hansen, November 2007.
+
+    # Returns the full path to the FASTA file for
+    # a given genome. It is an error if the file does not exist.
+
+    my ( $genome,   # requested genome
+       ) = @_;
+
+    # Returns string.
+
+    my $genome_file = "$DATA_DIR/genomes/$genome/$genome.fna";
+
+    if ( not -f $genome_file ) {
+        &Maasha::Common::error( qq(Genome file "$genome_file" for genome "$genome" not found) );
+    }
+
+    return $genome_file;
+}
+
+
+sub genome_fasta_index
+{
+    # Martin A. Hansen, December 2007.
+
+    # Returns the full path to the FASTA file index for
+    # a given genome. It is an error if the file does not exist.
+
+    my ( $genome,   # requested genome
+       ) = @_;
+
+    # Returns string.
+
+    my $index = "$DATA_DIR/genomes/$genome/$genome.fna.index";
+
+    if ( not -f $index ) {
+        &Maasha::Common::error( qq(Index file "$index" for genome -> $genome not found) );
+    }
+
+    return $index;
+}
+
+
+sub genome_blast
+{
+    # Martin A. Hansen, November 2007.
+
+    # Returns the BLAST database path for a given genome.
+    # The path is not checked for existence.
+
+    my ( $genome,   # requested genome
+       ) = @_;
+
+    # Returns string.
+
+    my $file = "$DATA_DIR/genomes/$genome/blast/$genome.fna";
+
+    return $file;
+}
+
+
+sub genome_blat_ooc
+{
+    # Martin A. Hansen, November 2007.
+
+    # Returns the ooc file of a given tile size
+    # for a given genome. It is an error if the file does not exist.
+
+    my ( $genome,      # requested genome
+         $tile_size,   # blat tile size
+       ) = @_;
+
+    # Returns string.
+
+    my $ooc_file = "$DATA_DIR/genomes/$genome/blat/$tile_size.ooc";
+
+    &Maasha::Common::error( qq(ooc file "$ooc_file" not found for genome -> $genome) ) if not -f $ooc_file;
+
+    return $ooc_file;
+}
+
+
+sub genome_vmatch
+{
+    # Martin A. Hansen, November 2007.
+
+    # Returns a list of Vmatch index names for a given genome,
+    # one per chromosome entry returned by chromosomes().
+
+    my ( $genome,   # requested genome
+       ) = @_;
+
+    # Returns a list.
+
+    my ( @chrs );
+
+    @chrs = &chromosomes( $genome );
+
+    @chrs = map { "$DATA_DIR/genomes/$genome/vmatch/$_" } @chrs;
+
+    # needs robustness check
+
+    return wantarray ? @chrs : \@chrs;
+}
+
+
+sub genome_phastcons
+{
+    # Martin A. Hansen, January 2008.
+
+    # Returns the full path to the location of a concatenated
+    # PhastCons file for a given genome.
+    # The path is not checked for existence.
+
+    my ( $genome,   # requested genome
+       ) = @_;
+
+    # Returns a string.
+
+    my $file = "$DATA_DIR/genomes/$genome/phastcons/$genome.pp";
+
+    return $file;
+}
+
+
+sub genome_phastcons_index
+{
+    # Martin A. Hansen, January 2008.
+
+    # Returns the full path to the location of a PhastCons index
+    # for a given genome.
+    # The path is not checked for existence.
+
+    my ( $genome,   # requested genome
+       ) = @_;
+
+    # Returns a string.
+
+    my $file = "$DATA_DIR/genomes/$genome/phastcons/$genome.pp.index";
+
+    return $file;
+}
+
+
+sub genomes
+{
+    # Martin A. Hansen, February 2008.
+
+    # Returns a sorted list of available genomes in the
+    # genomes.conf file. The genome name is the first tab separated
+    # field of each line; lines consisting of "//" are skipped.
+
+    # Returns a list.
+
+    my ( %genome_hash, $fh, $line, @genomes, $org );
+
+    $fh = &Maasha::Common::read_open( "$INST_DIR/conf/genomes.conf" );
+
+    while ( $line = <$fh> )
+    {
+        chomp $line;
+
+        next if $line eq "//";
+
+        ( $org, undef ) = split "\t", $line;
+
+        $genome_hash{ $org } = 1;
+    }
+
+    close $fh;
+
+    @genomes = sort keys %genome_hash;
+
+    return wantarray ? @genomes : \@genomes;
+}
+
+
+sub chromosomes
+{
+    # Martin A. Hansen, November 2007.
+
+    # Returns a list of chromosome files for a given genome
+    # read from the genomes.conf file (second tab separated field).
+    # It is an error if the genome is not listed in the file.
+
+    my ( $genome,   # requested genome
+       ) = @_;
+
+    # Returns a list.
+
+    my ( $fh_in, $line, $org, $chr, %genome_hash, @chrs );
+
+    $fh_in = &Maasha::Common::read_open( "$INST_DIR/conf/genomes.conf" );
+
+    while ( $line = <$fh_in> )
+    {
+        chomp $line;
+
+        next if $line eq "//";
+
+        ( $org, $chr ) = split "\t", $line;
+
+        push @{ $genome_hash{ $org } }, $chr;
+    }
+
+    close $fh_in;
+
+    if ( exists $genome_hash{ $genome } ) {
+        @chrs = @{ $genome_hash{ $genome } };
+    } else {
+        &Maasha::Common::error( qq(Genome -> $genome not found in genome hash) );
+    }
+
+    return wantarray ? @chrs : \@chrs;
+}
+
+
+sub maf_track
+{
+    # Martin A. Hansen, April 2008.
+
+    # Given a genome returns the corresponding mafTrack database table name.
+    # It is an error if the genome is not in the table below.
+
+    my ( $genome,   # genome to lookup.
+       ) = @_;
+
+    # Returns a string.
+
+    my ( %hash );
+
+    # The below hash should be in a config file - fix later.
+
+    %hash = (
+        danRer4 => 'multiz7way',
+        dm2     => 'multiz15way',
+        dm3     => 'multiz15way',
+        fr2     => 'multiz7way',
+        galGal3 => 'multiz7way',
+        gasAcu1 => 'multiz7way',
+        hg18    => 'multiz17way',
+        mm8     => 'multiz17way',
+        mm9     => 'multiz17way',
+        oryLat1 => 'multiz7way',
+        panTro2 => 'multiz17way',
+        tetNig1 => 'multiz7way',
+    );
+
+    # qq, not qw: qw() split the message into four words, so error() saw
+    # "multiz" as the message and "track" as a true no_stack flag.
+
+    &Maasha::Common::error( qq(multiz track not found) ) if not exists $hash{ $genome };
+
+    return $hash{ $genome };
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+1;
diff --git a/code_perl/Maasha/DumpFunc.pm b/code_perl/Maasha/DumpFunc.pm
new file mode 100644
index 0000000..799d97b
--- /dev/null
+++ b/code_perl/Maasha/DumpFunc.pm
@@ -0,0 +1,93 @@
+package Maasha::DumpFunc;
+
+# Copyright (C) 2003 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines to inspect objects and their inheritance.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Class::Inspector;
+use vars qw ( @ISA @EXPORT );
+use Exporter;
+
+use Data::Dumper;
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub dump_func
+{
+    # Martin A. Hansen, August 2003.
+
+    # given an object, the cognate functions are returned as a list
+    # of fully qualified public method names. An empty list is returned
+    # when Class::Inspector reports no methods for the object's class.
+
+    my ( $obj,   # incoming object
+       ) = @_;
+
+    # returns a list (array reference in scalar context)
+
+    my ( $ref, $methods );
+
+    $ref     = ref $obj;
+
+    # Fall back to an empty list when nothing is reported, so the
+    # dereference below cannot fail on an undefined value.
+
+    $methods = Class::Inspector->methods( $ref, 'full', 'public' ) || [];
+
+#    @{ $methods } = grep /$ref/, @{ $methods };
+
+    return wantarray ? @{ $methods } : $methods;
+}
+
+
+sub dump_test
+{
+    # Martin A. Hansen, August 2003.
+
+    # given an object, runs each cognate function without arguments and
+    # prints a line for every function that returns a true value.
+
+    my ( $obj ,   # incoming object
+       ) = @_;
+
+    # returns a list of test lines - currently always empty, since the
+    # lines are printed to STDOUT instead of being collected.
+
+    my ( $methods, $method, $function, $result, @lines );
+
+    $methods = dump_func( $obj );
+
+    foreach $method ( @{ $methods } )
+    {
+        # Skip names without a package-qualified tail rather than
+        # reusing a stale $1 from an earlier match.
+
+        next if not $method =~ /::(\w+)$/;
+
+        $function = $1;
+
+        # Call the method once, inside eval, and reuse the result. It used
+        # to be called a second time outside eval for the print below.
+
+        $result = eval { $obj->$function };
+
+        next if not $result;
+
+        # push @lines, "Testing $function from $method --- Returns -> " . $obj->$function;
+        print "TESTING $function FROM $method: RETURNS->" . $result . "\n";
+    }
+
+    return wantarray ? @lines : \@lines;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
diff --git a/code_perl/Maasha/EMBL.pm b/code_perl/Maasha/EMBL.pm
new file mode 100644
index 0000000..2e1cc53
--- /dev/null
+++ b/code_perl/Maasha/EMBL.pm
@@ -0,0 +1,432 @@
+package Maasha::EMBL;
+
+# Copyright (C) 2007 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines to parse EMBL records.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Data::Dumper;
+use Storable qw( dclone );
+use Maasha::Common;
+use Maasha::Fasta;
+use Maasha::Calc;
+use Maasha::Seq;
+use vars qw ( @ISA @EXPORT );
+
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub get_embl_entry
+{
+    # Martin A. Hansen, June 2006.
+
+    # Given a filehandle to an embl file,
+    # fetches the next embl entry, and returns
+    # this as a string with multiple lines.
+
+    my ( $fh,   # filehandle to embl file
+       ) = @_;
+
+    # returns string (undef at end of file)
+
+    my ( $entry );
+
+    # entries are terminated by a line holding "//"; the record separator is
+    # localized so that the caller's line-based reads are left untouched.
+
+    local $/ = "//\n";
+
+    $entry = <$fh>;
+
+    return $entry;
+}
+
+
+sub parse_embl_entry
+{
+    # Martin A. Hansen, June 2006.
+
+    # given an embl entry extracts the keys
+    # given in an argument hash. Special care
+    # is taken to parse the feature table if
+    # requested.
+
+    my ( $entry,   # embl entry
+         $args,    # argument hash
+       ) = @_;
+
+    # returns data structure
+
+    my ( @lines, $line, %hash, $ft, $seq, $key, $filter );
+
+    @lines = split "\n", $entry;
+
+    # when a "keys" hash is given, only the line codes (and "SEQ") named in
+    # it are extracted - otherwise everything is extracted.
+
+    $filter = exists $args->{ "keys" };
+
+    foreach $line ( @lines )
+    {
+        if ( $line =~ /^(\w{2})\s+(.*)/ and ( not $filter or exists $args->{ "keys" }->{ $1 } ) )
+        {
+            # feature table lines are kept one per line, other lines are joined with a space
+
+            if ( exists $hash{ $1 } and $1 eq "FT" ) {
+                $hash{ $1 } .= "\n" . $2;
+            } elsif ( exists $hash{ $1 } ) {
+                $hash{ $1 } .= " " . $2;
+            } else {
+                $hash{ $1 } = $2;
+            }
+        }
+        elsif ( $line =~ /^\s+(.*)\s+\d+$/ and ( not $filter or exists $args->{ "keys" }->{ "SEQ" } ) )
+        {
+            # sequence line: indented residues followed by a running position count
+
+            $seq .= $1;
+        }
+    }
+
+    if ( $seq )
+    {
+        $seq =~ tr/ //d;
+        $hash{ "SEQ" } = $seq;
+    }
+
+#    foreach $key ( keys %hash )
+#    {
+#        next if $key =~ /^(SEQ|SEQ_FT|FT)/;
+#
+#        if ( not $hash{ $key } =~ /$args->{ $key }/i ) {
+#            return wantarray ? () : {} ;
+#        }
+#    }
+
+    if ( exists $hash{ "FT" } )
+    {
+        # the sequence is undefined when it was absent or not requested;
+        # hand over an empty string instead of undef in that case.
+
+        $seq = "" if not defined $seq;
+
+        $ft = &parse_feature_table( $hash{ "FT" }, $seq, $args );
+        $hash{ "FT" } = $ft;
+    }
+
+    return wantarray ? %hash : \%hash;
+}
+
+
+sub parse_feature_table
+{
+    # Martin A. Hansen, June 2006.
+
+    # parses the feature table of a EMBL/GenBank/DDBJ entry.
+    # parsing takes place in 4 steps. 1) the feature key is
+    # located. 2) the locator is located taking into consideration
+    # that it may be split over multiple lines, which is dealt with
+    # by counting the parentheses that always are present in multiline
+    # locators. 3) the locator is used to fetch the corresponding
+    # sequence. 4) qualifier key/value pairs are located again taking
+    # into consideration multiline values, which are dealt with by
+    # keeping track of the "-count (value-less qualifiers are also
+    # included). only feature keys and qualifiers defined in the
+    # argument hash are returned.
+
+    my ( $ft,     # feature table
+         $seq,    # entry sequence
+         $args,   # argument hash
+       ) = @_;
+
+    # returns data structure
+
+    my ( @lines, $key_regex, $i, $p, $q, %key_hash, $key, $locator, %qual_hash, $qual_name, $qual_val, $subseq );
+
+    @lines = split "\n", $ft;
+
+    $key_regex = "[A-Za-z0-9_']+";   # this regex should match every possible feature key (gene, misc_feature, 5'UTR ...)
+
+    $i = 0;
+
+    while ( $i < @lines )
+    {
+        # $p counts the lines consumed by the current feature. It is reset for
+        # every line, so a line that is not a feature key advances by exactly
+        # one line instead of by an undefined or stale amount.
+
+        $p = 1;
+
+        if ( $lines[ $i ] =~ /^($key_regex)\s+(.+)/ )
+        {
+            $key     = $1;
+            $locator = $2;
+
+            undef %qual_hash;
+
+            # ---- getting locator
+
+            # a multiline locator is appended to until its parentheses balance;
+            # the loop is bounded so a malformed locator cannot run off the table.
+
+            while ( not &balance_params( $locator ) and $i + $p < @lines )
+            {
+                $locator .= $lines[ $i + $p ];
+                $p++;
+            }
+
+            push @{ $qual_hash{ "_locator" } }, $locator;
+
+            # ---- getting subsequence
+
+            $subseq = &parse_locator( $locator, $seq );
+
+            push @{ $qual_hash{ "_seq" } }, $subseq;
+
+            # ----- getting qualifiers
+
+            while ( $i + $p < @lines and not $lines[ $i + $p ] =~ /^$key_regex/ )
+            {
+                # $q counts the lines consumed by the current qualifier; reset
+                # here so that a non-qualifier line is skipped one line at a time.
+
+                $q = 1;
+
+                if ( $lines[ $i + $p ] =~ /^\// )
+                {
+                    if ( $lines[ $i + $p ] =~ /^\/([^=]+)=(.*)$/ )
+                    {
+                        $qual_name = $1;
+                        $qual_val  = $2;
+                    }
+                    elsif ( $lines[ $i + $p ] =~ /^\/(.*)$/ )
+                    {
+                        $qual_name = $1;
+                        $qual_val  = "";
+                    }
+
+                    # ----- getting qualifier value
+
+                    # a multiline value is appended to until its double quotes
+                    # balance, or until the table ends.
+
+                    while ( not &balance_quotes( $qual_val ) and $i + $p + $q < @lines )
+                    {
+                        $qual_val .= " " . $lines[ $i + $p + $q ];
+                        $q++;
+                    }
+
+                    $qual_val =~ s/^"(.*)"$/$1/;
+                    $qual_val =~ tr/ //d if $qual_name =~ /translation/i;
+
+                    if ( exists $args->{ "quals" } ) {
+                        push @{ $qual_hash{ $qual_name } }, $qual_val if exists $args->{ "quals" }->{ $qual_name };
+                    } else {
+                        push @{ $qual_hash{ $qual_name } }, $qual_val;
+                    }
+                }
+
+                $p += $q;
+            }
+
+            if ( scalar keys %qual_hash > 0 )
+            {
+                if ( exists $args->{ "feats" } ) {
+                    push @{ $key_hash{ $key } }, dclone \%qual_hash if exists $args->{ "feats" }->{ $key };
+                } else {
+                    push @{ $key_hash{ $key } }, dclone \%qual_hash;
+                }
+            }
+        }
+
+        $i += $p;
+    }
+
+    return wantarray ? %key_hash : \%key_hash;
+}
+
+
+sub parse_locator
+{
+    # Martin A. Hansen, June 2006.
+
+    # uses recursion to parse a locator string from a feature
+    # table and fetches the appropriate subsequence. the operators
+    # join(), complement(), and order() are handled.
+    # the locator string is broken into a comma separated list, and
+    # unwrapped if the parentheses do not balance. otherwise the comma
+    # separated list of ranges is stripped of operators, and the
+    # subsequences are fetched and handled according to the operators.
+    # SNP locators are also dealt with (single positions).
+
+    # NOTE(review): for complement(join(a,b)) each range is reverse
+    # complemented and appended in the listed order; confirm whether the
+    # ranges should also be emitted in reverse order.
+
+    my ( $locator,   # locator string
+         $seq,       # nucleotide sequence
+         $subseq,    # subsequence being joined
+         $join,      # join sequences
+         $comp,      # complement sequence or not
+         $order,     # order sequences
+       ) = @_;
+
+    # returns string
+
+    my ( @intervals, $interval, $beg, $end, $newseq );
+
+    @intervals = split ",", $locator;
+
+    # An operator flag is handed to the recursive call only and never stored
+    # in this call's own variables, so that e.g. the complement() around one
+    # interval does not leak onto the intervals that follow it.
+
+    if ( not &balance_params( $intervals[ 0 ] ) )   # locator includes a join/comp/order of several ranges
+    {
+        if ( $locator =~ /^join\((.*)\)$/ )
+        {
+            $subseq = &parse_locator( $1, $seq, $subseq, 1, $comp, $order );
+        }
+        elsif ( $locator =~ /^complement\((.*)\)$/ )
+        {
+            $subseq = &parse_locator( $1, $seq, $subseq, $join, 1, $order );
+        }
+        elsif ( $locator =~ /^order\((.*)\)$/ )
+        {
+            $subseq = &parse_locator( $1, $seq, $subseq, $join, $comp, 1 );
+        }
+        else
+        {
+            # e.g. an operator wrapping only the first part of a comma separated
+            # list - not supported, so report it rather than silently returning.
+
+            warn qq(WARNING: Could not match locator -> $locator\n);
+        }
+    }
+    else
+    {
+        foreach $interval ( @intervals )
+        {
+            if ( $interval =~ /^join\((.*)\)$/ )
+            {
+                $subseq = &parse_locator( $1, $seq, $subseq, 1, $comp, $order );
+            }
+            elsif ( $interval =~ /^complement\((.*)\)$/ )
+            {
+                $subseq = &parse_locator( $1, $seq, $subseq, $join, 1, $order );
+            }
+            elsif ( $interval =~ /^order\((.*)\)$/ )
+            {
+                $subseq = &parse_locator( $1, $seq, $subseq, $join, $comp, 1 );
+            }
+            elsif ( $interval =~ /^[<>]?(\d+)[^\d]+(\d+)$/ )
+            {
+                # a range - positions in the locator are 1-based
+
+                $beg = $1;
+                $end = $2;
+
+                $newseq = substr $seq, $beg - 1, $end - $beg + 1;
+
+                $newseq = &Maasha::Seq::dna_revcomp( $newseq ) if $comp;
+
+                if ( $order ) {
+                    $subseq .= " " . $newseq;
+                } else {
+                    $subseq .= $newseq;
+                }
+            }
+            elsif ( $interval =~ /^(\d+)$/ )
+            {
+                # a single position
+
+                $beg = $1;
+
+                $newseq = substr $seq, $beg - 1, 1 ;
+
+                $newseq = &Maasha::Seq::dna_revcomp( $newseq ) if $comp;
+
+                if ( $order ) {
+                    $subseq .= " " . $newseq;
+                } else {
+                    $subseq .= $newseq;
+                }
+            }
+            else
+            {
+                warn qq(WARNING: Could not match locator -> $locator\n);
+                # die qq(ERROR: Could not match locator -> $locator\n);
+                $subseq .= "";
+            }
+        }
+    }
+
+    return $subseq;
+}
+
+
+sub balance_params
+{
+    # Martin A. Hansen, June 2006.
+
+    # given a string checks if the number of left and right
+    # parentheses balances. returns 1 if balanced, else 0.
+
+    my ( $string,   # string to check
+       ) = @_;
+
+    # returns boolean
+
+    my ( $param_count );
+
+    # tr/// without a replacement list only counts the characters
+
+    $param_count  = 0;
+    $param_count += $string =~ tr/(//;
+    $param_count -= $string =~ tr/)//;
+
+    return $param_count == 0 ? 1 : 0;
+}
+
+
+sub balance_quotes
+{
+    # Martin A. Hansen, June 2006.
+
+    # given a string checks if the number of double quotes
+    # balances. returns 1 if balanced, else 0.
+
+    my ( $string,   # string to check
+       ) = @_;
+
+    # returns boolean
+
+    my ( $quote_count );
+
+    $quote_count = $string =~ tr/"//;
+
+    return $quote_count % 2 == 0 ? 1 : 0;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+1;
diff --git a/code_perl/Maasha/Fasta.pm b/code_perl/Maasha/Fasta.pm
new file mode 100644
index 0000000..95a2fa8
--- /dev/null
+++ b/code_perl/Maasha/Fasta.pm
@@ -0,0 +1,469 @@
+package Maasha::Fasta;
+
+# Copyright (C) 2006 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines for manipulation of FASTA files and FASTA entries.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Data::Dumper;
+use Maasha::Common;
+use Maasha::Seq;
+use vars qw ( @ISA @EXPORT );
+
+@ISA = qw( Exporter );
+
+use constant {
+    HEAD => 0,
+    SEQ  => 1,
+};
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub fasta_format_ok
+{
+    # Martin A. Hansen, March 2007.
+
+    # Checks if a given FASTA file is formatted with
+    # one header per line and one sequence per line.
+    # returns 1 if so, else 0.
+
+    my ( $path,   # full path to FASTA file
+       ) = @_;
+
+    # Returns boolean
+
+    my ( $fh, $line, $count, $ok );
+
+    $fh = &Maasha::Common::read_open( $path );
+
+    $count = 0;
+    $ok    = 1;
+
+    while ( $line = <$fh> )
+    {
+        # every even numbered line (counting from 0) must be a header line
+
+        if ( not $count % 2 and substr( $line, 0, 1 ) ne ">" )
+        {
+            $ok = 0;
+
+            last;
+        }
+
+        $count++;
+    }
+
+    # the filehandle is closed on the failure path as well
+
+    close $fh;
+
+    return $ok;
+}
+
+
+sub get_entries
+{
+    # Martin A. Hansen, December 2006.
+
+    # Parses a fasta file and returns a list of headers and sequence tuples.
+
+    my ( $path,    # full path to FASTA file
+         $count,   # number of sequences to read - OPTIONAL
+       ) = @_;
+
+    # returns list of tuples
+
+    my ( $fh, $entry, @entries );
+
+    $fh = &Maasha::Common::read_open( $path );
+
+    while ( $entry = &get_entry( $fh ) )
+    {
+        push @entries, $entry;
+
+        if ( $count and $count == @entries ) {
+            last;
+        }
+    }
+
+    close $fh;
+
+    return wantarray ? @entries : \@entries;
+}
+
+
+sub put_entries
+{
+    # Martin A. Hansen, March 2004.
+
+    # writes fasta sequences to STDOUT or file
+
+    my ( $entries,   # list of fasta entries
+         $path,      # full path to file - OPTIONAL
+         $wrap,      # line width - OPTIONAL
+       ) = @_;
+
+    # returns nothing
+
+    my ( $fh, $entry );
+
+    $fh = &Maasha::Common::write_open( $path ) if $path;
+
+    foreach $entry ( @{ $entries } ) {
+        &put_entry( $entry, $fh, $wrap );
+    }
+
+    # a bare "defined" tests $_ and not the filehandle, so name it explicitly
+
+    close $fh if defined $fh;
+}
+
+
+sub wrap
+{
+    # Martin A. Hansen, June 2007
+
+    # Wraps the sequence of a given FASTA entry
+    # to a given length.
+
+    my ( $entry,   # FASTA entry
+         $wrap,    # wrap length
+       ) = @_;
+
+    # Returns nothing.
+
+    # the sequence is passed by reference to Maasha::Seq::wrap
+
+    &Maasha::Seq::wrap( \$entry->[ SEQ ], $wrap );
+}
+
+
+sub get_entry
+{
+    # Martin A. Hansen, January 2007.
+
+    # Given a filehandle to an FASTA file,
+    # fetches the next FASTA entry, and returns
+    # this as a tuple of [ header, sequence ].
+
+    my ( $fh,   # filehandle to FASTA file
+       ) = @_;
+
+    # Returns a tuple - or nothing at end of file.
+
+    my ( $block, $seq_name, $seq, $entry );
+
+    # one block is read per entry: everything up to the next header line
+
+    local $/ = "\n>";
+
+    while ( $block = <$fh> )
+    {
+        chomp $block;
+
+        last if $block !~ /^\s*$/;   # skip empty and whitespace only blocks
+    }
+
+    return if not defined $block;
+
+    if ( $block =~ />?([^\n]+)\n(.*)/s )
+    {
+        $seq_name = $1;
+        $seq      = $2;
+    }
+    else
+    {
+        # a header without any sequence lines - the name must not be taken
+        # from the capture variables of an earlier match.
+
+        ( $seq_name = $block ) =~ s/^>//;
+        $seq = "";
+    }
+
+    $seq =~ tr/ \t\n//d;
+
+    $entry = [ $seq_name, $seq ];
+
+    return wantarray ? @{ $entry } : $entry;
+}
+
+
+sub put_entry
+{
+    # Martin A. Hansen, January 2007.
+
+    # Writes a FASTA entry to STDOUT or file.
+
+    my ( $entry,   # a FASTA entry
+         $fh,      # file handle to output file - OPTIONAL
+         $wrap,    # line width - OPTIONAL
+       ) = @_;
+
+    # Returns nothing.
+
+    &Maasha::Common::error( qq(FASTA entry has no header) )   if not defined $entry->[ HEAD ];
+    &Maasha::Common::error( qq(FASTA entry has no sequence) ) if not defined $entry->[ SEQ ];
+
+    if ( $wrap ) {
+        &Maasha::Fasta::wrap( $entry, $wrap );
+    }
+
+    if ( defined $fh ) {
+        print $fh ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n";
+    } else {
+        print ">$entry->[ HEAD ]\n$entry->[ SEQ ]\n";
+    }
+}
+
+
+sub find_shortest
+{
+    # Martin A. Hansen, June 2007.
+
+    # Given a stack of FASTA entries, find and return
+    # the shortest entry (the first one in case of ties).
+
+    my ( $entries,   # list of FASTA entries
+       ) = @_;
+
+    # returns tuple - or nothing if the list is empty
+
+    my ( $min, $entry, $min_entry );
+
+    foreach $entry ( @{ $entries } )
+    {
+        # the first entry seeds the minimum, so no magic upper limit is needed
+
+        if ( not defined $min or length( $entry->[ SEQ ] ) < $min )
+        {
+            $min_entry = $entry;
+            $min       = length $entry->[ SEQ ];
+        }
+    }
+
+    return if not defined $min_entry;
+
+    return wantarray ? @{ $min_entry } : $min_entry;
+}
+
+
+sub find_longest
+{
+    # Martin A. Hansen, June 2007.
+
+    # Given a stack of FASTA entries, find and return
+    # the longest entry (the first one in case of ties).
+
+    my ( $entries,   # list of FASTA entries
+       ) = @_;
+
+    # returns tuple - or nothing if the list is empty
+
+    my ( $max, $entry, $max_entry );
+
+    foreach $entry ( @{ $entries } )
+    {
+        # the first entry seeds the maximum, so a list holding only empty
+        # sequences still yields an entry.
+
+        if ( not defined $max or length( $entry->[ SEQ ] ) > $max )
+        {
+            $max_entry = $entry;
+            $max       = length $entry->[ SEQ ];
+        }
+    }
+
+    return if not defined $max_entry;
+
+    return wantarray ? @{ $max_entry } : $max_entry;
+}
+
+
+sub fasta_get_headers
+{
+    # Martin A. Hansen, May 2007.
+
+    # Gets the header names of a FASTA file,
+    # and returns these in a list.
+
+    my ( $path,   # full path to FASTA file
+       ) = @_;
+
+    # returns list
+
+    my ( $fh, $entry, @list );
+
+    $fh = &Maasha::Common::read_open( $path );
+
+    while ( $entry = &get_entry( $fh ) ) {
+        push @list, $entry->[ HEAD ];
+    }
+
+    close $fh;
+
+    return wantarray ? @list : \@list;
+}
+
+
+sub fasta_reformat
+{
+    # Martin A. Hansen, December 2004.
+
+    # Given a file of one or more FASTA entries, reformats these so
+    # each entry consists of one line with header and one line with sequence.
+    # The result is written to "<path>.temp", which then replaces the file.
+
+    my ( $path,   # full path to file with multiple FASTA entries
+       ) = @_;
+
+    # returns nothing
+
+    my ( $fh_in, $fh_out, $entry );
+
+    $fh_in  = &Maasha::Common::read_open( $path );
+    $fh_out = &Maasha::Common::write_open( "$path.temp" );
+
+    while ( $entry = &get_entry( $fh_in ) ) {
+        &put_entry( $entry, $fh_out );
+    }
+
+    close $fh_in;
+    close $fh_out;
+
+    rename( "$path.temp", $path ) or &Maasha::Common::error( qq(Could not rename "$path.temp" to "$path": $!) );
+}
+
+
+sub index_create
+{
+    # Martin A. Hansen, December 2004.
+
+    # Given a FASTA file formatted with one line of header and one line of sequence,
+    # returns a list of header, seq beg and seq length (first nucleotide is 0). Also,
+    # the file size of the indexed file is written to the index for checking purposes.
+
+    my ( $path,   # full path to file with multiple FASTA entries
+       ) = @_;
+
+    # returns a list: the string "FILE_SIZE=<size>" followed by one
+    # [ header, beg, len ] tuple per entry
+
+    my ( $file_size, $fh, $entry, $beg, $len, %hash, @index );
+
+    $file_size = &Maasha::Common::file_size( $path );
+
+    push @index, "FILE_SIZE=$file_size";
+
+    $fh = &Maasha::Common::read_open( $path );
+
+    $beg = 0;
+    $len = 0;
+
+    while ( $entry = &get_entry( $fh ) )
+    {
+        warn qq(WARNING: header->$entry->[ HEAD ] alread exists in index) if exists $hash{ $entry->[ HEAD ] };
+
+        # skip the previous sequence, the ">" and the header with its newline
+
+        $beg += $len + 2 + length $entry->[ HEAD ];
+        $len = length $entry->[ SEQ ];
+
+        push @index, [ $entry->[ HEAD ], $beg, $len ];
+
+        $hash{ $entry->[ HEAD ] } = 1;
+
+        $beg++;   # the newline terminating the sequence line
+    }
+
+    close $fh;
+
+    return wantarray ? @index : \@index;
+}
+
+
+sub index_search
+{
+    # Martin A. Hansen, December 2004.
+
+    # Searches the index for matching entries.
+
+    my ( $index,    # index list
+         $regex,    # regex to match FASTA headers [OPTIONAL]
+         $invert,   # invert matching
+       ) = @_;
+
+    # returns list
+
+    my ( @entries, @results );
+
+    if ( not $regex )
+    {
+        @results = @{ $index };
+    }
+    else
+    {
+        # only the tuples are matched; the "FILE_SIZE=..." string that
+        # index_create stores first cannot be dereferenced and is skipped.
+
+        @entries = grep { ref $_ eq "ARRAY" } @{ $index };
+
+        if ( $invert ) {
+            @results = grep { $_->[ 0 ] !~ /$regex/ } @entries;
+        } else {
+            @results = grep { $_->[ 0 ] =~ /$regex/ } @entries;
+        }
+    }
+
+    return wantarray ? @results : \@results;
+}
+
+
+sub index_lookup
+{
+    # Martin A. Hansen, July 2007.
+
+    # Looks up a list of exact matches in the index and returns these
+
+    my ( $index,     # index list
+         $headers,   # headers to lookup
+       ) = @_;
+
+    # returns a list
+
+    my ( %hash, $elem, $head, @results );
+
+    foreach $elem ( @{ $index } )
+    {
+        next if ref $elem ne "ARRAY";   # skips the "FILE_SIZE=..." string stored by index_create
+
+        $hash{ $elem->[ 0 ] } = [ $elem->[ 1 ], $elem->[ 2 ] ];
+    }
+
+    foreach $head ( @{ $headers } )
+    {
+        if ( exists $hash{ $head } ) {
+            push @results, [ $head, $hash{ $head }->[ 0 ], $hash{ $head }->[ 1 ] ];
+        }
+    }
+
+    return wantarray ? @results : \@results;
+}
+
+
+sub index_store
+{
+    # Martin A. Hansen, May 2007.
+
+    # Stores a FASTA index to binary file.
+
+    my ( $path,    # full path to file
+         $index,   # list with index
+       ) = @_;
+
+    # returns nothing
+
+    &Maasha::Common::file_store( $path, $index );
+}
+
+
+sub index_retrieve
+{
+    # Martin A. Hansen, May 2007.
+
+    # Retrieves a FASTA index from binary file.
+
+    my ( $path,   # full path to file
+       ) = @_;
+
+    # returns list
+
+    my $index;
+
+    $index = &Maasha::Common::file_retrieve( $path );
+
+    return wantarray ? @{ $index } : $index;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+1;
diff --git a/code_perl/Maasha/GFF.pm b/code_perl/Maasha/GFF.pm
new file mode 100644
index 0000000..8d8a20b
--- /dev/null
+++ b/code_perl/Maasha/GFF.pm
@@ -0,0 +1,140 @@
+package Maasha::GFF;
+
+
+# Copyright (C) 2007-2008 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines for manipulation 'Generic Feature Format' - GFF.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Data::Dumper;
+use Maasha::Common;
+
+use vars qw( @ISA @EXPORT_OK );
+
+require Exporter;
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub get_entry
+{
+    # Martin A. Hansen, February 2008.
+
+    # Reads a single entry from a filehandle to a GFF file.
+    # Lines that do not hold exactly 9 tab separated fields are skipped.
+
+    my ( $fh,   # file handle
+       ) = @_;
+
+    # Returns hashref - or nothing at end of file.
+
+    my ( $line, @fields, %entry, $q_beg, $q_end, @atts, $att, $key, $val );
+
+    while ( $line = <$fh> )
+    {
+        chomp $line;
+
+        @fields = split "\t", $line;
+
+        if ( @fields == 9 )
+        {
+            # positions are stored one less than given in the file
+
+            $q_beg = $fields[ 3 ] - 1;
+            $q_end = $fields[ 4 ] - 1;
+
+            ( $q_beg, $q_end ) = ( $q_end, $q_beg ) if $q_beg > $q_end;
+
+            %entry = (
+                Q_ID   => $fields[ 0 ],
+                SOURCE => $fields[ 1 ],
+                TYPE   => $fields[ 2 ],
+                Q_BEG  => $q_beg,
+                Q_END  => $q_end,
+                SCORE  => $fields[ 5 ],
+                STRAND => $fields[ 6 ],
+                PHASE  => $fields[ 7 ],
+                ATT    => $fields[ 8 ],
+            );
+
+            @atts = split ";", $fields[ 8 ];
+
+            foreach $att ( @atts )
+            {
+                # split on the first "=" only, so that a value holding "=" is kept whole
+
+                ( $key, $val ) = split "=", $att, 2;
+
+                next if not defined $key or $key eq "";   # empty attribute
+
+                $entry{ "ATT_" . uc $key } = $val;
+            }
+
+            return wantarray ? %entry : \%entry;
+        }
+    }
+
+    return;
+}
+
+
+sub get_entries
+{
+    # Martin A. Hansen, February 2008.
+
+    # Reads GFF file and returns a list of entries.
+
+    my ( $path,   # full path to GFF file.
+       ) = @_;
+
+    # Returns a list.
+
+    my ( $fh, $entry, @entries );
+
+    $fh = &Maasha::Common::read_open( $path );
+
+    while ( $entry = &get_entry( $fh ) ) {
+        push @entries, $entry;
+    }
+
+    close $fh;
+
+    return wantarray ? @entries : \@entries;
+}
+
+
+sub put_entry
+{
+    # TODO: not implemented yet.
+
+}
+
+
+sub put_entries
+{
+    # TODO: not implemented yet.
+
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+1;
diff --git a/code_perl/Maasha/Match.pm b/code_perl/Maasha/Match.pm
new file mode 100644
index 0000000..5f97856
--- /dev/null
+++ b/code_perl/Maasha/Match.pm
@@ -0,0 +1,441 @@
+package Maasha::Match;
+
+# Copyright (C) 2007 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines to match sequences
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Data::Dumper;
+use Storable qw( dclone );
+use Maasha::Common;
+use Maasha::Fasta;
+use Maasha::Seq;
+use Maasha::Berkeley_DB;
+use vars qw ( @ISA @EXPORT );
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub match_mummer
+{
+    # Martin A. Hansen, June 2007.
+
+    # Match sequences using MUMmer.
+    # Note that spaces in the headers of the given entries are
+    # replaced with underscores in place.
+
+    my ( $entries1,   # FASTA entries
+         $entries2,   # FASTA entries
+         $options,    # additional MUMmer options - OPTIONAL
+         $tmp_dir,    # temporary directory
+       ) = @_;
+
+    # Returns a list.
+
+    my ( @args, $arg, $entry, $file_in1, $file_in2, $cmd, $file_out, $fh, $line, $result, @results );
+
+    $tmp_dir ||= $ENV{ "TMP_DIR" };
+
+    $options->{ "word_size" } ||= 20;
+    $options->{ "direction" } ||= "both";
+
+    push @args, "-c";
+    push @args, "-L";
+    push @args, "-F";
+    push @args, "-l $options->{ 'word_size' }";
+    push @args, "-maxmatch";
+    push @args, "-n" if not &Maasha::Seq::seq_guess_type( $entries1->[ 0 ]->[ 1 ] ) eq "protein";
+    push @args, "-b" if $options->{ "direction" } =~ /^b/;
+    push @args, "-r" if $options->{ "direction" } =~ /^r/;
+
+    $arg = join " ", @args;
+
+    # the process id is part of the file names, so that concurrent runs
+    # sharing the same temporary directory do not overwrite each other.
+
+    $file_in1 = "$tmp_dir/mummer1.$$.tmp";
+    $file_in2 = "$tmp_dir/mummer2.$$.tmp";
+    $file_out = "$tmp_dir/mummer3.$$.tmp";
+
+    foreach $entry ( @{ $entries1 }, @{ $entries2 } ) {
+        $entry->[ 0 ] =~ tr/ /_/;
+    }
+
+    &Maasha::Fasta::put_entries( $entries1, $file_in1 );
+    &Maasha::Fasta::put_entries( $entries2, $file_in2 );
+
+    &Maasha::Common::run( "mummer", "$arg $file_in1 $file_in2 > $file_out 2>/dev/null" );
+
+    $fh = &Maasha::Common::read_open( $file_out );
+
+    while ( $line = <$fh> )
+    {
+        chomp $line;
+
+        if ( $line =~ /^> (.+)Reverse\s+Len = (\d+)$/ )
+        {
+            $result->{ "Q_ID" }  = $1;
+            $result->{ "Q_LEN" } = $2;
+            $result->{ "DIR" }   = "reverse";
+
+            $result->{ "Q_ID" } =~ s/\s+$//;   # the greedy capture includes the whitespace separating the fields
+        }
+        elsif ( $line =~ /^> (.+)Len = (\d+)$/ )
+        {
+            $result->{ "Q_ID" }  = $1;
+            $result->{ "Q_LEN" } = $2;
+            $result->{ "DIR" }   = "forward";
+
+            $result->{ "Q_ID" } =~ s/\s+$//;   # the greedy capture includes the whitespace separating the fields
+        }
+        elsif ( $line =~ /^\s*(.\S+)\s+(\d+)\s+(\d+)\s+(\d+)$/ )
+        {
+            # positions are stored one less than reported
+
+            $result->{ "S_ID" }    = $1;
+            $result->{ "S_BEG" }   = $2 - 1;
+            $result->{ "Q_BEG" }   = $3 - 1;
+            $result->{ "HIT_LEN" } = $4;
+            $result->{ "S_END" }   = $result->{ "S_BEG" } + $result->{ "HIT_LEN" } - 1;
+            $result->{ "Q_END" }   = $result->{ "Q_BEG" } + $result->{ "HIT_LEN" } - 1;
+
+            push @results, dclone $result;
+        }
+
+    }
+
+    close $fh;
+
+    unlink $file_in1;
+    unlink $file_in2;
+    unlink $file_out;
+
+    return wantarray ? @results : \@results;
+}
+
+
+sub match_vmatch
+{
+    # Martin A. Hansen, April 2008.
+
+    # Vmatches a list of records against a list of index files and the full
+    # path to the result file is returned.
+
+    my ( $tmp_dir,       # directory in where to save temp files
+         $records,       # list of records
+         $index_files,   # list of index files
+         $options,       # argument hash
+       ) = @_;
+
+    # Returns a string.
+
+    my ( $query_file, $result_file, @result_files, $fh_in, $fh_out, $line, @fields, $i, $record, $vmatch_args, @seq_names, $count_list );
+
+    $query_file  = "$tmp_dir/query.seq";
+    $result_file = "$tmp_dir/vmatch.out";
+
+    $fh_out = &Maasha::Common::write_open( $query_file );
+
+    foreach $record ( @{ $records } )
+    {
+        if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
+        {
+            next if length $record->{ "SEQ" } < 12;   # assuming that the index is created for 12 as minimum length
+
+            push @seq_names, $record->{ "SEQ_NAME" };
+
+            &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out, 80 );
+        }
+    }
+
+    close $fh_out;
+
+    if ( $options->{ 'genome' } ) {
+        $vmatch_args = "-complete -d -p -q $query_file";
+    } else {
+        $vmatch_args = "-complete -d -p -showdesc 100 -q $query_file";
+    }
+
+    $vmatch_args .= " -h " . $options->{ "hamming_dist" } if $options->{ "hamming_dist" };
+    $vmatch_args .= " -e " . $options->{ "edit_dist" }    if $options->{ "edit_dist" };
+
+    for ( $i = 0; $i < @{ $index_files }; $i++ )
+    {
+        &Maasha::Common::run( "vmatch", "$vmatch_args $index_files->[ $i ] > $result_file.$i" );
+
+        push @result_files, "$result_file.$i";
+    }
+
+    unlink $query_file;
+
+    # the hit counts are needed both for reporting ("count") and for
+    # filtering ("max_hits"), so they are collected if either is requested.
+
+    $count_list = &vmatch_count_hits( \@result_files ) if $options->{ "count" } or $options->{ "max_hits" };
+
+    $fh_out = &Maasha::Common::write_open( $result_file );
+
+    for ( $i = 0; $i < @{ $index_files }; $i++ )
+    {
+        $index_files->[ $i ] =~ s/.+\/(.+)\.fna$/$1/ if $options->{ 'genome' };
+
+        $fh_in = &Maasha::Common::read_open( "$result_file.$i" );
+
+        while ( $line = <$fh_in> )
+        {
+            chomp $line;
+
+            next if $line =~ /^#/;
+
+            @fields = split " ", $line;
+
+            next if $options->{ "max_hits" } and $count_list->[ $fields[ 5 ] ] > $options->{ 'max_hits' };
+
+            $fields[ 1 ] = $index_files->[ $i ];                                     # S_ID
+            $fields[ 9 ] = $count_list->[ $fields[ 5 ] ] if $options->{ "count" };   # SCORE
+            $fields[ 5 ] = $seq_names[ $fields[ 5 ] ];                               # Q_ID
+
+            print $fh_out join( "\t", @fields ), "\n";
+        }
+
+        close $fh_in;
+
+        unlink "$result_file.$i";
+    }
+
+    close $fh_out;
+
+    return $result_file;
+}
+
+
+sub vmatch_count_hits
+{
+    # Martin A. Hansen, April 2008.
+
+    # Given a list of Vmatch result file, count duplications based
+    # on q_id. The counts are returned in a list where the list index
+    # corresponds to the q_id index in the query file.
+
+    my ( $files,   # vmatch result files
+       ) = @_;
+
+    # Returns a list.
+
+    my ( $file, $fh_in, $line, @fields, @count_list );
+
+    foreach $file ( @{ $files } )
+    {
+        $fh_in = &Maasha::Common::read_open( $file );
+
+        while ( $line = <$fh_in> )
+        {
+            chomp $line;
+
+            next if $line =~ /^#/;
+
+            @fields = split " ", $line;
+
+            $count_list[ $fields[ 5 ] ]++;
+        }
+
+        close $fh_in;
+    }
+
+    return wantarray ? @count_list : \@count_list;
+}
+
+
+sub vmatch_count_hits_old_list
+{
+    # Martin A. Hansen, April 2008.
+
+    # Given a Vmatch result file, substitute the
+    # score field with the times the query sequence
+    # was found.
+
+    # This list based variant shared the name vmatch_count_hits_old with the
+    # hash based sub below, which was the definition in effect since it was
+    # compiled last. It is renamed so that the two no longer clash.
+
+    my ( $tmp_dir,     # directory in where to save temp files
+         $path,        # full path to vmatch file
+         $max_count,   # filter too abundant seqs - OPTIONAL
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $fh_in, $fh_out, $line, @fields, @count_list );
+
+    $fh_in = &Maasha::Common::read_open( $path );
+
+    while ( $line = <$fh_in> )
+    {
+        chomp $line;
+
+        next if $line =~ /^#/;
+
+        @fields = split " ", $line;
+
+        $count_list[ $fields[ 5 ] ]++;
+    }
+
+    close $fh_in;
+
+    $fh_in  = &Maasha::Common::read_open( $path );
+    $fh_out = &Maasha::Common::write_open( "$tmp_dir/vmatch.count" );
+
+    while ( $line = <$fh_in> )
+    {
+        chomp $line;
+
+        next if $line =~ /^#/;
+
+        @fields = split " ", $line;
+
+        $fields[ 9 ] = $count_list[ $fields[ 5 ] ];
+
+        if ( $max_count ) {
+            print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count;
+        } else {
+            print $fh_out join( "\t", @fields ), "\n";
+        }
+    }
+
+    close $fh_in;
+    close $fh_out;
+
+    rename "$tmp_dir/vmatch.count", $path;
+}
+
+
+sub vmatch_count_hits_old
+{
+    # Martin A. Hansen, April 2008.
+
+    # Given a Vmatch result file, substitute the
+    # score field with the times the query sequence
+    # was found.
+
+    my ( $tmp_dir,     # directory in where to save temp files
+         $path,        # full path to vmatch file
+         $max_count,   # filter too abundant seqs - OPTIONAL
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $fh_in, $fh_out, $line, @fields, %count_hash );
+
+    # the counts are kept in an in-memory hash when $max_count is given,
+    # else in a hash initialized through Maasha::Berkeley_DB
+
+    if ( $max_count ) {
+        %count_hash = ();
+    } else {
+        %count_hash = &Maasha::Berkeley_DB::db_init( "$tmp_dir/hash.bdb" );
+    }
+
+    $fh_in = &Maasha::Common::read_open( $path );
+
+    while ( $line = <$fh_in> )
+    {
+        chomp $line;
+
+        next if $line =~ /^#/;
+
+        @fields = split " ", $line;
+
+        $count_hash{ $fields[ 5 ] }++;
+    }
+
+    close $fh_in;
+
+    $fh_in  = &Maasha::Common::read_open( $path );
+    $fh_out = &Maasha::Common::write_open( "$tmp_dir/vmatch.count" );
+
+    while ( $line = <$fh_in> )
+    {
+        chomp $line;
+
+        next if $line =~ /^#/;
+
+        @fields = split " ", $line;
+
+        $fields[ 9 ] = $count_hash{ $fields[ 5 ] };
+
+        if ( $max_count ) {
+            print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count;
+        } else {
+            print $fh_out join( "\t", @fields ), "\n";
+        }
+    }
+
+    close $fh_in;
+    close $fh_out;
+
+    if ( not $max_count )
+    {
+        untie %count_hash;
+        unlink "$tmp_dir/hash.bdb";
+    }
+
+    rename "$tmp_dir/vmatch.count", $path;
+}
+
+
+sub vmatch_get_entry
+{
+    # Martin A. Hansen, January 2008.
+
+    # Parses vmatch output records.
+    # Lines starting with "#" are skipped.
+
+    my ( $fh,   # file handle to vmatch result file.
+       ) = @_;
+
+    # Returns a hash - or nothing at end of file.
+
+    my ( $line, @fields, %record );
+
+    while ( $line = <$fh> )
+    {
+        chomp $line;
+
+        next if $line =~ /^#/;
+
+        @fields = split "\t", $line;
+
+        $record{ "REC_TYPE" } = "VMATCH";
+
+        $record{ "S_LEN" } = $fields[ 0 ];
+        $record{ "S_ID" }  = $fields[ 1 ];
+        $record{ "S_BEG" } = $fields[ 2 ];
+
+        if ( $fields[ 3 ] eq "D" ) {
+            $record{ "STRAND" } = "+";
+        } else {
+            $record{ "STRAND" } = "-";
+        }
+
+        $record{ "Q_LEN" }      = $fields[ 4 ];
+        $record{ "Q_ID" }       = $fields[ 5 ];
+        $record{ "Q_BEG" }      = $fields[ 6 ];
+        $record{ "MATCH_DIST" } = $fields[ 7 ];
+        $record{ "E_VAL" }      = $fields[ 8 ];
+        $record{ "SCORE" }      = $fields[ 9 ];
+        $record{ "IDENT" }      = $fields[ 10 ];
+
+        $record{ "Q_END" } = $record{ "Q_BEG" } + $record{ "Q_LEN" } - 1;
+        $record{ "S_END" } = $record{ "S_BEG" } + $record{ "S_LEN" } - 1;
+
+        return wantarray ? %record : \%record;
+    }
+
+    return;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+1;
+
+__END__
diff --git a/code_perl/Maasha/Matrix.pm b/code_perl/Maasha/Matrix.pm
new file mode 100644
index 0000000..82a08f6
--- /dev/null
+++ b/code_perl/Maasha/Matrix.pm
@@ -0,0 +1,1170 @@
+package Maasha::Matrix;
+
+# Copyright (C) 2007 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


# This module contains subroutines for simple matrix manipulations.
# A matrix is an array of arrays (AoA): a reference to a list of row references.


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


use strict;
use Data::Dumper;
use Storable qw( dclone );
use Maasha::Common;
use Maasha::Calc;
use vars qw ( @ISA @EXPORT );
use Exporter;

@ISA = qw( Exporter );

# Indexes into the [ rows, cols ] tuple returned by matrix_dims.
use constant {
    ROWS => 0,
    COLS => 1,
};


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


sub matrix_dims
{
    # Martin A. Hansen, April 2007

    # Returns the dimensions of a matrix: rows x cols.
    # The column count is read from the first row only; use matrix_check
    # to verify that all rows are equally long. An empty matrix is 0 x 0.

    my ( $matrix,   # AoA data structure
       ) = @_;

    # returns a tuple (list in list context, arrayref in scalar context)

    my ( $rows, $cols );

    $rows = scalar @{ $matrix };

    # Dereferencing the (missing) first row of an empty matrix would die under strict.
    $cols = $rows > 0 ? scalar @{ $matrix->[ 0 ] } : 0;

    return wantarray ? ( $rows, $cols ) : [ $rows, $cols ];
}


sub matrix_check
{
    # Martin A. Hansen, April 2007.

    # Checks that all rows of the matrix have the same number of columns.
    # Returns 1 if ok else 0.

    my ( $matrix,   # AoA data structure
       ) = @_;

    # returns boolean

    my ( $dims, $row, $check );

    $dims = &matrix_dims( $matrix );

    $check = $dims->[ COLS ];

    foreach $row ( @{ $matrix } ) {
        return 0 if scalar @{ $row } != $check;
    }

    return 1;
}


sub matrix_summary
{
    # Martin A. Hansen, April 2007.

    # For each column in a given matrix print one tab separated line with:
    # type (num|alph), number of rows, number of unique values, sorted (yes|no),
    # and - for numerical columns - min, max and mean (otherwise N/A).
    # Dies if the rows of the matrix differ in length.

    my ( $matrix,   # AoA data structure
       ) = @_;

    my ( $dims, $i, $col, $list, $type, $sort, $uniq, $min, $max, $mean );

    die qq(ERROR: cannot summarize uneven matrix\n) if not &matrix_check( $matrix );

    $dims = &matrix_dims( $matrix );

    print join( "\t", "TYPE", "LEN", "UNIQ", "SORT", "MIN", "MAX", "MEAN" ), "\n";

    for ( $i = 0; $i < $dims->[ COLS ]; $i++ )
    {
        # Column $i as a one-column matrix, flipped into a plain list.
        $col  = &cols_get( $matrix, $i, $i );
        $list = &matrix_flip( $col )->[ 0 ];

        if ( &list_check_numeric( $list ) ) {
            $type = "num";
        } else {
            $type = "alph";
        }

        if ( &list_check_sort( $list, $type ) ) {
            $sort = "yes";
        } else {
            $sort = "no";
        }

        if ( $type eq "num" )
        {
            if ( $sort eq "yes" )
            {
                # Ascending order: the extremes are the end points.
                $min = $list->[ 0 ];
                $max = $list->[ -1 ];
            }
            else
            {
                ( $min, $max ) = &Maasha::Calc::minmax( $list );
            }

            $mean = sprintf( "%.2f", &Maasha::Calc::mean( $list ) );
        }
        else
        {
            $min  = "N/A";
            $max  = "N/A";
            $mean = "N/A";
        }

        $uniq = &list_uniq( $list );

        print join( "\t", $type, $dims->[ ROWS ], $uniq, $sort, $min, $max, $mean ), "\n";
    }
}


sub matrix_flip
{
    # Martin A. Hansen, April 2007

    # Flips a matrix making rows to columns and vice versa.
    # The given matrix is not modified. Dies if the matrix is uneven.

    my ( $matrix,   # AoA data structure
       ) = @_;

    # returns AoA

    my ( $i, $c, $dims, $AoA );

    die qq(ERROR: cannot flip uneven matrix\n) if not &matrix_check( $matrix );

    $dims = &matrix_dims( $matrix );

    $AoA = [];   # an empty matrix flips to an empty matrix

    for ( $i = 0; $i < $dims->[ ROWS ]; $i++ )
    {
        for ( $c = 0; $c < $dims->[ COLS ]; $c++ ) {
            $AoA->[ $c ]->[ $i ] = $matrix->[ $i ]->[ $c ];
        }
    }

    return wantarray ? @{ $AoA } : $AoA;
}


sub matrix_rotate_right
{
    # Martin A. Hansen, April 2007

    # Rotates elements in a given matrix a given
    # number of positions to the right by popping columns
    # from the right matrix edge and prefixing them to the left edge.
    # A matrix with fewer than two columns is returned unchanged.

    my ( $matrix,   # AoA data structure
         $shift,    # number of shifts - DEFAULT=1
       ) = @_;

    # returns AoA

    my ( $i, $dims, $col, $AoA );

    $shift ||= 1;

    die qq(ERROR: cannot right rotate uneven matrix\n) if not &matrix_check( $matrix );

    $dims = &matrix_dims( $matrix );

    # Nothing to rotate; the column slicing below needs at least two columns.
    return wantarray ? @{ $matrix } : $matrix if $dims->[ COLS ] < 2;

    for ( $i = 0; $i < $shift; $i++ )
    {
        $col = &cols_get( $matrix, $dims->[ COLS ] - 1, $dims->[ COLS ] - 1 );
        $AoA = &cols_get( $matrix, 0, $dims->[ COLS ] - 2 );

        &cols_unshift( $AoA, $col );

        $matrix = $AoA;
    }

    return wantarray ? @{ $matrix } : $matrix;
}


sub matrix_rotate_left
{
    # Martin A. Hansen, April 2007

    # Rotates elements in a given matrix a given
    # number of positions to the left: columns
    # are shifted from the left matrix edge and appended
    # to the right edge.
    # A matrix with fewer than two columns is returned unchanged.

    my ( $matrix,   # AoA data structure
         $shift,    # number of shifts - DEFAULT=1
       ) = @_;

    # returns AoA

    my ( $i, $dims, $col, $AoA );

    $shift ||= 1;

    die qq(ERROR: cannot left rotate uneven matrix\n) if not &matrix_check( $matrix );

    $dims = &matrix_dims( $matrix );

    # Nothing to rotate; the column slicing below needs at least two columns.
    return wantarray ? @{ $matrix } : $matrix if $dims->[ COLS ] < 2;

    for ( $i = 0; $i < $shift; $i++ )
    {
        $col = &cols_get( $matrix, 0, 0 );
        $AoA = &cols_get( $matrix, 1, $dims->[ COLS ] - 1 );

        &cols_push( $AoA, $col );

        $matrix = $AoA;
    }

    return wantarray ? @{ $matrix } : $matrix;
}


sub matrix_rotate_up
{
    # Martin A. Hansen, April 2007

    # Rotates elements in a given matrix a given
    # number of positions up: rows are shifted
    # from the top of the matrix to the bottom.
    # A matrix with fewer than two rows is returned unchanged.

    my ( $matrix,   # AoA data structure
         $shift,    # number of shifts - DEFAULT=1
       ) = @_;

    # returns AoA

    my ( $dims, $i, $row, $AoA );

    $shift ||= 1;

    $dims = &matrix_dims( $matrix );

    # Nothing to rotate; the row slicing below needs at least two rows.
    return wantarray ? @{ $matrix } : $matrix if $dims->[ ROWS ] < 2;

    for ( $i = 0; $i < $shift; $i++ )
    {
        $row = &rows_get( $matrix, 0, 0 );
        $AoA = &rows_get( $matrix, 1, $dims->[ ROWS ] - 1 );

        &rows_push( $AoA, dclone $row );

        $matrix = $AoA;
    }

    return wantarray ? @{ $matrix } : $matrix;
}


sub matrix_rotate_down
{
    # Martin A. Hansen, April 2007

    # Rotates elements in a given matrix a given
    # number of positions down: rows are shifted
    # from the bottom matrix edge to the top edge.
    # A matrix with fewer than two rows is returned unchanged.

    my ( $matrix,   # AoA data structure
         $shift,    # number of shifts - DEFAULT=1
       ) = @_;

    # returns AoA

    my ( $dims, $i, $row, $AoA );

    $shift ||= 1;

    $dims = &matrix_dims( $matrix );

    # Nothing to rotate; the row slicing below needs at least two rows.
    return wantarray ? @{ $matrix } : $matrix if $dims->[ ROWS ] < 2;

    for ( $i = 0; $i < $shift; $i++ )
    {
        $row = &rows_get( $matrix, $dims->[ ROWS ] - 1, $dims->[ ROWS ] - 1 );
        $AoA = &rows_get( $matrix, 0, $dims->[ ROWS ] - 2 );

        &rows_unshift( $AoA, $row );

        $matrix = $AoA;
    }

    return wantarray ? @{ $matrix } : $matrix;
}


sub submatrix
{
    # Martin A. Hansen, April 2007

    # Returns a submatrix sliced from a given matrix:
    # first the requested rows are selected, then the requested columns.

    my ( $matrix,    # AoA data structure
         $row_beg,   # first row - OPTIONAL (default 0)
         $row_end,   # last row  - OPTIONAL (default last row)
         $col_beg,   # first col - OPTIONAL (default 0)
         $col_end,   # last col  - OPTIONAL (default last col)
       ) = @_;

    # returns AoA

    my ( $submatrix, $subsubmatrix );

    $submatrix    = &rows_get( $matrix, $row_beg, $row_end );
    $subsubmatrix = &cols_get( $submatrix, $col_beg, $col_end );

    return wantarray ? @{ $subsubmatrix } : $subsubmatrix;
}


sub row_get
{
    # Martin A. Hansen, April 2008.

    # Returns a copy of a single row from a given matrix.

    my ( $matrix,   # AoA data structure
         $row,      # row to get (0-based)
       ) = @_;

    # Returns a list;

    my ( $dims, @list );

    $dims = &matrix_dims( $matrix );

    # Valid rows are 0 .. ROWS - 1, so ROWS itself is already outside.
    &Maasha::Common::error( qq(Row->$row outside of matrix->$dims->[ ROWS ]) ) if $row >= $dims->[ ROWS ];

    @list = @{ $matrix->[ $row ] };

    return wantarray ? @list : \@list;
}


sub rows_get
{
    # Martin A. Hansen, April 2007

    # Returns a range of requested rows from a given matrix.
    # The returned list is new, but the rows in it are the same
    # row references as in the given matrix (no deep copy).

    my ( $matrix,    # AoA data structure
         $row_beg,   # first row - OPTIONAL (default 0)
         $row_end,   # last row  - OPTIONAL (default last row)
       ) = @_;

    # returns AoA

    my ( @rows );

    $row_beg ||= 0;

    # Default to the index of the last row (not the row count), so that
    # omitting $row_end does not trigger the range warning below.
    if ( not defined $row_end ) {
        $row_end = scalar( @{ $matrix } ) - 1;
    }

    if ( $row_end >= scalar @{ $matrix } )
    {
        warn qq(WARNING: row end larger than matrix\n);
        $row_end = scalar( @{ $matrix } ) - 1;
    }

    die qq(ERROR: row begin "$row_beg" larger than row end "$row_end"\n) if $row_end < $row_beg;

    if ( $row_beg == 0 and $row_end == scalar( @{ $matrix } ) - 1 ) {
        @rows = @{ $matrix };
    } else {
        @rows = @{ $matrix }[ $row_beg .. $row_end ];
    }

    return wantarray ? @rows : \@rows;
}


sub col_get
{
    # Martin A. Hansen, April 2008.

    # Returns a single column from a given matrix as a flat list.

    my ( $matrix,   # AoA data structure
         $col,      # column to get (0-based)
       ) = @_;

    # Returns a list;

    my ( $dims, $i, @list );

    $dims = &matrix_dims( $matrix );

    # Valid columns are 0 .. COLS - 1, so COLS itself is already outside.
    &Maasha::Common::error( qq(Column->$col outside of matrix->$dims->[ COLS ]) ) if $col >= $dims->[ COLS ];

    for ( $i = 0; $i < $dims->[ ROWS ]; $i++ ) {
        push @list, $matrix->[ $i ]->[ $col ];
    }

    return wantarray ? @list : \@list;
}


sub cols_get
{
    # Martin A. Hansen, April 2007

    # Returns a range of requested columns from a given matrix.
    # When the range covers all columns the rows of the result are the same
    # row references as in the given matrix; otherwise the rows are new lists.

    my ( $matrix,    # AoA data structure
         $col_beg,   # first column - OPTIONAL (default 0)
         $col_end,   # last column  - OPTIONAL (default last column)
       ) = @_;

    # returns AoA

    my ( $dims, @cols, $row, @AoA );

    $dims = &matrix_dims( $matrix );

    $col_beg ||= 0;

    if ( not defined $col_end ) {
        $col_end = $dims->[ COLS ] - 1;
    }

    if ( $col_end > $dims->[ COLS ] - 1 )
    {
        warn qq(WARNING: column end larger than matrix\n);
        $col_end = $dims->[ COLS ] - 1;
    }

    die qq(ERROR: column begin "$col_beg" larger than column end "$col_end"\n) if $col_end < $col_beg;

    if ( $col_beg == 0 and $col_end == $dims->[ COLS ] - 1 )
    {
        @AoA = @{ $matrix };
    }
    else
    {
        foreach $row ( @{ $matrix } )
        {
            @cols = @{ $row }[ $col_beg .. $col_end ];

            push @AoA, [ @cols ];
        }
    }

    return wantarray ? @AoA : \@AoA;
}


sub col_sum
{
    # Returns the sum of a given column in a matrix.
    # Dies if the column contains non-numerical values.

    my ( $matrix,   # AoA data structure
         $col,      # column to sum (0-based)
       ) = @_;

    # returns number

    my ( $list, $sum );

    $list = &cols_get( $matrix, $col, $col );
    $list = &matrix_flip( $list )->[ 0 ];

    # Only refuse columns that really are non-numerical; without the
    # condition this sub could never return a sum.
    die qq(ERROR: cannot sum non-nummerical column\n) if not &list_check_numeric( $list );

    $sum = &Maasha::Calc::sum( $list );

    return $sum;
}


sub rows_push
{
    # Martin A. Hansen, April 2007.

    # Appends one or more rows to a matrix.
    # The given matrix is modified in place.

    my ( $matrix,    # AoA data structure
         $rows,      # list of rows
       ) = @_;

    # returns AoA

    push @{ $matrix }, @{ $rows };

    return wantarray ? @{ $matrix } : $matrix;
}


sub rows_unshift
{
    # Martin A. Hansen, April 2007.

    # Prefixes one or more rows to a matrix.
    # The given matrix is modified in place.

    my ( $matrix,    # AoA data structure
         $rows,      # list of rows
       ) = @_;

    # returns AoA

    unshift @{ $matrix }, @{ $rows };

    return wantarray ? @{ $matrix } : $matrix;
}


sub cols_push
{
    # Martin A. Hansen, April 2007.

    # Appends one or more lists as columns to a matrix.
    # The columns are given as a matrix with the same number of rows;
    # row i of the columns is appended to row i of the matrix, in place.

    my ( $matrix,    # AoA data structure
         $cols,      # list of columns
       ) = @_;

    # returns AoA

    my ( $dims_matrix, $dims_cols, $i );

    $dims_matrix = &matrix_dims( $matrix );
    $dims_cols   = &matrix_dims( $cols );

    die qq(ERROR: Cannot merge columns with different row count\n) if $dims_matrix->[ ROWS ] != $dims_cols->[ ROWS ];

    for ( $i = 0; $i < $dims_matrix->[ ROWS ]; $i++ )
    {
        push @{ $matrix->[ $i ] }, @{ $cols->[ $i ] };
    }

    return wantarray ? @{ $matrix } : $matrix;
}


sub cols_unshift
{
    # Martin A. Hansen, April 2007.

    # Prefixes one or more lists as columns to a matrix.
    # The columns are given as a matrix with the same number of rows;
    # row i of the columns is prefixed to row i of the matrix, in place.

    my ( $matrix,    # AoA data structure
         $cols,      # list of columns
       ) = @_;

    # returns AoA

    my ( $dims_matrix, $dims_cols, $i );

    $dims_matrix = &matrix_dims( $matrix );
    $dims_cols   = &matrix_dims( $cols );

    die qq(ERROR: Cannot merge columns with different row count\n) if $dims_matrix->[ ROWS ] != $dims_cols->[ ROWS ];

    for ( $i = 0; $i < $dims_matrix->[ ROWS ]; $i++ ) {
        unshift @{ $matrix->[ $i ] }, @{ $cols->[ $i ] };
    }

    return wantarray ? @{ $matrix } : $matrix;
}


sub rows_rotate_left
{
    # Martin A. Hansen, April 2007.

    # Given a matrix and a range of rows, rotates these rows
    # left by shifting a given number of elements from
    # the first position to the last.
    # The selected rows of the given matrix are replaced in place.

    my ( $matrix,   # AoA data structure
         $beg,      # first row to shift
         $end,      # last row to shift
         $shift,    # number of shifts - DEFAULT=1
       ) = @_;

    # returns AoA

    my ( $dims, $i, $c, $row );

    $shift ||= 1;

    $dims = &matrix_dims( $matrix );

    die qq(ERROR: row outside matrix\n) if $end >= $dims->[ ROWS ];

    for ( $i = $beg; $i <= $end; $i++ )
    {
        # Always hand the row reference itself to list_rotate_left,
        # also on the second and later passes.
        $row = $matrix->[ $i ];

        for ( $c = 0; $c < $shift; $c++ ) {
            $row = &list_rotate_left( $row );
        }

        $matrix->[ $i ] = $row;
    }

    return wantarray ? @{ $matrix } : $matrix;
}


sub rows_rotate_right
{
    # Martin A. Hansen, April 2007.

    # Given a matrix and a range of rows, rotates these rows
    # right by shifting a given number of elements from the
    # last position to the first.
    # The selected rows of the given matrix are replaced in place.

    my ( $matrix,   # AoA data structure
         $beg,      # first row to shift
         $end,      # last row to shift
         $shift,    # number of shifts - DEFAULT=1
       ) = @_;

    # returns AoA

    my ( $dims, $i, $c, $row );

    $shift ||= 1;

    $dims = &matrix_dims( $matrix );

    die qq(ERROR: end < beg: $end < $beg\n) if $end < $beg;
    die qq(ERROR: row outside matrix\n)     if $end >= $dims->[ ROWS ];

    for ( $i = $beg; $i <= $end; $i++ )
    {
        # Always hand the row reference itself to list_rotate_right,
        # also on the second and later passes.
        $row = $matrix->[ $i ];

        for ( $c = 0; $c < $shift; $c++ ) {
            $row = &list_rotate_right( $row );
        }

        $matrix->[ $i ] = $row;
    }

    return wantarray ? @{ $matrix } : $matrix;
}


sub cols_rotate_up
{
    # Martin A. Hansen, April 2007.

    # Given a matrix and a range of columns, rotates these columns
    # up by moving the top cell of each selected column to the bottom.
    # The result is assembled as a new matrix from the columns before the
    # range, the rotated columns, and the columns after the range.

    my ( $matrix,   # AoA data structure
         $beg,      # first column to shift
         $end,      # last column to shift
         $shift,    # number of shifts - DEFAULT=1
       ) = @_;

    # returns AoA

    my ( $dims, $i, $c, $cols_pre, $col_select, $cols_post, $list );

    $shift ||= 1;

    $dims = &matrix_dims( $matrix );

    $cols_pre  = &cols_get( $matrix, 0, $beg - 1 ) if $beg > 0;
    $cols_post = &cols_get( $matrix, $end + 1, $dims->[ COLS ] - 1 ) if $end < $dims->[ COLS ] - 1;

    for ( $i = $beg; $i <= $end; $i++ )
    {
        $col_select = &cols_get( $matrix, $i, $i );

        # Column as a flat list, ordered top to bottom.
        $list = &matrix_flip( $col_select )->[ 0 ];

        for ( $c = 0; $c < $shift; $c++ ) {
            $list = &list_rotate_left( $list );
        }

        $col_select = &matrix_flip( [ $list ] );

        if ( $cols_pre ) {
            &cols_push( $cols_pre, $col_select );
        } else {
            $cols_pre = $col_select;
        }
    }

    &cols_push( $cols_pre, $cols_post ) if $cols_post;

    $matrix = $cols_pre;

    return wantarray ? @{ $matrix } : $matrix;
}


sub cols_rotate_down
{
    # Martin A. Hansen, April 2007.

    # Given a matrix and a range of columns, rotates these columns
    # down by moving the bottom cell of each selected column to the top.
    # The result is assembled as a new matrix from the columns before the
    # range, the rotated columns, and the columns after the range.

    my ( $matrix,   # AoA data structure
         $beg,      # first column to shift
         $end,      # last column to shift
         $shift,    # number of shifts - DEFAULT=1
       ) = @_;

    # returns AoA

    my ( $dims, $i, $c, $cols_pre, $col_select, $cols_post, $list );

    $shift ||= 1;

    $dims = &matrix_dims( $matrix );

    $cols_pre  = &cols_get( $matrix, 0, $beg - 1 ) if $beg > 0;
    $cols_post = &cols_get( $matrix, $end + 1, $dims->[ COLS ] - 1 ) if $end < $dims->[ COLS ] - 1;

    for ( $i = $beg; $i <= $end; $i++ )
    {
        $col_select = &cols_get( $matrix, $i, $i );

        # Column as a flat list, ordered top to bottom.
        $list = &matrix_flip( $col_select )->[ 0 ];

        for ( $c = 0; $c < $shift; $c++ ) {
            $list = &list_rotate_right( $list );
        }

        $col_select = &matrix_flip( [ $list ] );

        if ( $cols_pre ) {
            &cols_push( $cols_pre, $col_select );
        } else {
            $cols_pre = $col_select;
        }
    }

    &cols_push( $cols_pre, $cols_post ) if $cols_post;

    $matrix = $cols_pre;

    return wantarray ? @{ $matrix } : $matrix;
}


sub list_rotate_left
{
    # Martin A. Hansen, April 2007.

    # Given a list, shifts off the first element
    # and appends it to the list, which is returned.
    # The given list is not modified; an empty list stays empty.

    my ( $list,   # list to rotate
       ) = @_;

    # returns a list

    my ( @new_list );

    @new_list = @{ $list };

    push @new_list, shift @new_list if @new_list;

    return wantarray ? @new_list : \@new_list;
}


sub list_rotate_right
{
    # Martin A. Hansen, April 2007.

    # Given a list, pops off the last element
    # and prefixes it to the list, which is returned.
    # The given list is not modified; an empty list stays empty.

    my ( $list,   # list to rotate
       ) = @_;

    # returns a list

    my ( @new_list );

    @new_list = @{ $list };

    unshift @new_list, pop @new_list if @new_list;

    return wantarray ? @new_list : \@new_list;
}


sub list_check_numeric
{
    # Martin A. Hansen, April 2007.

    # Checks if a given list only contains
    # numerical elements. Returns 1 if numerical,
    # else 0. Accepted are integers and decimal numbers with
    # optional sign and optional exponent, e.g. 12, -3.5, 123.2312e-03.
    # An empty list counts as numerical.

    my ( $list,   # list to check
       ) = @_;

    # returns integer

    my ( $elem );

    foreach $elem ( @{ $list } )
    {
        return 0 if not defined $elem;
        return 0 if $elem !~ /^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$/;
    }

    return 1;
}


sub list_check_sort
{
    # Martin A. Hansen, April 2007.

    # Checks if a given list is sorted in ascending order.
    # If the sort type is not specified, the list is inspected with
    # list_check_numeric to choose between numerical and alphabetical order.
    # A given type starting with 'a' (any case) means alphabetical,
    # anything else means numerical.
    # Returns 1 if sorted else 0.

    my ( $list,   # list to check
         $type,   # numerical or alphabetical - OPTIONAL
       ) = @_;

    # returns integer

    my ( $i, $cmp );

    if ( not $type )
    {
        if ( &list_check_numeric( $list ) ) {
            $type = "n";
        } else {
            $type = "a";
        }
    }
    else
    {
        if ( $type =~ /^a.*/i ) {
            $type = "a";
        } else {
            $type = "n";
        }
    }

    if ( @{ $list } > 1 )
    {
        if ( $type eq "n" )
        {
            for ( $i = 1; $i < @{ $list }; $i++ )
            {
                $cmp = $list->[ $i - 1 ] <=> $list->[ $i ];

                return 0 if $cmp > 0;
            }
        }
        else
        {
            for ( $i = 1; $i < @{ $list }; $i++ )
            {
                $cmp = $list->[ $i - 1 ] cmp $list->[ $i ];

                return 0 if $cmp > 0;
            }
        }
    }

    return 1;
}


sub list_uniq
{
    # Martin A. Hansen, April 2007.

    # Returns the number of unique elements in a
    # given list (elements are compared as strings).

    my ( $list,   # list
       ) = @_;

    # returns integer

    my ( %hash, $count );

    @hash{ @{ $list } } = ();

    $count = scalar keys %hash;

    return $count;
}


sub tabulate
{
    # Martin A. Hansen, April 2007.

    # Prints a frequency table for a given column of a matrix:
    # one line per distinct value with the value (padded to the longest
    # value), its count, and its percentage of all rows. Values are sorted
    # numerically if the column is numerical, otherwise alphabetically.

    my ( $matrix,    # AoA data structure
         $col,       # column to tabulate (0-based)
       ) = @_;

    my ( $dims, $list, $i, $max, $len, %hash, $elem, @list );

    $dims = &matrix_dims( $matrix );

    $list = &cols_get( $matrix, $col, $col );
    $list = &matrix_flip( $list )->[ 0 ];

    $max = 0;

    for ( $i = 0; $i < @{ $list }; $i++ )
    {
        $hash{ $list->[ $i ] }++;

        $len = length $list->[ $i ];

        $max = $len if $len > $max;
    }

    @list = keys %hash;

    if ( &list_check_numeric( $list ) ) {
        @list = sort { $a <=> $b } @list;
    } else {
        @list = sort { $a cmp $b } @list;
    }

    foreach $elem ( @list )
    {
        print $elem, " " x ( $max - length( $elem ) ),
            sprintf( "   %6s   ", $hash{ $elem } ),
            sprintf( "%.2f\n", ( $hash{ $elem } / $dims->[ ROWS ] ) * 100 );
    }
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BINARY SEARCH <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


sub interval_search
{
    # Martin A. Hansen, February 2008.

    # Uses binary search to locate the interval containing a
    # given number. The intervals are defined by begin and end
    # positions in separate columns in a matrix. If an interval is
    # found then the index of that matrix row is returned, otherwise
    # -1 is returned.
    # As for any binary search the rows must be ordered by interval position.

    my ( $matrix,   # data structure
         $col1,     # column with interval begins
         $col2,     # column with interval ends
         $num,      # number to search for
       ) = @_;

    # Returns an integer.

    my ( $high, $low, $try );

    $low  = 0;
    $high = @{ $matrix };

    # Invariant: a matching row, if any, has index in [ $low, $high ).
    while ( $low < $high )
    {
        $try = int( ( $high + $low ) / 2 );

        # print "num->$num   low->$low   high->$high   try->$try   int1->$matrix->[ $try ]->[ $col1 ]   int2->$matrix->[ $try ]->[ $col2 ]\n";

        if ( $num < $matrix->[ $try ]->[ $col1 ] )
        {
            $high = $try;
        }
        elsif ( $num > $matrix->[ $try ]->[ $col2 ] )
        {
            $low = $try + 1;
        }
        else
        {
            return $try;
        }
    }

    return -1;
}


sub list_search
{
    # Martin A. Hansen, February 2008.

    # Uses binary search to locate a number in a list of numbers.
    # If the number is found, then the index (the position of the number
    # in the list) is returned, otherwise -1 is returned.
    # As for any binary search the list must be sorted in ascending order.

    my ( $list,   # list of numbers
         $num,    # number to search for
       ) = @_;

    # Returns an integer.

    my ( $high, $low, $try );

    $low  = 0;
    $high = @{ $list };

    # Invariant: the number, if present, has index in [ $low, $high ).
    while ( $low < $high )
    {
        $try = int( ( $high + $low ) / 2 );

        # print "num->$num   low->$low   high->$high   try->$try   int->$list->[ $try ]\n";

        if ( $num < $list->[ $try ] )
        {
            $high = $try;
        }
        elsif ( $num > $list->[ $try ] )
        {
            $low = $try + 1;
        }
        else
        {
            return $try;
        }
    }

    return -1;
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DISK SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


sub matrix_read
{
    # Martin A. Hansen, April 2007

    # Reads tabular data from file into a matrix
    # AoA data structure.
    # The delimiter is used as a regex when splitting lines, and the
    # comment regex is anchored to the beginning of the line.

    my ( $path,        # full path to file with data
         $delimiter,   # column delimiter - OPTIONAL (default tab)
         $comments,    # regex for comment lines to skip - OPTIONAL
         $fields_ok,   # list of fields to accept - OPTIONAL
       ) = @_;

    # returns AoA

    my ( $fh, $line, @fields, @AoA );

    $delimiter ||= "\t";

    $fh = &Maasha::Common::read_open( $path );

    while ( $line = <$fh> )
    {
        chomp $line;

        next if $comments and $line =~ /^$comments/;

        # split without a limit drops trailing empty fields, so lines ending
        # in empty columns yield shorter rows.
        @fields = split /$delimiter/, $line;

        # NOTE(review): $fields_ok is described as the fields to accept, but
        # the splice below removes the listed field indexes one by one (and
        # each removal shifts the indexes after it) - confirm the intent.
        map { splice( @fields, $_, 1 ) } @{ $fields_ok } if $fields_ok;

        push @AoA, [ @fields ];
    }

    close $fh;

    return wantarray ? @AoA : \@AoA;
}


sub matrix_write
{
    # Martin A. Hansen, April 2007

    # Writes a tabular data structure to STDOUT or file,
    # one row per line with columns joined by the delimiter.

    my ( $matrix,      # AoA data structure
         $path,        # full path to output file - OPTIONAL (default STDOUT)
         $delimiter,   # column delimiter - OPTIONAL (default tab)
       ) = @_;

    my ( $fh, $row );

    $fh = &Maasha::Common::write_open( $path ) if $path;

    $delimiter ||= "\t";

    foreach $row ( @{ $matrix } )
    {
        if ( $fh ) {
            print $fh join( $delimiter, @{ $row } ), "\n";
        } else {
            print join( $delimiter, @{ $row } ), "\n";
        }
    }

    close $fh if $fh;
}


sub matrix_store
{
    # Martin A. Hansen, April 2007.

    # Stores a matrix to a binary file
    # using Maasha::Common::file_store.

    my ( $path,      # full path to file
         $matrix,    # data structure
       ) = @_;

    &Maasha::Common::file_store( $path, $matrix );
}


sub matrix_retrive
{
    # Martin A. Hansen, April 2007.

    # Retrieves a matrix from a binary file
    # using Maasha::Common::file_retrieve.

    my ( $path,   # full path to file
       ) = @_;

    # returns AoA

    my $matrix = &Maasha::Common::file_retrieve( $path );

    return wantarray ? @{ $matrix } : $matrix;
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


1;   # explicit true value, so loading does not depend on the last statement run

__END__
diff --git a/code_perl/Maasha/NCBI.pm b/code_perl/Maasha/NCBI.pm new file mode 100644 index 0000000..3c8f581 --- /dev/null +++ b/code_perl/Maasha/NCBI.pm @@ -0,0 +1,428 @@ +package Maasha::NCBI; + +# Copyright (C) 2007 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# http://www.gnu.org/copyleft/gpl.html


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


# Stuff for interacting with NCBI Entrez and for parsing files in SOFT format.


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


use strict;
use Data::Dumper;
use LWP::Simple;
use Maasha::Common;

use vars qw( @ISA @EXPORT );

@ISA = qw( Exporter );


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


sub get_entry
{
    # Martin A. Hansen, March 2007.

    # Connects to the ncbi website and retrieves a genbank record,
    # which is returned. The value is whatever LWP::Simple's get
    # returns for the efetch URL.

    my ( $db,    # database
         $id,    # genbank id
         $type,  # retrieval type
       ) = @_;

    # returns string

    my ( $content );

    $content = get "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=$db&id=$id&rettype=$type";

    return $content;
}


sub get_seq
{
    # Martin A. Hansen, March 2007.

    # Connects to the ncbi website and retrieves a genbank record,
    # from which the sequence is parsed and returned.
    # The sequence is taken from the lines between the ORIGIN line and
    # the terminating // line (or the end of the record if // is missing),
    # with position numbers and spaces removed.
    # Dies if the record cannot be retrieved or has no ORIGIN line.

    my ( $db,    # database
         $id,    # genbank id
         $type,  # retrieval type
       ) = @_;

    # returns string

    my ( $content, @lines, $i, $seq );

    $content = &get_entry( $db, $id, $type );

    die qq(ERROR: could not retrieve entry "$id" from database "$db"\n) if not defined $content;

    @lines = split "\n", $content;

    $i = 0;

    # The index is bounded by the number of lines - otherwise a record
    # without an ORIGIN line would make this loop run forever.
    while ( $i < @lines and $lines[ $i ] !~ /^ORIGIN/ ) {
        $i++
    }

    die qq(ERROR: no ORIGIN line in entry "$id" from database "$db"\n) if $i >= @lines;

    $i++;

    $seq = "";

    while ( $i < @lines and $lines[ $i ] !~ /^\/\// )
    {
        $lines[ $i ] =~ s/^\s*\d+//;

        $seq .= $lines[ $i ];

        $i++;
    }

    $seq =~ tr/ //d;

    return $seq;
}


sub soft_parse
{
    # Martin A. Hansen, February 2008.

    # !!! NOT USED ANYMORE !!! #

    # Reads in and parses a file in SOFT format.
    # The platform table maps IDs to sequences; for each row in each
    # sample table a record is created with the keys SAMPLE_TITLE, SEQ
    # and SEQ_NAME (platform id, sample id, row number and value joined by _).

    my ( $path,   # full path to SOFT file
       ) = @_;

    # Returns a list.

    my ( $fh, @lines, $i, $c, $num, %key_hash, @fields, %id_hash, $id, $seq, $count, $record, @records, $platform_id, $sample_id, $sample_title );

    $fh = &Maasha::Common::read_open( $path );

    @lines = <$fh>;

    close $fh;

    chomp @lines;

    $i = 0;

    $num = 1;

    while ( $i < @lines )
    {
        if ( $lines[ $i ] =~ /^\^PLATFORM = (.+)/ )
        {
            $platform_id = $1;
        }
        elsif ( $lines[ $i ] =~ /^!platform_table_begin$/ )
        {
            # The line after the begin marker holds the column names.
            @fields = split "\t", $lines[ $i + 1 ];

            for ( $c = 0; $c < @fields; $c++ ) {
                $key_hash{ $fields[ $c ] } = $c;
            }

            $c = $i + 2;

            # Bounded by the number of lines in case the end marker is missing.
            while ( $c < @lines and $lines[ $c ] !~ /^!platform_table_end$/ )
            {
                @fields = split "\t", $lines[ $c ];

                $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "SEQUENCE" } ];

                $c++;
            }

            $i = $c;
        }
        elsif ( $lines[ $i ] =~ /^\^SAMPLE = (.+)/ )
        {
            $sample_id = $1;
        }
        elsif ( $lines[ $i ] =~ /^!Sample_title = (.+)/ )
        {
            $sample_title = $1;
        }
        elsif ( $lines[ $i ] =~ /^!sample_table_begin/ )
        {
            undef %key_hash;

            @fields = split "\t", $lines[ $i + 1 ];

            for ( $c = 0; $c < @fields; $c++ ) {
                $key_hash{ $fields[ $c ] } = $c;
            }

            $c = $i + 2;

            # Bounded by the number of lines in case the end marker is missing.
            while ( $c < @lines and $lines[ $c ] !~ /^!sample_table_end$/ )
            {
                undef $record;

                @fields = split "\t", $lines[ $c ];

                $id    = $fields[ $key_hash{ "ID_REF" } ];
                $seq   = $id_hash{ $id };
                $count = $fields[ $key_hash{ "VALUE" } ];

                $seq =~ tr/./N/;

                $record->{ "SAMPLE_TITLE" } = $sample_title;
                $record->{ "SEQ" }          = $seq;
                $record->{ "SEQ_NAME" }     = join( "_", $platform_id, $sample_id, $num, $count );

                push @records, $record;

                $c++;
                $num++;
            }

            $i = $c;

            $num = 1;
        }

        $i++;
    }

    return wantarray ? @records : \@records;
}


sub soft_index_file
{
    # Martin A. Hansen, June 2008.

    # Create an index with line numbers of the different tables
    # in a soft file. The index is returned.
    # Each index entry is a tuple [ header line, first line number, last line number ]
    # where a section starts at a line beginning with ^ and ends on the line
    # before the next such line (or on the last line of the file).
    # Line numbers are 0-based. A file without ^ lines gives an empty index.

    my ( $file,   # file to index
       ) = @_;

    # Returns a list of tuples.

    my ( $fh, $line, $i, @index );

    $fh = &Maasha::Common::read_open( $file );

    $i = 0;

    while ( $line = <$fh> )
    {
        chomp $line;

        if ( $line =~ /^\^/ )
        {
            # A new section closes the previous one on the preceding line.
            push @{ $index[ -1 ] }, $i - 1 if @index;

            push @index, [ $line, $i ];
        }

        $i++;
    }

    # The last section runs to the end of the file.
    push @{ $index[ -1 ] }, $i - 1 if @index;

    close $fh;

    return wantarray ? @index : \@index;
}


sub soft_get_platform
{
    # Martin A. Hansen, June 2008.

    # Given a filehandle to a SOFT file parses the platform table
    # which is returned as a hash with ID as key and SEQUENCE as value.
    # Lines are counted from 0 starting at the current position of the
    # filehandle, and reading stops after line number $end.

    my ( $fh,    # filehandle
         $beg,   # line number where platform tables begin
         $end,   # line number where platform tables end
       ) = @_;

    # Returns hashref

    my ( $line, @lines, $i, $c, @fields, %key_hash, %id_hash );

    $i = 0;

    while ( $line = <$fh> )
    {
        chomp $line;

        push @lines, $line if $i >= $beg;

        last if $i == $end;

        $i++;
    }

    $i = 0;

    while ( $i < @lines )
    {
        if ( $lines[ $i ] =~ /^!platform_table_begin$/ )
        {
            # The line after the begin marker holds the column names.
            @fields = split "\t", $lines[ $i + 1 ];

            for ( $c = 0; $c < @fields; $c++ ) {
                $key_hash{ $fields[ $c ] } = $c;
            }

            $c = $i + 2;

            # Bounded by the number of lines in case the end marker is
            # missing or lies beyond the requested range.
            while ( $c < @lines and $lines[ $c ] !~ /^!platform_table_end$/ )
            {
                @fields = split "\t", $lines[ $c ];

                $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "SEQUENCE" } ];

                $c++;
            }

            $i = $c;
        }

        $i++;
    }

    return wantarray ? %id_hash : \%id_hash;
}


sub soft_get_sample
{
    # Martin A. Hansen, June 2008.

    # Given a filehandle to a SOFT file parses the sample table and
    # returns one record per table row. Each record has the keys
    # SAMPLE_TITLE, SEQ (looked up in the platform table by ID_REF, with
    # . replaced by N) and SEQ_NAME (platform id, sample id, row number
    # and value joined by _).
    # Lines are counted from 0 starting at the current position of the
    # filehandle, and reading stops after line number $end.

    my ( $fh,           # filehandle
         $plat_table,   # hashref with platform tables
         $beg,          # line number where sample table begin
         $end,          # line number where sample table end
       ) = @_;

    # Returns a list of hashrefs

    my ( $line, @lines, $i, $c, $platform_id, @fields, %key_hash, $num, $sample_id, $sample_title, $id, $seq, $count, @records, $record );

    $i = 0;

    while ( $line = <$fh> )
    {
        chomp $line;

        push @lines, $line if $i >= $beg;

        last if $i == $end;

        $i++;
    }

    $i = 0;

    $num = 1;

    while ( $i < @lines )
    {
        if ( $lines[ $i ] =~ /^\^SAMPLE = (.+)/ )
        {
            $sample_id = $1;
        }
        elsif ( $lines[ $i ] =~ /!Sample_platform_id = (.+)/ )
        {
            $platform_id = $1;
        }
        elsif ( $lines[ $i ] =~ /^!Sample_title = (.+)/ )
        {
            $sample_title = $1;
        }
        elsif ( $lines[ $i ] =~ /^!sample_table_begin/ )
        {
            undef %key_hash;

            # The line after the begin marker holds the column names.
            @fields = split "\t", $lines[ $i + 1 ];

            for ( $c = 0; $c < @fields; $c++ ) {
                $key_hash{ $fields[ $c ] } = $c;
            }

            $c = $i + 2;

            # Bounded by the number of lines in case the end marker is
            # missing or lies beyond the requested range.
            while ( $c < @lines and $lines[ $c ] !~ /^!sample_table_end$/ )
            {
                undef $record;

                @fields = split "\t", $lines[ $c ];

                $id    = $fields[ $key_hash{ "ID_REF" } ];
                $seq   = $plat_table->{ $id };
                $count = $fields[ $key_hash{ "VALUE" } ];

                $seq =~ tr/./N/;

                $record->{ "SAMPLE_TITLE" } = $sample_title;
                $record->{ "SEQ" }          = $seq;
                $record->{ "SEQ_NAME" }     = join( "_", $platform_id, $sample_id, $num, $count );

                push @records, $record;

                $c++;
                $num++;
            }

            $i = $c;

            $num = 1;
        }

        $i++;
    }

    return wantarray ? @records : \@records;
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


1;   # explicit true value, so loading does not depend on the last statement run

__END__
diff --git a/code_perl/Maasha/Patscan.pm b/code_perl/Maasha/Patscan.pm new file mode 100644 index 0000000..7aeb3c3 --- /dev/null +++ b/code_perl/Maasha/Patscan.pm @@ -0,0 +1,167 @@ +package Maasha::Patscan; + +# Copyright (C) 2007 Martin A. Hansen.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# http://www.gnu.org/copyleft/gpl.html


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


# This module contains routines for reading and splitting search patterns
# and for parsing scan_for_matches results.


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


use strict;
use Data::Dumper;
use Maasha::Common;
use Maasha::Seq;
use vars qw ( @ISA @EXPORT );

@ISA = qw( Exporter );


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


sub read_patterns
{
    # Martin A. Hansen, August 2007.

    # Read a list of patterns from file with one pattern
    # per line. Empty lines are skipped.

    my ( $path,   # full path to file
       ) = @_;

    # Returns list.

    my ( $fh, $line, @patterns );

    $fh = &Maasha::Common::read_open( $path );

    while ( $line = <$fh> )
    {
        chomp $line;

        next if $line eq "";

        push @patterns, $line;
    }

    close $fh;

    return wantarray ? @patterns : \@patterns;
}


sub parse_patterns
{
    # Martin A. Hansen, November 2007.

    # Splits a comma separated string of patterns without breaking
    # patterns containing commas inside square brackets, e.g. [1,0,0].
    # As with split, empty patterns at the end of the string are dropped.

    my ( $str,   # comma separated list of patterns
       ) = @_;

    # Returns a list.

    my ( $i, $char, $brackets, $pattern, @patterns );

    $brackets = 0;
    $pattern  = "";

    for ( $i = 0; $i < length $str; $i++ )
    {
        $char = substr $str, $i, 1;

        # Only a comma outside brackets separates two patterns. Tracking
        # the bracket depth while splitting leaves every other character
        # of the patterns untouched.
        if ( $char eq "," and $brackets == 0 )
        {
            push @patterns, $pattern;

            $pattern = "";

            next;
        }

        if ( $char eq "[" ) {
            $brackets++;
        } elsif ( $char eq "]" ) {
            $brackets--;
        }

        $pattern .= $char;
    }

    push @patterns, $pattern;

    pop @patterns while @patterns and $patterns[ -1 ] eq "";

    return wantarray ? @patterns : \@patterns;
}


sub parse_scan_result
{
    # Martin A. Hansen, January 2007.

    # Parses scan_for_matches results.
    # The header of the given entry is expected to end in :[beg,end];
    # if beg is larger than end the two are swapped and the strand is
    # set to -. If the header cannot be parsed a warning is issued and
    # an empty hash is returned.

    my ( $entry,     # FASTA tuple
         $pattern,   # pattern used in patscan
       ) = @_;

    # Returns hash.

    my ( $head, $seq, $beg, $end, $len, $strand, %match );

    ( $head, $seq ) = @{ $entry };

    if ( $head =~ /^(.+):\[(\d+),(\d+)\]$/ )
    {
        $head = $1;
        $beg  = $2;
        $end  = $3;

        if ( $beg > $end )
        {
            ( $beg, $end ) = ( $end, $beg );

            $strand = "-";
        }
        else
        {
            $strand = "+";
        }

        $len = $end - $beg + 1;

        %match = (
            "REC_TYPE"  => "PATSCAN",
            "PATTERN"   => $pattern,
            "Q_ID"      => $pattern,
            "S_ID"      => $head,
            "S_BEG"     => $beg - 1, # sfm is 1-based
            "S_END"     => $end - 1, # sfm is 1-based
            "MATCH_LEN" => $len,
            "SCORE"     => 100,
            "STRAND"    => $strand,
            "HIT"       => $seq,
        );
    }
    else
    {
        warn qq(WARNING: Could not parse match header->$head<-\n);
    }

    return wantarray ? %match : \%match;
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


1;   # explicit true value, so loading does not depend on the last statement run

__END__
diff --git a/code_perl/Maasha/Plot.pm b/code_perl/Maasha/Plot.pm new file mode 100644 index 0000000..8c76a53 --- /dev/null +++ b/code_perl/Maasha/Plot.pm @@ -0,0 +1,1028 @@ +package Maasha::Plot; + +# Copyright (C) 2007 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines to plot stuff with Gnuplot and SVG.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Data::Dumper;
+use SVG;
+use IPC::Open2;
+use Time::HiRes qw( gettimeofday );
+use Maasha::Common;
+use Maasha::Calc;
+use Maasha::Seq;   # seq_logo and svg_draw_logo call Maasha::Seq routines
+use vars qw ( @ISA @EXPORT );
+
+use constant {
+    WIDTH  => 800,
+    HEIGHT => 600,
+    MARGIN => 40,
+};
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub _gnuplot_labels
+{
+    # Builds the Gnuplot commands for the optional title and axis labels.
+
+    my ( $options,   # options hash
+       ) = @_;
+
+    # Returns a list of command strings (without newline).
+
+    my ( @cmds );
+
+    push @cmds, "set title \"$options->{ 'title' }\""   if $options->{ "title" };
+    push @cmds, "set xlabel \"$options->{ 'xlabel' }\"" if $options->{ "xlabel" };
+    push @cmds, "set ylabel \"$options->{ 'ylabel' }\"" if $options->{ "ylabel" };
+
+    return @cmds;
+}
+
+
+sub _gnuplot_run
+{
+    # Starts a Gnuplot process, feeds it the given lines on stdin
+    # (commands and any inline data), and collects what it prints.
+
+    my ( $cmds,   # list of lines to send to Gnuplot
+       ) = @_;
+
+    # Returns a list of chomped output lines.
+
+    my ( $pid, $fh_in, $fh_out, $cmd, $line, @lines );
+
+    $pid = open2( $fh_out, $fh_in, "gnuplot" );
+
+    foreach $cmd ( @{ $cmds } ) {
+        print $fh_in "$cmd\n";
+    }
+
+    # Closing stdin also terminates inline data following "plot '-'".
+
+    close $fh_in;
+
+    while ( $line = <$fh_out> )
+    {
+        chomp $line;
+
+        push @lines, $line;
+    }
+
+    close $fh_out;
+
+    waitpid $pid, 0;
+
+    return @lines;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> LINEPLOTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub lineplot_simple
+{
+    # Martin A. Hansen, January 2008.
+
+    # Plots a simple lineplot using Gnuplot.
+
+    my ( $data,      # data table - each column will be plotted as one line.
+         $options,   # options hash
+         $tmp_dir,   # temporary directory - defaults to $ENV{ 'TMP_DIR' }
+       ) = @_;
+
+    # Returns a list in list context, otherwise a list reference.
+
+    my ( $tmp_file, $fh_out, $row, $i, @plot_cmd, @cmds, @lines );
+
+    $tmp_dir ||= $ENV{ 'TMP_DIR' };
+
+    # The process id keeps concurrent runs from overwriting each others file.
+
+    $tmp_file = "$tmp_dir/lineplot_simple.$$.tab";
+
+    $fh_out = Maasha::Common::write_open( $tmp_file );
+
+    foreach $row ( @{ $data } ) {
+        print $fh_out join( "\t", @{ $row } ), "\n";
+    }
+
+    close $fh_out;
+
+    $options->{ "terminal" } ||= "dumb";
+
+    push @cmds, "set terminal $options->{ 'terminal' }";
+    push @cmds, _gnuplot_labels( $options );
+    push @cmds, "set grid" if $options->{ "terminal" } ne "dumb";
+    push @cmds, "set autoscale";
+    push @cmds, "unset key";
+    push @cmds, "set xtics border in scale 0 nomirror rotate by 90 offset character 0, 0, 0";
+
+    # One plot clause per column; Gnuplot columns are 1-based.
+
+    for ( $i = 1; $i <= scalar @{ $data->[ 0 ] || [] }; $i++ ) {
+        push @plot_cmd, qq("$tmp_file" using $i with lines ls 1);
+    }
+
+    push @cmds, "plot " . join( ", ", @plot_cmd );
+
+    @lines = _gnuplot_run( \@cmds );
+
+    unlink $tmp_file;
+
+    return wantarray ? @lines : \@lines;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> HISTOGRAMS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub histogram_simple
+{
+    # Martin A. Hansen, August 2007.
+
+    # Plots a simple histogram using Gnuplot.
+
+    my ( $data,      # list of [ xlabel, data value ] tuples
+         $options,   # options hash
+       ) = @_;
+
+    # Returns a list in list context, otherwise a list reference.
+
+    my ( $tuple, @cmds, @lines );
+
+    $options->{ "terminal" } ||= "dumb";
+
+    push @cmds, "set terminal $options->{ 'terminal' }";
+    push @cmds, _gnuplot_labels( $options );
+    push @cmds, "set autoscale";
+    push @cmds, "unset key";
+    push @cmds, "set style fill solid";
+    push @cmds, "set style histogram title offset character 0, 0, 0";
+    push @cmds, "set style data histograms";
+    push @cmds, "set xtics border in scale 0 nomirror rotate by 90 offset character 0, 0, 0";
+    push @cmds, "plot '-' using 2:xticlabels(1)";
+
+    # Inline data: quoted label and value separated by a tab.
+
+    foreach $tuple ( @{ $data } ) {
+        push @cmds, join( "\t", "\"$tuple->[ 0 ]\"", $tuple->[ 1 ] );
+    }
+
+    @lines = _gnuplot_run( \@cmds );
+
+    return wantarray ? @lines : \@lines;
+}
+
+
+sub histogram_lendist
+{
+    # Martin A. Hansen, August 2007.
+
+    # Plots a length distribution histogram using Gnuplot. The spacing
+    # of the x-axis tics is chosen from the label of the last tuple.
+
+    my ( $data,      # list of [ xlabel, data value ] tuples
+         $options,   # options hash
+       ) = @_;
+
+    # Returns a list in list context, otherwise a list reference.
+
+    my ( $max, $limit, $xtic_space, $tuple, @spacing, @cmds, @lines );
+
+    $options->{ "terminal" } ||= "dumb";
+
+    $max = @{ $data } ? $data->[ -1 ]->[ 0 ] : 0;
+
+    # [ upper limit of last label, tic spacing ]
+
+    @spacing = (
+        [ 10,     1 ],
+        [ 100,    5 ],
+        [ 250,    10 ],
+        [ 500,    20 ],
+        [ 1000,   50 ],
+        [ 2500,   100 ],
+        [ 5000,   250 ],
+        [ 10000,  500 ],
+        [ 50000,  1000 ],
+        [ 100000, 5000 ],
+    );
+
+    foreach $limit ( @spacing )
+    {
+        if ( $max <= $limit->[ 0 ] )
+        {
+            $xtic_space = $limit->[ 1 ];
+
+            last;
+        }
+    }
+
+    # Beyond the table keep about 20 tics, so that the "set xtics"
+    # command below always gets a valid increment.
+
+    $xtic_space ||= int( $max / 20 );
+
+    push @cmds, "set terminal $options->{ 'terminal' }";
+    push @cmds, _gnuplot_labels( $options );
+    push @cmds, "set autoscale";
+    push @cmds, "unset key";
+    push @cmds, "set style fill solid";
+    push @cmds, "set style histogram clustered gap 1 title offset character 0, 0, 0";
+    push @cmds, "set style data histograms";
+    push @cmds, "set xtics 0,$xtic_space border out nomirror";
+    push @cmds, "plot '-' using 1";
+
+    # Only the data values are plotted; the labels in the
+    # callers data structure are left untouched.
+
+    foreach $tuple ( @{ $data } ) {
+        push @cmds, $tuple->[ 1 ];
+    }
+
+    @lines = _gnuplot_run( \@cmds );
+
+    return wantarray ? @lines : \@lines;
+}
+
+
+sub histogram_chrdist
+{
+    # Martin A. Hansen, August 2007.
+
+    # Plots a histogram using Gnuplot. The plot is set up exactly as in
+    # histogram_simple, which does the work.
+
+    my ( $data,      # list of [ xlabel, data value ] tuples
+         $options,   # options hash
+       ) = @_;
+
+    # Returns a list in list context, otherwise a list reference.
+
+    return histogram_simple( $data, $options );
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DOTPLOT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub dotplot_matches
+{
+    # Martin A. Hansen, August 2007.
+
+    # Generates a dotplot from a list of matches using Gnuplot.
+    # Matches whose DIR begins with "f" are drawn as forward matches,
+    # all others as reverse matches. Nothing is plotted unless
+    # $options->{ "direction" } begins with "b", "f" or "r".
+
+    my ( $matches,   # list of hashrefs with Q_BEG, Q_END, S_BEG, S_END and DIR
+         $options,   # options hash
+         $tmp_dir,   # temporary directory - defaults to $ENV{ 'TMP_DIR' }
+       ) = @_;
+
+    # Returns a list in list context, otherwise a list reference.
+
+    my ( $forward_file, $backward_file, $fh_forward, $fh_backward,
+         $match, $q_max, $s_max, @cmds, @lines );
+
+    $tmp_dir ||= $ENV{ 'TMP_DIR' };
+
+    # The process id keeps concurrent runs from overwriting each others files.
+
+    $forward_file  = "$tmp_dir/match_f.$$.tab";
+    $backward_file = "$tmp_dir/match_r.$$.tab";
+
+    $fh_forward  = Maasha::Common::write_open( $forward_file );
+    $fh_backward = Maasha::Common::write_open( $backward_file );
+
+    $q_max = 0;
+    $s_max = 0;
+
+    foreach $match ( @{ $matches } )
+    {
+        # Positions are written 1-based; two blank lines separate line segments.
+
+        if ( $match->{ "DIR" } =~ /^f/ )
+        {
+            print $fh_forward join( "\t", $match->{ "Q_BEG" } + 1, $match->{ "S_BEG" } + 1 ), "\n";
+            print $fh_forward join( "\t", $match->{ "Q_END" } + 1, $match->{ "S_END" } + 1 ), "\n";
+            print $fh_forward "\n\n";
+        }
+        else
+        {
+            print $fh_backward join( "\t", $match->{ "Q_BEG" } + 1, $match->{ "S_END" } + 1 ), "\n";
+            print $fh_backward join( "\t", $match->{ "Q_END" } + 1, $match->{ "S_BEG" } + 1 ), "\n";
+            print $fh_backward "\n\n";
+        }
+
+        $q_max = $match->{ "Q_END" } if $match->{ "Q_END" } > $q_max;
+        $s_max = $match->{ "S_END" } if $match->{ "S_END" } > $s_max;
+    }
+
+    $q_max++;
+    $s_max++;
+
+    close $fh_forward;
+    close $fh_backward;
+
+    $options->{ "terminal" } ||= "dumb";
+
+    push @cmds, "set terminal $options->{ 'terminal' }";
+    push @cmds, "set xrange [1:$q_max]";
+    push @cmds, "set yrange [1:$s_max]";
+    push @cmds, _gnuplot_labels( $options );
+    push @cmds, "unset key";
+
+    if ( $options->{ "terminal" } ne "dumb" )
+    {
+        push @cmds, "set style line 1 linetype 1 linecolor rgb \"green\" linewidth 2 pointtype 6 pointsize default";
+        push @cmds, "set style line 2 linetype 1 linecolor rgb \"red\" linewidth 2 pointtype 6 pointsize default";
+    }
+
+    push @cmds, "set xtics border out";
+    push @cmds, "set ytics border out";
+    push @cmds, "set grid";
+
+    if ( $options->{ "direction" } =~ /^b/ ) {
+        push @cmds, qq(plot "$forward_file" with lines ls 1, "$backward_file" with lines ls 2);
+    } elsif ( $options->{ "direction" } =~ /^f/ ) {
+        push @cmds, qq(plot "$forward_file" with lines ls 1);
+    } elsif ( $options->{ "direction" } =~ /^r/ ) {
+        push @cmds, qq(plot "$backward_file" with lines ls 2);
+    }
+
+    @lines = _gnuplot_run( \@cmds );
+
+    unlink $forward_file;
+    unlink $backward_file;
+
+    return wantarray ? @lines : \@lines;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> KARYOGRAM <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub karyogram
+{
+    # Martin A. Hansen, August 2007.
+
+    # Plot hits on a karyogram for a given genome.
+
+    my ( $data,      # hits - used by chromosome_layout as a hashref with a
+                     # list of [ beg, end, color ] per chromosome
+         $options,   # hashref with options: "genome" selects the human cytoband
+                     # file when "human", otherwise the mouse file; the optional
+                     # "karyo_file" overrides the path to the cytoband file.
+       ) = @_;
+
+    # Returns string
+
+    my ( $karyo_file, $svg, $karyo );
+
+    # NOTE(review): the default paths point into the original authors home
+    # directory - pass "karyo_file" on any other installation.
+
+    if ( $options->{ "karyo_file" } ) {
+        $karyo_file = $options->{ "karyo_file" };
+    } elsif ( $options->{ "genome" } eq "human" ) {
+        $karyo_file = "/Users/m.hansen/maasha/perl_scripts/biotools/karyo_data/human_cytobands.txt";
+    } else {
+        $karyo_file = "/Users/m.hansen/maasha/perl_scripts/biotools/karyo_data/mouse_cytobands.txt";
+    }
+
+    $karyo = parse_karyo_data( $karyo_file );
+
+    $svg = init_svg();
+
+    chromosome_layout( $svg, $karyo, $data );
+
+    return $svg->xmlify;
+}
+
+
+sub parse_karyo_data
+{
+    # Parses a tab separated cytoband file with lines of
+    # chromosome, begin, end, band name and stain type, e.g.:
+
+    # X q26.1 129700001 130200000 gneg
+
+    # color: /etc/X11/rgb.txt
+
+    my ( $file,   # path to cytoband file
+       ) = @_;
+
+    # Returns a hash (or hashref in scalar context) with a list of
+    # [ name, beg, end, color, stain ] per chromosome.
+
+    my ( $fh, $chr, $line, $name, $beg, $end, $stain, %features, %color_hash );
+
+    %color_hash = (
+        acen    => "DarkGrey",
+        gneg    => "white",
+        gpos100 => "black",
+        gpos75  => "DarkGrey",
+        gpos66  => "DarkGrey",
+        gpos50  => "grey",
+        gpos33  => "LightGrey",
+        gpos25  => "LightGrey",
+        gvar    => "LightGrey",
+        stalk   => "DarkGrey",
+    );
+
+    $fh = Maasha::Common::read_open( $file );
+
+    while ( $line = <$fh> )
+    {
+        chomp $line;
+
+        next if $line eq "" or $line =~ /^#/;
+
+        ( $chr, $beg, $end, $name, $stain ) = split "\t", $line;
+
+        # The stain type is kept next to the color, because several stain
+        # types share one color and find_cent needs the "acen" bands.
+
+        push @{ $features{ $chr } }, [ $name, $beg, $end, $color_hash{ $stain }, $stain ];
+    }
+
+    close $fh;
+
+    return wantarray ? %features : \%features;
+}
+
+
+sub color_intensity
+{
+    # Martin A. Hansen, September 2007.
+
+    # Converts a gray scale intensity in percent to rgb.
+
+    my ( $percent,   # color intensity
+       ) = @_;
+
+    # Returns string of the form #rrggbb
+
+    my ( $num, $hex );
+
+    $num = int( $percent * 256 / 100 ) - 1;
+
+    # Keep the channel value inside 0-255; 0 percent would otherwise give -1.
+
+    $num = 0   if $num < 0;
+    $num = 255 if $num > 255;
+
+    # Two hex digits per channel - a single digit would change the color.
+
+    $hex = sprintf "%02x", $num;
+
+    return "#$hex$hex$hex";
+}
+
+
+sub init_svg
+{
+    # Martin A. Hansen, September 2005.
+
+    # initializes svg image.
+
+    # returns an image object
+
+    my $svg = SVG->new(
+        width  => WIDTH,
+        height => HEIGHT,
+        style  => {
+            'stroke-width' => 1,
+            stroke         => "black",
+            font           => 'Helvetica',
+        },
+    );
+
+    return $svg;
+}
+
+
+sub chromosome_layout
+{
+    # Martin A. Hansen, January 2004 - August 2007.
+
+    # Plots all chromosomes in a single image. Numbered chromosomes are
+    # drawn in numerical order followed by chrX and chrY when present.
+
+    # NB: the coordinates in $karyo_list and $feat_list are scaled in place.
+
+    my ( $svg,          # image object
+         $karyo_list,   # hashref with karyo data
+         $feat_list,    # hashref with features
+       ) = @_;
+
+    # Returns nothing - the chromosomes are added to the image object.
+
+    my ( $layout_obj, $i, $x, $y, $max, $factor, $chr_len, $chr_width, $chr_cent, $chr, $elem, @list );
+
+    $layout_obj = $svg->group(
+        id => "chr_layout",
+    );
+
+    # chr1 is taken as the longest chromosome and fills half the image height.
+
+    $max       = $karyo_list->{ "chr1" }->[ -1 ]->[ 2 ];
+    $factor    = ( HEIGHT / 2 ) / $max;
+    $chr_width = ( HEIGHT / 4 ) / 13;
+
+    foreach $chr ( keys %{ $karyo_list } )
+    {
+        foreach $elem ( @{ $karyo_list->{ $chr } } )
+        {
+            $elem->[ 1 ] *= $factor;
+            $elem->[ 2 ] *= $factor;
+        }
+    }
+
+    foreach $chr ( keys %{ $feat_list } )
+    {
+        foreach $elem ( @{ $feat_list->{ $chr } } )
+        {
+            $elem->[ 0 ] *= $factor;
+            $elem->[ 1 ] *= $factor;
+        }
+    }
+
+    # Chromosomes are selected by name, so that a missing chrX or chrY or
+    # additional sequences in the karyo data cannot shift the list.
+
+    @list = sort { substr( $a, 3 ) <=> substr( $b, 3 ) } grep { /^chr\d+$/ } keys %{ $karyo_list };
+
+    push @list, grep { exists $karyo_list->{ $_ } } qw( chrX chrY );
+
+    for ( $i = 0; $i < @list; $i++ )
+    {
+        $chr      = $list[ $i ];
+        $chr_len  = $karyo_list->{ $chr }->[ -1 ]->[ 2 ];
+        $chr_cent = find_cent( $karyo_list->{ $chr } );
+
+        $y = HEIGHT / 2 - $chr_len;
+        $x = ( WIDTH / ( @list + 2 ) ) * ( $i + 1 );
+
+        draw_chr( $layout_obj, $x, $y, $chr_len, $chr_width, $chr_cent, $chr, $karyo_list, $feat_list );
+    }
+
+    return;
+}
+
+
+sub find_cent
+{
+    # Martin A. Hansen, December 2003.
+
+    # Finds the centromeric region in the karyo data.
+
+    my ( $list,   # list of [ name, beg, end, color, stain ] for one chromosome
+       ) = @_;
+
+    # Returns the position between the two centromeric bands, the middle
+    # of the band if there is only one, or 0 if there is none.
+
+    my ( @acen, @nums );
+
+    # Bands are identified by stain type, since "DarkGrey" is also the
+    # color of gpos75, gpos66 and stalk bands.
+
+    @acen = grep { defined $_->[ 4 ] and $_->[ 4 ] eq "acen" } @{ $list };
+
+    # Tuples without stain type: fall back to matching on the color.
+
+    if ( not @acen ) {
+        @acen = grep { not defined $_->[ 4 ] and defined $_->[ 3 ] and $_->[ 3 ] eq "DarkGrey" } @{ $list };
+    }
+
+    return 0 if not @acen;
+
+    @acen = @acen[ 0 .. 1 ] if @acen > 2;
+
+    @nums = sort { $a <=> $b } map { ( $_->[ 1 ], $_->[ 2 ] ) } @acen;
+
+    return ( $nums[ 1 ] + $nums[ 2 ] ) / 2 if @nums == 4;
+
+    return ( $nums[ 0 ] + $nums[ 1 ] ) / 2;
+}
+
+
+sub draw_chr
+{
+    # Martin A. Hansen, December 2003.
+
+    # draws a whole chromosome as two rounded boxes meeting at the
+    # centromeric region, with features, karyo bands and chromosome number.
+
+    my ( $svg,          # image object
+         $x,            # x position
+         $y,            # y position
+         $chr_len,      # length of chromosome
+         $chr_width,    # width of chromosome
+         $chr_cent,     # position of centromeric region
+         $chr,          # chromosome
+         $karyo_list,   # hashref with karyo data
+         $feat_list,    # hashref with features
+       ) = @_;
+
+    # Returns nothing - the chromosome is added to the image object.
+
+    my ( $chr_obj, $clip_obj, $gr_obj );
+
+    $chr_obj = $svg->group(
+        id => $chr,
+    );
+
+    if ( exists $feat_list->{ $chr } ) {
+        draw_chr_feat( $chr_obj, $x, $y, $chr_width, $feat_list->{ $chr } );
+    }
+
+    # The karyo bands are clipped to the outline of the two chromosome arms.
+
+    $clip_obj = $chr_obj->clipPath(
+        id => $chr . "_clipPath",
+    );
+
+    $clip_obj->rectangle(
+        x      => sprintf( "%.3f", $x ),
+        y      => sprintf( "%.3f", $y ),
+        width  => sprintf( "%.3f", $chr_width ),
+        height => sprintf( "%.3f", $chr_cent ),
+        rx     => 10,
+        ry     => 10,
+    );
+
+    $clip_obj->rectangle(
+        x      => sprintf( "%.3f", $x ),
+        y      => sprintf( "%.3f", $y + $chr_cent ),
+        width  => sprintf( "%.3f", $chr_width ),
+        height => sprintf( "%.3f", $chr_len - $chr_cent ),
+        rx     => 10,
+        ry     => 10,
+    );
+
+    $gr_obj = $chr_obj->group(
+        "clip-path" => "url(#$chr" . "_clipPath)",
+    );
+
+    if ( exists $karyo_list->{ $chr } ) {
+        draw_karyo_data( $gr_obj, $x, $y, $chr_width, $karyo_list->{ $chr } );
+    }
+
+    $gr_obj->rectangle(
+        x      => sprintf( "%.3f", $x ),
+        y      => sprintf( "%.3f", $y ),
+        width  => sprintf( "%.3f", $chr_width ),
+        height => sprintf( "%.3f", $chr_cent ),
+        fill   => 'none',
+        rx     => 10,
+        ry     => 10,
+    );
+
+    $gr_obj->rectangle(
+        x      => sprintf( "%.3f", $x ),
+        y      => sprintf( "%.3f", $y + $chr_cent ),
+        width  => sprintf( "%.3f", $chr_width ),
+        height => sprintf( "%.3f", $chr_len - $chr_cent ),
+        fill   => 'none',
+        rx     => 10,
+        ry     => 10,
+    );
+
+    draw_chr_num( $chr_obj, $x, $y, $chr_len, $chr_width, $chr );
+}
+
+
+sub draw_chr_num
+{
+    # Martin A. Hansen, December 2003.
+
+    # draws a chromosome number centered below the chromosome
+
+    my ( $svg,         # image object
+         $x,           # x position
+         $y,           # y position
+         $chr_len,     # length of chromosome
+         $chr_width,   # width of chromosome
+         $chr,         # chromosome number
+       ) = @_;
+
+    # Returns the result of the cdata call on the text element.
+
+    my ( $chr_num, $word_width );
+
+    $chr_num = $chr;
+    $chr_num =~ s/chr//;
+
+    # Half the width of the label at 8 units per character.
+
+    $word_width = ( length( $chr_num ) * 8 ) / 2;
+
+    $svg->text(
+        x => sprintf("%.3f", $x + ( $chr_width / 2 ) - $word_width ),
+        y => sprintf("%.3f", $y + $chr_len + 15 ),
+    )->cdata( $chr_num );
+}
+
+
+sub draw_karyo_data
+{
+    # Martin A. Hansen, February 2004.
+
+    # Plots the karyo bands of one chromosome as filled boxes
+
+    my ( $svg,         # image object
+         $x,           # x position
+         $y,           # y position
+         $chr_width,   # width of chromosome
+         $list,        # list of [ label, beg, end, color, ... ] tuples
+       ) = @_;
+
+    # Returns nothing - the bands are added to the image object.
+
+    my ( $feat_beg, $feat_end, $feat_height, $i, $color, $label );
+
+    for ( $i = 0; $i < @{ $list }; $i++ )
+    {
+        ( $label, $feat_beg, $feat_end, $color ) = @{ $list->[ $i ] };
+
+        $feat_height = $feat_end - $feat_beg;
+
+        $svg->rectangle(
+            x              => sprintf("%.3f", $x ),
+            y              => sprintf("%.3f", $y + $feat_beg ),
+            width          => sprintf("%.3f", $chr_width ),
+            height         => sprintf("%.3f", $feat_height ),
+            'stroke-width' => 0,
+            fill           => $color,
+        );
+    }
+}
+
+
+sub draw_chr_feat
+{
+    # Martin A. Hansen, February 2004.
+
+    # Plots chromosome features as boxes sticking out to the right of
+    # the chromosome. Features are at least 1 unit high, and features
+    # of that minimum height are only drawn once per position.
+
+    my ( $svg,         # image object
+         $x,           # x position
+         $y,           # y position
+         $chr_width,   # width of chromosome
+         $list,        # list of [ beg, end, color ] tuples
+       ) = @_;
+
+    # Returns nothing - the features are added to the image object.
+
+    my ( $feat_beg, $feat_end, $feat_height, $i, $color, $height, $width, $x1, $y1, %lookup );
+
+    for ( $i = 0; $i < @{ $list }; $i++ )
+    {
+        ( $feat_beg, $feat_end, $color ) = @{ $list->[ $i ] };
+
+        $feat_height = $feat_end - $feat_beg;
+
+        $x1     = sprintf("%.0f", $x + ( $chr_width / 2 ) );
+        $y1     = sprintf("%.0f", $y + $feat_beg );
+        $width  = sprintf("%.0f", ( $chr_width / 2 ) + 5 );
+        $height = sprintf("%.0f", $feat_height );
+
+        if ( $height < 1 )
+        {
+            $height = 1;
+
+            # The separator keeps e.g. ( 1, 23 ) and ( 12, 3 ) apart.
+
+            if ( exists $lookup{ "$x1,$y1" } ) {
+                next;
+            } else {
+                $lookup{ "$x1,$y1" } = 1;
+            }
+        }
+
+        $svg->rectangle(
+            x      => $x1,
+            y      => $y1,
+            width  => $width,
+            height => $height,
+            stroke => $color,
+            fill   => $color,
+        );
+    }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SEQUENCE LOGO <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub seq_logo
+{
+    # Martin A. Hansen, August 2007.
+
+    # Calculates and renders a sequence logo in SVG format.
+    # The sequence type is guessed from the first entry; the maximum
+    # bit height is 4 for protein and 2 otherwise.
+
+    my ( $entries,   # aligned sequence entries - list of tuples
+       ) = @_;
+
+    # Returns string.
+
+    my ( $type, $bit_max, $logo_data, $svg );
+
+    $type = Maasha::Seq::seq_guess_type( $entries->[ 0 ]->[ 1 ] );
+
+    if ( $type =~ /^p/i ) {
+        $bit_max = 4;
+    } else {
+        $bit_max = 2;
+    }
+
+    $logo_data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
+
+    $svg = svg_init();
+
+    svg_draw_logo( $svg, $logo_data, $bit_max, $type );
+    svg_draw_logo_scale( $svg, $bit_max );
+
+    return $svg->xmlify;
+}
+
+
+sub svg_init
+{
+    # Martin A. Hansen, October 2005.
+
+    # initializes SVG object, which is returned.
+
+    my $svg;
+
+    $svg = SVG->new(
+        style => {
+            'font-weight' => 'normal',
+            'font-family' => 'Courier New',
+            'font-size'   => 10,
+        },
+    );
+
+    return $svg;
+}
+
+
+sub svg_draw_logo
+{
+    # Martin A. Hansen, January 2007.
+
+    # Renders a sequence logo in SVG using a
+    # given data structure with logo details.
+
+    my ( $svg,         # SVG object,
+         $logo_data,   # list of positions, each a list of [ char, height in bits ]
+         $bit_max,     # maximum bit height
+         $type,        # sequence type
+         $nocolor,     # render black and white - OPTIONAL
+       ) = @_;
+
+    # Returns nothing - the logo is added to the SVG object.
+
+    my ( $pos, $elem, $char, $char_height_bit, $char_height_px, $block, $x, $y, $scale_factor, $color );
+
+    $x = 0;
+
+    foreach $pos ( @{ $logo_data } )
+    {
+        # Each column is stacked upwards from the baseline at y = 30.
+
+        $y = 30;
+
+        foreach $elem ( @{ $pos } )
+        {
+            ( $char, $char_height_bit ) = @{ $elem };
+
+            $char_height_px = $char_height_bit * ( 30 / $bit_max );
+
+            $block = $svg->group(
+                transform => "translate($x,$y)",
+            );
+
+            $scale_factor = $char_height_px / 7;
+
+            if ( $nocolor ) {
+                $color = "black";
+            } elsif ( $type eq "dna" or $type eq "rna" ) {
+                $color = Maasha::Seq::color_nuc( $char );
+            } else {
+                $color = Maasha::Seq::color_pep( $char );
+            }
+
+            $block->text(
+                transform => "scale(1,$scale_factor)",
+                x         => 0,
+                y         => 0,
+                style     => {
+                    'font-weight' => 'bold',
+                    fill          => Maasha::Seq::color_palette( $color ),
+                }
+            )->cdata( $char );
+
+            $y -= $char_height_px;
+        }
+
+        $x += 7;
+    }
+}
+
+
+sub svg_draw_logo_scale
+{
+    # Martin A. Hansen, January 2007.
+
+    # draws the bit scale for the sequence logo
+
+    my ( $svg,       # SVG object,
+         $bit_max,   # maximum bit height
+       ) = @_;
+
+    # Returns nothing - the scale is added to the SVG object.
+
+    my ( $scale, $i );
+
+    $scale = $svg->group(
+        transform => "translate(-10)",
+        style     => {
+            stroke      => 'black',
+            'font-size' => '8px',
+        }
+    );
+
+    $svg->text(
+        transform => "rotate(-90)",
+        x         => -26,
+        y         => -30,
+        style     => {
+            stroke => 'none',
+        }
+    )->cdata( "bits" );
+
+    $scale->line(
+        x1 => 0,
+        x2 => 0,
+        y1 => 0,
+        y2 => 30,
+    );
+
+    # One tic and one label per bit, with the highest value at the top.
+
+    for ( $i = 0; $i <= $bit_max; $i++ )
+    {
+        $scale->line(
+            x1 => -5,
+            x2 => 0,
+            y1 => ( 30 / $bit_max ) * $i,
+            y2 => ( 30 / $bit_max ) * $i,
+        );
+
+        $scale->text(
+            x     => -13,
+            y     => ( 30 / $bit_max ) * $i + 2,
+            style => {
+                stroke => 'none',
+            }
+        )->cdata( $bit_max - $i );
+    }
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
diff --git a/code_perl/Maasha/README b/code_perl/Maasha/README
new file mode 100644
index 0000000..d65aabb
--- /dev/null
+++ b/code_perl/Maasha/README
@@ -0,0 +1,22 @@
+These are modules written by me (Martin A. Hansen aka Maasha).
+
+You are welcome to modify the code here, but do leave a note in
+the subroutines you change like this:
+
+    sub some_subroutine
+    {
+        # Martin A. Hansen, Jan 2008.
+
+        # Changed by , Juli 2008. (fixed minor bug)
+
+        ...
+    }
+
+You may also add new subroutines, but you should strongly consider
+adding your own Perl modules subdirectory. For more information, see:
+
+    ../biopieces/code_perl/README
+
+
+
+Martin A. Hansen, July 2008
diff --git a/code_perl/Maasha/SQL.pm b/code_perl/Maasha/SQL.pm
new file mode 100644
index 0000000..c7feeaa
--- /dev/null
+++ b/code_perl/Maasha/SQL.pm
@@ -0,0 +1,511 @@
+package Maasha::SQL;
+
+# Copyright (C) 2006 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines for manipulation of MySQL via the DBI module.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use warnings;
+
+use DBI;
+use Data::Dumper;
+
+use Maasha::Common;
+
+use vars qw( @ISA @EXPORT );
+use Exporter;
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub _prepare_and_execute
+{
+    # Prepares and executes a SQL statement. If either step fails the
+    # database handle is disconnected and the routine dies.
+
+    my ( $dbh,   # database handle
+         $sql,   # SQL string
+       ) = @_;
+
+    # Returns the executed statement handle.
+
+    my ( $sth, $errstr );
+
+    if ( not $sth = $dbh->prepare( $sql ) )
+    {
+        # Saved first, since disconnecting may change $DBI::errstr.
+
+        $errstr = $DBI::errstr;
+
+        disconnect( $dbh );
+        die qq(ERROR: $errstr, "SQL PREPARE ERROR" );
+    }
+
+    if ( not $sth->execute )
+    {
+        $errstr = $DBI::errstr;
+
+        disconnect( $dbh );
+        die qq(ERROR: $errstr, "SQL EXECUTE ERROR" );
+    }
+
+    return $sth;
+}
+
+
+sub create_database
+{
+    # Creates a database by running mysqladmin. Dies on failure.
+
+    my ( $database,   # MySQL database
+         $user,       # MySQL username
+         $password,   # MySQL password
+       ) = @_;
+
+    # Returns nothing.
+
+    # List form: the arguments are passed to mysqladmin as is, without
+    # a shell interpreting spaces or metacharacters in them.
+
+    system( "mysqladmin", "create", $database, "--user=$user", "--password=$password" ) == 0 or
+        die qq(ERROR: Could not create database "$database"!\n);
+
+    return;
+}
+
+
+sub database_exists
+{
+    # Martin A. Hansen, May 2008.
+
+    # Checks if a given database exists. Returns 1 if so,
+    # otherwise 0. The name is compared case insensitively.
+
+    my ( $database,   # MySQL database
+         $user,       # MySQL username
+         $pass,       # MySQL password
+       ) = @_;
+
+    # Return boolean.
+
+    my ( @databases );
+
+    @databases = list_databases( $user, $pass );
+
+    # \Q..\E: the name is matched literally and not as a regex.
+
+    if ( grep /^\Q$database\E$/i, @databases ) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+
+sub list_databases
+{
+    # Martin A. Hansen, May 2008.
+
+    # Returns a list of databases available, parsed from
+    # the table printed by mysqlshow.
+
+    my ( $user,   # MySQL username
+         $pass,   # MySQL password
+       ) = @_;
+
+    # Returns a list in list context, otherwise a list reference.
+
+    my ( @databases );
+
+    @databases = Maasha::Common::run_and_return( "mysqlshow", "--user=$user --password=$pass" );
+
+    # Drop the three header lines and the closing border of the table.
+
+    splice @databases, 0, 3;
+
+    pop @databases;
+
+    # Strip the table borders: "| name |" -> "name"
+
+    s/^\|\s+([^\s]+)\s+\|$/$1/ foreach @databases;
+
+    return wantarray ? @databases : \@databases;
+}
+
+
+sub request
+{
+    # Executes a SQL statement that returns no data.
+    # Disconnects and dies if the statement fails.
+
+    my ( $dbh,   # database handle
+         $sql,   # SQL string
+       ) = @_;
+
+    # Returns nothing.
+
+    _prepare_and_execute( $dbh, $sql );
+
+    return;
+}
+
+
+sub query_hash
+{
+    # Niels Larsen, April 2003.
+
+    # Executes a given sql query and returns the result as a hash
+    # or hash reference. The keys are set to the values of the given
+    # key.
+
+    my ( $dbh,   # Database handle
+         $sql,   # SQL string
+         $key,   # Key string, like "id", "name", ..
+       ) = @_;
+
+    # Returns a hash in list context, otherwise a hash reference.
+
+    my ( $sth, $hash, $errstr );
+
+    $sth = _prepare_and_execute( $dbh, $sql );
+
+    if ( $hash = $sth->fetchall_hashref( $key ) )
+    {
+        return wantarray ? %{ $hash } : $hash;
+    }
+
+    $errstr = $DBI::errstr;
+
+    disconnect( $dbh );
+    die qq(ERROR: $errstr, "DATABASE RETRIEVE ERROR" );
+}
+
+
+sub query_array
+{
+    # Niels Larsen, April 2003.
+
+    # Executes a given sql query and returns the result as a table
+    # or table reference.
+
+    my ( $dbh,   # Database handle
+         $sql,   # SQL string
+         $out,   # Output specification, see DBI documentation.
+       ) = @_;
+
+    # Returns a list of rows in list context, otherwise a list
+    # reference - use scalar @{ ... } to count the rows.
+
+    my ( $sth, $table, $errstr );
+
+    $sth = _prepare_and_execute( $dbh, $sql );
+
+    if ( $table = $sth->fetchall_arrayref( $out ) )
+    {
+        return wantarray ? @{ $table } : $table;
+    }
+
+    $errstr = $DBI::errstr;
+
+    disconnect( $dbh );
+    die qq(ERROR: $errstr, "DATABASE RETRIEVE ERROR" );
+}
+
+
+sub query_hashref_list
+{
+    # Martin A. Hansen, May 2008.
+
+    # Executes a SQL query and return the result
+    # as a list of hashrefs.
+
+    my ( $dbh,   # database handle
+         $sql,   # sql query
+       ) = @_;
+
+    # Returns a list in list context, otherwise a list reference.
+
+    my $table = $dbh->selectall_arrayref( $sql, { Slice => {} } );
+
+    return wantarray ? @{ $table } : $table;
+}
+
+
+sub delete_table
+{
+    # Drops a table from the database.
+
+    my ( $dbh,     # database handle
+         $table,   # table name
+       ) = @_;
+
+    # Returns nothing.
+
+    request( $dbh, "drop table $table" );
+}
+
+
+sub list_tables
+{
+    # Lists the tables of the connected database.
+
+    my ( $dbh,   # database handle
+       ) = @_;
+
+    # Returns a list in list context, otherwise a list reference.
+
+    my ( @list );
+
+    # Each row of "show tables" holds the table name in the first column.
+
+    @list = map { $_->[ 0 ] } query_array( $dbh, "show tables" );
+
+    return wantarray ? @list : \@list;
+}
+
+
+sub table_exists
+{
+    # Checks if a table with the given name exists.
+
+    my ( $dbh,    # database handle
+         $name,   # table name
+       ) = @_;
+
+    # Returns 1 if the table exists, otherwise nothing.
+
+    # \Q..\E: the name is matched literally and not as a regex.
+
+    if ( grep /^\Q$name\E$/, list_tables( $dbh ) ) {
+        return 1;
+    } else {
+        return;
+    }
+}
+
+
+sub connect
+{
+    # Martin A. Hansen, May 2008.
+
+    # Given a database, user and password,
+    # obtains a database handle if the database exists.
+
+    my ( $database,   # MySQL database
+         $user,       # MySQL user
+         $pass,       # MySQL password
+       ) = @_;
+
+    # Returns object.
+
+    my ( $dbh );
+
+    Maasha::Common::error( qq(Database "$database" does not exist) ) if not database_exists( $database, $user, $pass );
+
+    # Errors are neither raised nor printed by DBI, but are
+    # checked by the routines of this module.
+
+    $dbh = DBI->connect(
+        "dbi:mysql:$database",
+        $user,
+        $pass,
+        {
+            RaiseError         => 0,
+            PrintError         => 0,
+            AutoCommit         => 0,
+            ShowErrorStatement => 1,
+        }
+    );
+
+    if ( $dbh ) {
+        return $dbh;
+    } else {
+        Maasha::Common::error( qq($DBI::errstr) );
+    }
+}
+
+
+sub disconnect
+{
+    # Disconnects a database handle. Dies if this fails.
+
+    my ( $dbh,   # database handle
+       ) = @_;
+
+    # Returns nothing.
+
+    if ( not $dbh->disconnect )
+    {
+        die qq(ERROR: $DBI::errstr );
+    }
+}
+
+
+sub update_field
+{
+    # Martin A. Hansen, April 2003.
+
+    # updates the content of a single table cell. The cell is located by
+    # its old content: nothing is updated (only a warning is given) if
+    # more than one row matches, and the routine disconnects and dies if
+    # no row matches.
+
+    my ( $dbh,       # database handle
+         $table,     # table name
+         $column,    # column where updating
+         $old_val,   # the old cell content
+         $new_val,   # the new cell content
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $sql, $count, $count_sql, $old_quoted, $new_quoted );
+
+    # Values are quoted by the driver, as in add_row.
+
+    $old_quoted = $dbh->quote( $old_val );
+    $new_quoted = $dbh->quote( $new_val );
+
+    $count_sql = qq( SELECT $column FROM $table WHERE $column=$old_quoted; );
+
+    # query_array returns a list reference in scalar context,
+    # so the rows must be counted through the reference.
+
+    $count = scalar @{ query_array( $dbh, $count_sql ) };
+
+    if ( $count > 1 )
+    {
+        warn qq(WARNING: More than one entry found "$count_sql"\n);
+    }
+    elsif ( $count == 0 )
+    {
+        disconnect( $dbh );
+        die qq(ERROR: entry not found "$count_sql"\n);
+    }
+    else
+    {
+        $sql = qq( UPDATE $table SET $column=$new_quoted WHERE $column=$old_quoted; );
+        request( $dbh, $sql );
+    }
+
+    return;
+}
+
+
+sub delete_row
+{
+    # Martin A. Hansen, April 2003.
+
+    # deletes the records from a table where
+    # the given field equals the given pattern
+
+    my ( $dbh,       # database handle
+         $table,     # table name
+         $field,     # field e.g. rec no
+         $pattern,   # specific pattern
+       ) = @_;
+
+    # Returns nothing.
+
+    my $sql;
+
+    $sql = qq(DELETE FROM $table WHERE $field = ) . $dbh->quote( $pattern ) . ";";
+
+    request( $dbh, $sql );
+
+    return;
+}
+
+
+sub add_row
+{
+    # Martin A. Hansen, April 2003.
+
+    # adds a record to a table. Undefined and empty fields
+    # and the string "NULL" are inserted as NULL.
+
+    my ( $dbh,      # database handle
+         $table,    # table name
+         $fields,   # row to be inserted
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $sql, $field, @fields );
+
+    foreach $field ( @{ $fields } )
+    {
+        if ( not defined $field or $field eq "NULL" or $field eq '' ) {
+            push @fields, "NULL";
+        } else {
+            push @fields, $dbh->quote( $field );
+        }
+    }
+
+    $sql = "INSERT INTO $table VALUES ( " . join( ", ", @fields ) . " );";
+
+    request( $dbh, $sql );
+
+    return;
+}
+
+
+sub add_column
+{
+    # Martin A. Hansen, April 2003.
+
+    # inserts a column in a table
+
+    my ( $dbh,      # database handle
+         $table,    # table name
+         $column,   # name of column
+         $type,     # variable type
+         $index,    # enable index
+       ) = @_;
+
+    # Returns nothing.
+
+    my $sql;
+
+    if ( $index ) {
+        $sql = "ALTER TABLE $table ADD COLUMN ( $column $type, INDEX $column" . "_index ( $column ) );";
+    } else {
+        $sql = "ALTER TABLE $table ADD COLUMN ( $column $type );";
+    }
+
+    request( $dbh, $sql );
+
+    return;
+}
+
+
+sub del_column
+{
+    # Martin A. Hansen, April 2003.
+
+    # deletes a column from a table
+
+    my ( $dbh,      # database handle
+         $table,    # table name
+         $column,   # column to be deleted
+       ) = @_;
+
+    # Returns nothing.
+
+    my $sql;
+
+    $sql = "ALTER TABLE $table DROP COLUMN $column;";
+
+    request( $dbh, $sql );
+
+    return;
+}
+
+
+sub load_sql_file
+{
+    # Martin A. Hansen, January 2004.
+
+    # loads a delimiter separated file into a sql table
+
+    my ( $dbh,         # database handle object
+         $path,        # filename with path
+         $table,       # table to load data into
+         $delimiter,   # column delimiter - OPTIONAL, defaults to tab
+       ) = @_;
+
+    # Returns nothing.
+
+    my $sql;
+
+    $delimiter ||= "\t";
+
+    $sql = qq( LOAD DATA LOCAL INFILE "$path" INTO TABLE $table FIELDS TERMINATED BY '$delimiter' );
+
+    # request is a routine of this module - there is no package SQL.
+
+    request( $dbh, $sql );
+
+    return;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
diff --git a/code_perl/Maasha/Seq.pm b/code_perl/Maasha/Seq.pm
new file mode 100644
index 0000000..1d5d5cb
--- /dev/null
+++ b/code_perl/Maasha/Seq.pm
@@ -0,0 +1,1390 @@
+package Maasha::Seq;
+
+# Copyright (C) 2007 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
sub seq_guess_type
{
    # Martin A. Hansen, May 2007.

    # Makes a qualified guess on the type of a given sequence by looking
    # at no more than its first 100 residues: any of the letters FLPQIE
    # (either case) means protein, otherwise any U/u means RNA, and
    # everything else is reported as DNA.

    my ( $seq,   # sequence to check
       ) = @_;

    # Returns string: "protein", "rna" or "dna".

    my ( $prefix );

    # substr clips at the end of the string, so short sequences are used whole.

    $prefix = substr $seq, 0, 100;

    # tr with an empty replacement list only counts the listed characters.

    return "protein" if $prefix =~ tr/FLPQIEflpqie//;
    return "rna"     if $prefix =~ tr/Uu//;

    return "dna";
}
sub nuc2ambiguity
{
    # Martin A. Hansen, March 2005.

    # Given a string of nucleotides (ambiguity codes allowed, any case),
    # returns the single ambiguity code that covers all of them.
    # Warns and returns undef when the residues have no code in the table.

    my ( $str,    # string of nucleotides
         $type,   # DNA or RNA - DEFAULT DNA
       ) = @_;

    # Returns string.

    my ( $t, %expand, %seen, $key, %code_of, $code );

    # Thymine in DNA mode, uracil in any other mode.

    $t = ( not $type or $type =~ /dna/i ) ? "T" : "U";

    %expand = (
        'N' => "ACG$t",
        'B' => "CG$t",
        'D' => "AG$t",
        'H' => "AC$t",
        'V' => "ACG",
        'K' => "G$t",
        'Y' => "C$t",
        'S' => "CG",
        'W' => "A$t",
        'R' => "AG",
        'M' => "AC",
    );

    # Spell every ambiguity code out as the plain residues it stands for.

    $str = uc $str;
    $str =~ s/([NBDHVKYSWRM])/$expand{ $1 }/g;

    # The lookup key is the sorted set of distinct residues.

    $seen{ $_ } = 1 foreach split //, $str;

    $key = join "", sort keys %seen;

    %code_of = (
        'A'    => 'A',
        'C'    => 'C',
        'G'    => 'G',
        'T'    => 'T',
        'U'    => 'U',
        'AC'   => 'M',
        'AG'   => 'R',
        'AT'   => 'W',
        'AU'   => 'W',
        'CG'   => 'S',
        'CT'   => 'Y',
        'CU'   => 'Y',
        'GT'   => 'K',
        'GU'   => 'K',
        'ACG'  => 'V',
        'ACT'  => 'H',
        'ACU'  => 'H',
        'AGT'  => 'D',
        'AGU'  => 'D',
        'CGT'  => 'B',
        'CGU'  => 'B',
        'ACGT' => 'N',
        'ACGU' => 'N',
    );

    $code = $code_of{ $key };

    warn qq(WARNING: No ambiguity code for key->$key\n) if not $code;

    return $code;
}
sub codon2aa
{
    # Martin A. Hansen, March 2005.

    # Given a codon, returns the corresponding amino acid as a one
    # letter code ('*' for stop) according to the table below.
    # The codon must be exactly three of the letters A, T, C and G
    # (either case); anything else is a fatal error.

    my ( $codon,   # codon to translate
       ) = @_;

    # Returns string.

    my ( %hash, $aa );

    # Anchored at both ends: without the anchors a longer string that merely
    # contains three valid letters passed the check and came back as undef.

    die qq(ERROR: Bad codon: "$codon"\n) if not $codon =~ /\A[ATCGatcg]{3}\z/;

    %hash = (
        'TTT' => 'F', 'TTC' => 'F', 'TTA' => 'L', 'TTG' => 'L',   # Phe, Leu
        'TCT' => 'S', 'TCC' => 'S', 'TCA' => 'S', 'TCG' => 'S',   # Ser
        'TAT' => 'Y', 'TAC' => 'Y', 'TAA' => '*', 'TAG' => '*',   # Tyr, Stop
        'TGT' => 'C', 'TGC' => 'C', 'TGA' => '*', 'TGG' => 'W',   # Cys, Stop, Trp
        'CTT' => 'L', 'CTC' => 'L', 'CTA' => 'L', 'CTG' => 'L',   # Leu
        'CCT' => 'P', 'CCC' => 'P', 'CCA' => 'P', 'CCG' => 'P',   # Pro
        'CAT' => 'H', 'CAC' => 'H', 'CAA' => 'Q', 'CAG' => 'Q',   # His, Gln
        'CGT' => 'R', 'CGC' => 'R', 'CGA' => 'R', 'CGG' => 'R',   # Arg
        'ATT' => 'I', 'ATC' => 'I', 'ATA' => 'I', 'ATG' => 'M',   # Ile, Met
        'ACT' => 'T', 'ACC' => 'T', 'ACA' => 'T', 'ACG' => 'T',   # Thr
        'AAT' => 'N', 'AAC' => 'N', 'AAA' => 'K', 'AAG' => 'K',   # Asn, Lys
        'AGT' => 'S', 'AGC' => 'S', 'AGA' => 'R', 'AGG' => 'R',   # Ser, Arg
        'GTT' => 'V', 'GTC' => 'V', 'GTA' => 'V', 'GTG' => 'V',   # Val
        'GCT' => 'A', 'GCC' => 'A', 'GCA' => 'A', 'GCG' => 'A',   # Ala
        'GAT' => 'D', 'GAC' => 'D', 'GAA' => 'E', 'GAG' => 'E',   # Asp, Glu
        'GGT' => 'G', 'GGC' => 'G', 'GGA' => 'G', 'GGG' => 'G',   # Gly
    );

    $aa = $hash{ uc $codon };

    return $aa;
}
sub fold_struct_rnafold
{
    # Martin A. Hansen, February 2008.

    # Given a sequence fold this using RNAfold.
    # The sequence is written as one entry named "RNAfold" to the input of
    # an "RNAfold -noPS" child process and the child's output is parsed.

    my ( $seq,   # sequence to fold
       ) = @_;

    # Returns a tuple of fold string and free energy.
    # In scalar context the tuple is returned as an array reference.
    # Both values are undefined if the last output line is not recognised.

    my ( $pid, $fh_out, $fh_in, @lines, $struct, $energy );

    # $fh_out is read from below (child output), $fh_in is written to (child input).

    $pid = open2( $fh_out, $fh_in, "RNAfold -noPS" );

    &Maasha::Fasta::put_entry( [ "RNAfold", $seq ], $fh_in );

    # The input handle is closed before any output is read - presumably so
    # RNAfold sees end of input and finishes.

    close $fh_in;

    @lines = <$fh_out>;

    close $fh_out;

    # Collect the child process before looking at what it wrote.

    waitpid $pid, 0;

    chomp @lines;

    # Last output line: a run of non-space characters, one space, then a
    # value in parentheses. The parentheses are stripped from the value.

    if ( $lines[ - 1 ] =~ /^([^ ]+) \((.+)\)$/ )
    {
        $struct = $1;
        $energy = $2;
    }

    return wantarray ? ( $struct, $energy ) : [ $struct, $energy ];
}
sub base_pair_melting_temp
{
    # Martin A. Hansen, February 2008.

    # Given a basepair (two residues out of A, T, C, G and U in either
    # case), returns the melting temperature score for that pair.
    # Pairs that score nothing return 0; anything that is not such a
    # two residue string returns undef.

    my ( $bp,   # basepair string
       ) = @_;

    # Returns integer

    my ( %score_of, $pair, $temp );

    # Only the pairs with a contribution are listed - all others are 0.

    %score_of = (
        AT => 2, TA => 2,
        AU => 2, UA => 2,
        CG => 4, GC => 4,
        GT => 1, TG => 1,
        GU => 1, UG => 1,
    );

    $pair = uc $bp;

    if ( $pair =~ /\A[ATCGU]{2}\z/ ) {
        $temp = exists $score_of{ $pair } ? $score_of{ $pair } : 0;
    }

    return $temp;
}
sub oligo_freq
{
    # Martin A. Hansen, August 2007.

    # Given a hashref with oligo=>count, calculates a frequency table.
    # Each row is a hash with the keys OLIGO, COUNT and FREQ, where FREQ
    # is COUNT over the sum of all counts formatted with four decimals.
    # Rows are sorted by falling count and then alphabetically by oligo.

    my ( $oligo_freq,   # hashref
       ) = @_;

    # Returns a list of hashes, or a reference to that list in scalar context.

    my ( @freq_table, $total, $oligo, $row );

    $total = 0;

    foreach $oligo ( keys %{ $oligo_freq } )
    {
        push @freq_table, { OLIGO => $oligo, COUNT => $oligo_freq->{ $oligo } };

        $total += $oligo_freq->{ $oligo };
    }

    @freq_table = sort { $b->{ "COUNT" } <=> $a->{ "COUNT" } or $a->{ "OLIGO" } cmp $b->{ "OLIGO" } } @freq_table;

    foreach $row ( @freq_table )
    {
        # A table where every count is 0 has no meaningful frequencies;
        # report 0 instead of dividing by zero.

        $row->{ "FREQ" } = sprintf( "%.4f", $total ? $row->{ "COUNT" } / $total : 0 );
    }

    return wantarray ? @freq_table : \@freq_table;
}
+ + my ( $seq, # sequence to analyze + ) = @_; + + # Returns hash + + my ( %analysis, @chars, @chars_lc, $char, %char_hash, $gc, $at, $lc, $max, $res_sum, @indels, %indel_hash ); + + $analysis{ "SEQ_TYPE" } = uc &Maasha::Seq::seq_guess_type( $seq ); + $analysis{ "SEQ_LEN" } = length $seq; + + @indels = qw( - ~ . _ ); + + if ( $analysis{ "SEQ_TYPE" } eq "DNA" ) + { + @chars = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn"; + @chars_lc = split //, "agcutrywsmkhdvbn"; + } + elsif ( $analysis{ "SEQ_TYPE" } eq "RNA" ) + { + @chars = split //, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn"; + @chars_lc = split //, "agcutrywsmkhdvbn"; + } + else + { + @chars = split //, "FLSYCWPHQRIMTNKVADEGflsycwphqrimtnkvadeg"; + @chars_lc = split //, "flsycwphqrimtnkvadeg"; + } + + @char_hash{ @chars } = map { eval "scalar \$seq =~ tr/$_//" } @chars; + @indel_hash{ @indels } = map { eval "scalar \$seq =~ tr/$_//" } @indels; + + if ( $analysis{ "SEQ_TYPE" } =~ /DNA|RNA/ ) + { + $gc = $char_hash{ "g" } + $char_hash{ "G" } + $char_hash{ "c" } + $char_hash{ "C" }; + $at = $char_hash{ "a" } + $char_hash{ "A" } + $char_hash{ "t" } + $char_hash{ "T" } + $char_hash{ "u" } + $char_hash{ "U" }; + + $analysis{ "GC%" } = sprintf( "%.2f", 100 * $gc / $analysis{ "SEQ_LEN" } ); + + map { $lc += $char_hash{ lc $_ } } @chars_lc; + + $analysis{ "SOFT_MASK%" } = sprintf( "%.2f", 100 * $lc / $analysis{ "SEQ_LEN" } ); + $analysis{ "HARD_MASK%" } = sprintf( "%.2f", 100 * ( $char_hash{ "n" } + $char_hash{ "N" } ) / $analysis{ "SEQ_LEN" } ); + } + + $max = 0; + + foreach $char ( @chars_lc ) + { + $char = uc $char; + + $char_hash{ $char } += $char_hash{ lc $char }; + + $analysis{ "RES:$char" } = $char_hash{ $char }; + + $max = $char_hash{ $char } if $char_hash{ $char } > $max; + + $analysis{ "RES_SUM" } += $char_hash{ $char }; + } + + map { $analysis{ "RES:$_" } = $indel_hash{ $_ } } @indels; + + $analysis{ "MIX_INDEX" } = sprintf( "%.2f", $max / $analysis{ "SEQ_LEN" } ); + $analysis{ "MELT_TEMP" } = sprintf( "%.2f", 4 * 
sub seq_complexity
{
    # Martin A. Hansen, May 2008.

    # Given a sequence computes a complexity index
    # as the count of the most common di-residue over
    # the sequence length. Return ~1 if the entire
    # sequence is homopolymeric. Above 0.4 indicates
    # low complexity sequence. Case is ignored.
    # An empty sequence has no di-residues and gives 0.

    my ( $seq,   # sequence
       ) = @_;

    # Returns float.

    my ( $len, $i, $max, $di, %hash );

    $seq = uc $seq;
    $len = length $seq;
    $max = 0;

    # Nothing to count - and nothing to divide by.

    return 0 if $len == 0;

    # Count all overlapping di-residues.

    for ( $i = 0; $i < $len - 1; $i++ ) {
        $hash{ substr $seq, $i, 2 }++;
    }

    foreach $di ( keys %hash ) {
        $max = $hash{ $di } if $hash{ $di } > $max;
    }

    return $max / $len;
}
sub seqlogo_calc_char_heights
{
    # Martin A. Hansen, January 2007.

    # Calculates the height of each char in bits as the char's share
    # of the total multiplied by the given bit difference, and sorts
    # the chars by rising height.

    my ( $char_hash,   # hashref with chars and frequencies
         $tot,         # tot number of chars
         $bit_diff,    # information gained from uncertainties
       ) = @_;

    # Returns list of [ char, height ] tuples, or a reference to it in
    # scalar context.

    my ( @by_height );

    @by_height = sort { $a->[ 1 ] <=> $b->[ 1 ] }
                 map  { [ $_, ( $char_hash->{ $_ } / $tot ) * $bit_diff ] }
                 keys %{ $char_hash };

    return wantarray ? @by_height : \@by_height;
}
sub color_nuc
{
    # Martin A. Hansen, October 2005.

    # Color scheme for nucleotides as defined in Mview.
    # Given a char (either case) returns the name of its color;
    # chars without an entry in the table are black.

    my ( $char,   # char to decorate
       ) = @_;

    # Returns string.

    my ( %color_of, $residue );

    %color_of = (
        A => "bright-red",
        G => "yellow",
        C => "blue",
        T => "green",
        U => "green",
    );

    $residue = uc $char;

    return exists $color_of{ $residue } ? $color_of{ $residue } : "black";
}
+ + my ( $color, # common color name + ) = @_; + + # returns string + + my ( %hash ); + + %hash = ( + "black" => "#000000", + "white" => "#ffffff", + "red" => "#ff0000", + "green" => "#00ff00", + "blue" => "#0000ff", + "cyan" => "#00ffff", + "magenta" => "#ff00ff", +# "yellow" => "#ffff00", + "yellow" => "#ffc800", + "purple" => "#6600cc", + "dull-blue" => "#0099ff", + "dark-green-blue" => "#33cccc", + "medium-green-blue" => "#00ffcc", + "bright-blue" => "#0033ff", + "dark-green" => "#009900", + "bright-green" => "#33cc00", + "orange" => "#ff3333", + "orange-brown" => "#cc6600", + "bright-red" => "#cc0000", + "light-gray" => "#999999", + "dark-gray" => "#666666", + "gray0" => "#ffffff", + "gray1" => "#eeeeee", + "gray2" => "#dddddd", + "gray3" => "#cccccc", + "gray4" => "#bbbbbb", + "gray5" => "#aaaaaa", + "gray6" => "#999999", + "gray7" => "#888888", + "gray8" => "#777777", + "gray9" => "#666666", + "gray10" => "#555555", + "gray11" => "#444444", + "gray12" => "#333333", + "gray13" => "#222222", + "gray14" => "#111111", + "gray15" => "#000000", + "clustal-red" => "#ff1111", + "clustal-blue" => "#1155ff", + "clustal-green" => "#11dd11", + "clustal-cyan" => "#11ffff", + "clustal-yellow" => "#ffff11", + "clustal-orange" => "#ff7f11", + "clustal-pink" => "#ff11ff", + "clustal-purple" => "#6611cc", + "clustal-dull-blue" => "#197fe5", + "clustal-dark-gray" => "#666666", + "clustal-light-gray" => "#999999", + "lin-A" => "#90fe23", + "lin-R" => "#fe5e2d", + "lin-N" => "#2e3d2d", + "lin-D" => "#00903b", + "lin-C" => "#004baa", + "lin-Q" => "#864b00", + "lin-E" => "#3fa201", + "lin-G" => "#10fe68", + "lin-H" => "#b2063b", + "lin-I" => "#04ced9", + "lin-L" => "#4972fe", + "lin-K" => "#c4a100", + "lin-M" => "#2a84dd", + "lin-F" => "#a60ade", + "lin-P" => "#fe61fe", + "lin-S" => "#f7e847", + "lin-T" => "#fefeb3", + "lin-W" => "#4a007f", + "lin-Y" => "#e903a8", + "lin-V" => "#5bfdfd", + ); + + if ( exists $hash{ $color } ) { + return $hash{ $color }; + } else { + print STDERR 
sub color_contrast
{
    # Martin A. Hansen, October 2005.

    # Hash table with contrast colors to be used for foreground
    # text on a given background color. Several backgrounds have an
    # empty string as contrast color in the table.
    # For a background that is not in the table a warning is printed
    # to STDERR and nothing is returned.

    my ( $color,   # background color
       ) = @_;

    # Returns string.

    my ( %hash, @grays, @lins, @clustals );

    %hash = (
        "black"             => "white",
        "white"             => "black",
        "red"               => "white",
        "green"             => "white",
        "blue"              => "white",
        "cyan"              => "white",
        "magenta"           => "white",
        "yellow"            => "black",
        "purple"            => "white",
        "dull-blue"         => "white",
        "dark-green-blue"   => "white",
        "medium-green-blue" => "white",
        "bright-blue"       => "white",
        "dark-green"        => "white",
        "bright-green"      => "black",
        "orange"            => "",
        "orange-brown"      => "",
        "bright-red"        => "white",
        "light-gray"        => "black",
        "dark-gray"         => "white",
    );

    # gray0 .. gray15 and the lin-* colors have an empty contrast color.

    @grays = map { "gray$_" } 0 .. 15;
    @lins  = map { "lin-$_" } qw( A R N D C Q E G H I L K M F P S T W Y V );

    @hash{ @grays } = ( "" ) x @grays;
    @hash{ @lins }  = ( "" ) x @lins;

    # All clustal-* colors use black text.

    @clustals = map { "clustal-$_" } qw( red blue green cyan yellow orange pink purple dull-blue dark-gray light-gray );

    @hash{ @clustals } = ( "black" ) x @clustals;

    if ( exists $hash{ $color } ) {
        return $hash{ $color };
    }

    print STDERR qq(WARNING: color "$color" not found in palette!\n);

    # Explicit empty return: falling off the end handed print's return
    # value (1) back to the caller as if it were a color name.

    return;
}
sub seq_word_unpack
{
    # Martin A. Hansen, April 2008.

    # Unpacks a binary sequence word to ASCII: reads $word_size
    # three-bit codes from the binary string and maps each code to
    # its residue.

    my ( $bin,         # Binary sequence word
         $word_size,   # Size of word
       ) = @_;

    # Returns string.

    my ( %residue_of, $bits, $word );

    %residue_of = (
        '000' => 'A',
        '001' => 'T',
        '010' => 'C',
        '100' => 'G',
        '011' => 'N',
        '101' => '-',
        '110' => '.',
        '111' => '~',
    );

    foreach $bits ( unpack "(B3)$word_size", $bin ) {
        $word .= $residue_of{ $bits };
    }

    return $word;
}
sub define_color_code
{
    # from Solid - ABI

    # Builds the table that maps each di-base (two letters out of
    # A, C, G, T and N) to its color code 0 - 6.

    # Returns hash.

    my ( @dibases_of, $code, %color );

    # The position in this list is the color code of the di-bases in it.

    @dibases_of = (
        [ qw( AA CC GG TT ) ],
        [ qw( AC CA GT TG ) ],
        [ qw( AG CT GA TC ) ],
        [ qw( AT CG GC TA ) ],
        [ qw( AN CN GN TN ) ],
        [ qw( NA NC NG NT ) ],
        [ qw( NN ) ],
    );

    foreach $code ( 0 .. $#dibases_of ) {
        $color{ $_ } = $code foreach @{ $dibases_of[ $code ] };
    }

    return(%color);
}
sub seq2color_space
{
    # Martin A. Hansen, April 2008.

    # Converts a sequence to di-base encoded Solid sequence:
    # the first residue is kept as it is and is followed by the
    # %CONVERT_HASH code of every overlapping pair of residues.

    my ( $seq,   # sequence
       ) = @_;

    # Returns a string.

    my ( $encoded, $pos );

    $encoded = substr $seq, 0, 1;

    # One code per pair, so the last pair starts at the second last residue.

    foreach $pos ( 0 .. length( $seq ) - 2 ) {
        $encoded .= $CONVERT_HASH{ substr( $seq, $pos, 2 ) };
    }

    return $encoded;
}
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines for manipulation of the Stockholm format.
+# http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+use Data::Dumper;
+use Maasha::Common;
+use vars qw ( @ISA @EXPORT );
+
+require Exporter;
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub get_stockholm_entry
+{
+    # Martin A. Hansen, February 2007.
+
+    # Given a file handle, returns the next stockholm
+    # entry as a list of lines.
+
+    # Lines are read up to and including the "//" terminator line (or to
+    # end of file). Trailing newlines and carriage returns are removed.
+    # When nothing could be read, an empty list (undef in scalar context)
+    # is returned, so the sub can be used as a loop condition in either
+    # context.
+
+    my ( $fh,   # file handle
+       ) = @_;
+
+    # returns a list
+
+    my ( $line, @lines );
+
+    return if not defined $fh;
+
+    # Test definedness explicitly: a last line consisting of "0" without
+    # a newline is false, but is still a line.
+
+    while ( defined( $line = <$fh> ) )
+    {
+        chomp $line;
+
+        $line =~ s/\r$//;
+
+        push @lines, $line;
+
+        last if $line eq "//";
+    }
+
+    return if not @lines;
+
+    return wantarray ? @lines : \@lines;
+}
+
+
+sub parse_stockholm_entry
+{
+    # Martin A. Hansen, February 2007.
+
+    # given a Stockholm entry as a list of lines,
+    # parses this into an elaborate data structure.
+
+    # Compulsory fields: AC ID DE AU SE SS BM GA TC NC TP SQ
+    # Non-compulsory fields: PI DC DR RC RN RM RT RA RL CC
+
+    # The returned hash has these keys:
+    #   GF    => { feature => text }  - per-file annotation; multiple lines
+    #                                   of the same feature are joined with " ".
+    #   GS    => { seq => { feature => [ text, ... ] } }  - per-sequence annotation.
+    #   GR    => { seq => { feature => [ text, ... ] } }  - per-residue annotation.
+    #   ALIGN => [ [ name, sequence ], ... ]  - alignment rows in order of first
+    #            appearance (rows split over several blocks are concatenated),
+    #            followed by a [ "SS_cons", ... ] row and, if present, a
+    #            [ "RF", ... ] row taken from the #=GC lines.
+
+    my ( $entry,   # stockholm entry
+       ) = @_;
+
+    # returns data structure
+
+    my ( $line, $key, %hash, %align_hash, @align_list, @align );
+
+    foreach $line ( @{ $entry } )
+    {
+        next if $line =~ /^# /;
+
+        if ( $line =~ /^#=GF\s+([^\s]+)\s+(.*)$/ )
+        {
+            push @{ $hash{ "GF" }{ $1 } }, $2;
+        }
+        elsif ( $line =~ /^#=GC\s+([^\s]+)\s+(.*)$/ )
+        {
+            push @{ $hash{ "GC" }{ $1 } }, $2;
+        }
+        elsif ( $line =~ /^#=GS\s+([^\s]+)\s+([^\s]+)\s+(.*)$/ )
+        {
+            push @{ $hash{ "GS" }{ $1 }{ $2 } }, $3;
+        }
+        elsif ( $line =~ /^#=GR\s+([^\s]+)\s+([^\s]+)\s+(.*)$/ )
+        {
+            push @{ $hash{ "GR" }{ $1 }{ $2 } }, $3;
+        }
+        elsif ( $line =~ /^([^\s]+)\s+(.+)$/ )
+        {
+            push @align_list, $1 if not exists $align_hash{ $1 };
+
+            $align_hash{ $1 } .= $2;
+        }
+    }
+
+    # Collapse the collected line lists: GF text is joined with spaces,
+    # GC (per-column) strings are concatenated across blocks.
+
+    foreach $key ( keys %{ $hash{ "GF" } } ) {
+        $hash{ "GF" }{ $key } = join " ", @{ $hash{ "GF" }{ $key } };
+    }
+
+    foreach $key ( keys %{ $hash{ "GC" } } ) {
+        $hash{ "GC" }{ $key } = join "", @{ $hash{ "GC" }{ $key } };
+    }
+
+    foreach $key ( @align_list ) {
+        push @align, [ $key, $align_hash{ $key } ];
+    }
+
+    # The SS_cons row is always appended (with an undefined string if the
+    # entry has no #=GC SS_cons line); the RF row only when present.
+
+    push @align, [ "SS_cons", $hash{ "GC" }{ "SS_cons" } ];
+    push @align, [ "RF", $hash{ "GC" }{ "RF" } ] if $hash{ "GC" }{ "RF" };
+
+    delete $hash{ "GC" };
+
+    $hash{ "ALIGN" } = \@align;
+
+    return wantarray ? %hash : \%hash;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+1;
diff --git a/code_perl/Maasha/Test.pm b/code_perl/Maasha/Test.pm
new file mode 100644
index 0000000..c5a319f
--- /dev/null
+++ b/code_perl/Maasha/Test.pm
@@ -0,0 +1,24 @@
+package Maasha::Test;
+
+
+use warnings;
+use strict;
+
+require Exporter;
+
+use vars qw( @ISA @EXPORT @EXPORT_OK );
+
+@ISA = qw( Exporter );
+
+@EXPORT_OK = qw(
+    hello_world
+);
+
+sub hello_world
+{
+    # Prints a greeting to STDOUT.
+
+    print "Hello Word\n";
+}
+
+# Runs when the interpreter exits, i.e. every program that loads
+# this module prints the greeting on termination.
+
+END {
+    &hello_world;
+}
+
+1;
diff --git a/code_perl/Maasha/TwoBit.pm b/code_perl/Maasha/TwoBit.pm
new file mode 100644
index 0000000..cc9d600
--- /dev/null
+++ b/code_perl/Maasha/TwoBit.pm
@@ -0,0 +1,746 @@
+package Maasha::TwoBit;
+
+# Copyright (C) 2008 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Stuff for interacting with the 2bit format as described here:
+# http://genome.ucsc.edu/FAQ/FAQformat#format7
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use warnings;
+use strict;
+use vars qw( @ISA @EXPORT );
+
+use Data::Dumper;
+
+use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "TMP_DIR" } );
+
+int find_block_beg( char *string, char c, int beg, int len )
+{
+    /* Martin A. Hansen, March 2008 */
+
+    /* Given a string and a begin position, locates the next */
+    /* position in the string MATCHING a given char. */
+    /* This position is returned. If the char is not found -1 is returned. */
+
+    int i;
+
+    for ( i = beg; i < len; i++ )
+    {
+        if ( string[ i ] == c ) {
+            return i;
+        }
+    }
+
+    return -1;
+}
+
+
+int find_block_len( char *string, char c, int beg, int len )
+{
+    /* Martin A. Hansen, March 2008 */
+
+    /* Given a string and a begin position, determines the length of */
+    /* the block of a given char starting at that position. */
+    /* The length of that block is returned (0 if string[ beg ] differs). */
+
+    int i;
+
+    i = beg;
+
+    while ( i < len && string[ i ] == c )
+    {
+        i++;
+    }
+
+    return i - beg;
+}
+
+
+/* Letter ('A' .. 'Z') to 2 bit code: T = 0, C = 1, A = 2, G = 3. */
+/* All other letters hold the marker value 255. */
+
+unsigned char l2n[26] = {
+      2, 255,   1, 255, 255, 255,   3, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255,   0, 255, 255, 255, 255, 255, 255
+};
+
+void dna2bin( char *raw, int size )
+{
+    /* Khisanth from #perl March 2008 */
+
+    /* Encodes a DNA string to a bit array: 4 bases per byte, first base */
+    /* in the two most significant bits. Only size / 4 whole bytes are */
+    /* produced, and the string is expected to hold the upper case */
+    /* letters A, C, G and T only. */
+
+    Inline_Stack_Vars;
+    unsigned int i = 0;
+    unsigned char packed_value = 0;
+    char *packed = malloc( size / 4 + 1 );   /* + 1 so packed[0] is valid when size < 4 */
+    packed[0] = 0;
+
+    for( i = 0; i < size / 4; i++ ) {
+        packed_value = l2n[ raw[i*4] - 'A' ] << 6
+                     | l2n[ raw[i*4+1] - 'A' ] << 4
+                     | l2n[ raw[i*4+2] - 'A' ] << 2
+                     | l2n[ raw[i*4+3] - 'A' ];
+        packed[i] = packed_value;
+    }
+
+    Inline_Stack_Reset;
+    Inline_Stack_Push(sv_2mortal(newSVpvn(packed, size / 4 )));
+    Inline_Stack_Done;
+    free(packed);
+}
+
+
+void bin2dna( char *raw, int size )
+{
+    /* Khisanth from #perl, March 2008 */
+
+    /* Converts a bit array of size bytes to DNA which is returned. */
+    /* Each byte holds 4 bases of 2 bits each (T = 0, C = 1, A = 2, G = 3) */
+    /* with the first base in the most significant bits. */
+
+    Inline_Stack_Vars;
+    static const char bases[] = "TCAG";
+    char *unpacked = malloc( 4 * size + 1 );
+    unsigned char byte;
+    int i = 0;
+
+    unpacked[0] = 0;
+
+    for( i = 0; i < size; i++ ) {
+        byte = ( unsigned char ) raw[i];
+
+        unpacked[ i*4 ]     = bases[ ( byte >> 6 ) & 3 ];
+        unpacked[ i*4 + 1 ] = bases[ ( byte >> 4 ) & 3 ];
+        unpacked[ i*4 + 2 ] = bases[ ( byte >> 2 ) & 3 ];
+        unpacked[ i*4 + 3 ] = bases[ byte & 3 ];
+    }
+
+    Inline_Stack_Reset;
+    Inline_Stack_Push(sv_2mortal(newSVpvn(unpacked, 4 * size)));
+    Inline_Stack_Done;
+    free(unpacked);
+}
+
+
+void bin2dna_old( char *bin, int bin_len )
+{
+    /* Martin A. Hansen, March 2008 */
+
+    /* Converts a binary string of '0' and '1' characters to DNA which */
+    /* is returned: 00 = T, 01 = C, 10 = A, 11 = G. */
+
+    Inline_Stack_Vars;
+
+    int i, c;
+
+    char *dna = ( char* )( malloc( bin_len / 2 + 1 ) );
+
+    c = 0;
+
+    for ( i = 1; i < bin_len; i += 2 )
+    {
+        if ( bin[ i - 1 ] == '1' )
+        {
+            if ( bin[ i ] == '1' ) {
+                dna[ c ] = 'G';
+            } else {
+                dna[ c ] = 'A';
+            }
+        }
+        else
+        {
+            if ( bin[ i ] == '1' ) {
+                dna[ c ] = 'C';
+            } else {
+                dna[ c ] = 'T';
+            }
+        }
+
+        c++;
+    }
+
+    Inline_Stack_Reset;
+    Inline_Stack_Push( sv_2mortal( newSVpvn( dna, ( bin_len / 2 ) ) ) );
+    Inline_Stack_Done;
+
+    free( dna );
+}
+
+
+void hard_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
+{
+    /* Martin A. Hansen, March 2008 */
+
+    /* Hard masks a sequence in a given interval, which is trimmed, */
+    /* if it does not match the sequence. */
+
+    /* beg and len describe the block in full sequence coordinates, while */
+    /* seq holds the subsequence starting at sub_beg with length sub_len. */
+    /* Only the part of the block inside the subsequence is overwritten */
+    /* with 'N' (in place); nothing is written if they do not overlap. */
+
+    int i, mask_beg, mask_end;
+
+    mask_beg = ( beg > sub_beg ) ? beg : sub_beg;
+    mask_end = ( beg + len < sub_beg + sub_len ) ? beg + len : sub_beg + sub_len;
+
+    for ( i = mask_beg; i < mask_end; i++ ) {
+        seq[ i - sub_beg ] = 'N';
+    }
+}
+
+
+void soft_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
+{
+    /* Martin A. Hansen, March 2008 */
+
+    /* Soft masks a sequence in a given interval, which is trimmed, */
+    /* if it does not match the sequence. */
+
+    /* Coordinates are interpreted as in hard_mask. The case of every */
+    /* letter in the overlapping part is toggled in place by flipping */
+    /* the ASCII case bit. */
+
+    int i, mask_beg, mask_end;
+
+    mask_beg = ( beg > sub_beg ) ? beg : sub_beg;
+    mask_end = ( beg + len < sub_beg + sub_len ) ? beg + len : sub_beg + sub_len;
+
+    for ( i = mask_beg; i < mask_end; i++ ) {
+        seq[ i - sub_beg ] = seq[ i - sub_beg ] ^ ' ';
+    }
+}
+
+END_C
+
+
+use Maasha::Common;
+use Maasha::Fasta;
+use Maasha::Seq;
+
+use constant {
+    SEQ_NAME => 0,
+    SEQ      => 1,
+};
+
+require Exporter;
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub twobit_get_TOC
+{
+    # Martin A. Hansen, March 2008.
+
+    # Fetches the table of contents (TOC) from a 2bit file.
+    # The TOC is returned as a list of lists, one [ name, offset ]
+    # pair per sequence in file order.
+
+    # The 2bit format is described here:
+    # http://genome.ucsc.edu/FAQ/FAQformat#format7
+
+    my ( $fh,   # filehandle
+       ) = @_;
+
+    # Returns AoA.
+
+    my ( $signature, $version, $seq_count, $reserved, $i, $seq_name_size, $string, $seq_name, $offset, @AoA );
+
+    sysseek $fh, 0, 0;
+
+    # 16 byte header of four little endian 32 bit words.
+
+    $signature = unpack_32bit( $fh );
+    $version   = unpack_32bit( $fh );
+    $seq_count = unpack_32bit( $fh );
+    $reserved  = unpack_32bit( $fh );
+
+    Maasha::Common::error( qq(2bit file signature didn't match - inverse bit order?) ) if $signature != 0x1A412743;
+
+    for ( $i = 0; $i < $seq_count; $i++ )
+    {
+        $seq_name_size = unpack_8bit( $fh );
+
+        sysread $fh, $string, $seq_name_size;
+
+        $seq_name = unpack( "A$seq_name_size", $string );
+
+        $offset = unpack_32bit( $fh );
+
+        push @AoA, [ $seq_name, $offset ];
+    }
+
+    return wantarray ? @AoA : \@AoA;
+}
+
+
+sub twobit_get_seq
+{
+    # Martin A. Hansen, March 2008.
+
+    # Given a filehandle to a 2bit file, gets the sequence
+    # or subsequence from a 2bit file entry at the given
+    # offset position.
+
+    # The requested interval is trimmed to the sequence: a begin beyond the
+    # sequence end yields an empty string, and a missing or too large length
+    # means "to the end of the sequence". N blocks are always restored; soft
+    # masked blocks are only lower cased if the mask flag is set.
+
+    # The 2bit format is described here:
+    # http://genome.ucsc.edu/FAQ/FAQformat#format7
+
+    my ( $fh,        # filehandle
+         $offset,    # byte position
+         $sub_beg,   # begin of subsequence  - OPTIONAL
+         $sub_len,   # length of subsequence - OPTIONAL
+         $mask,      # retrieve soft mask information flag - OPTIONAL
+       ) = @_;
+
+    # Returns a string.
+
+    my ( $seq_len, $n_count, $n, @n_begs, @n_sizes, $m_count, $m, @m_begs, @m_sizes, $reserved, $seq );
+
+    $sub_beg ||= 0;
+
+    sysseek $fh, $offset, 0;
+
+    $seq_len = unpack_32bit( $fh );
+
+    # Clamp the interval so that $sub_beg + $sub_len never exceeds the sequence.
+
+    $sub_beg = $seq_len if $sub_beg > $seq_len;
+    $sub_len = $seq_len - $sub_beg if not $sub_len or $sub_beg + $sub_len > $seq_len;
+
+    $n_count = unpack_32bit( $fh );
+
+    push @n_begs,  unpack_32bit( $fh ) for 1 .. $n_count;
+    push @n_sizes, unpack_32bit( $fh ) for 1 .. $n_count;
+
+    $m_count = unpack_32bit( $fh );
+
+    push @m_begs,  unpack_32bit( $fh ) for 1 .. $m_count;
+    push @m_sizes, unpack_32bit( $fh ) for 1 .. $m_count;
+
+    $reserved = unpack_32bit( $fh );
+
+    # Skip the record header: sequence length, N block count, N begins and
+    # sizes, mask block count, mask begins and sizes, and the reserved word.
+
+    $offset += 4 + 4 + $n_count * 8 + 4 + $m_count * 8 + 4;
+
+    $seq = unpack_dna( $fh, $offset, $sub_beg, $sub_len );
+
+    # hard_mask and soft_mask modify $seq in place.
+
+    for ( $n = 0; $n < $n_count; $n++ )
+    {
+        hard_mask( $seq, $n_begs[ $n ], $n_sizes[ $n ], $sub_beg, $sub_len );
+
+        last if $sub_beg + $sub_len < $n_begs[ $n ];
+    }
+
+    if ( $mask )
+    {
+        for ( $m = 0; $m < $m_count; $m++ )
+        {
+            soft_mask( $seq, $m_begs[ $m ], $m_sizes[ $m ], $sub_beg, $sub_len );
+
+            last if $sub_beg + $sub_len < $m_begs[ $m ];
+        }
+    }
+
+    return $seq;
+}
+
+
+sub unpack_8bit
+{
+    # Martin A. Hansen, March 2008.
+
+    # Reads in 8 bits from the given filehandle
+    # and returns the encoded value.
+
+    # NB swap still needs fixing (the flag is currently ignored).
+
+    my ( $fh,     # filehandle
+         $swap,   # bit order swap flag - OPTIONAL
+       ) = @_;
+
+    # Returns integer.
+
+    my ( $string, $val );
+
+    sysread $fh, $string, 1;
+
+    $val = unpack( "C", $string );
+
+    return $val;
+}
+
+
+sub unpack_32bit
+{
+    # Martin A. Hansen, March 2008.
+
+    # Reads in 32 bits from the given filehandle
+    # and returns the encoded value. The bytes are read as
+    # little endian unless the swap flag is set, in which
+    # case they are read as big endian.
+
+    my ( $fh,     # filehandle
+         $swap,   # bit order swap flag - OPTIONAL
+       ) = @_;
+
+    # Returns integer.
+
+    my ( $string, $val );
+
+    sysread $fh, $string, 4;
+
+    if ( $swap ) {
+        $val = unpack( "N", $string );
+    } else {
+        $val = unpack( "V", $string );
+    }
+
+    return $val;
+}
+
+
+sub unpack_dna
+{
+    # Martin A. Hansen, March 2008.
+
+    # Unpacks the DNA beginning at the given filehandle.
+    # The DNA packed to two bits per base, where the first
+    # base is in the most significant 2-bit byte; the last
+    # base is in the least significant 2 bits. The packed
+    # DNA field is padded with 0 bits as necessary to take
+    # an even multiple of 32 bits in the file.
+
+    # Only the bytes covering bases $beg .. $beg + $len - 1 are read; the
+    # surplus bases of the first and last byte are trimmed off afterwards.
+
+    # NB swap still needs fixing (the flag is currently ignored).
+
+    my ( $fh,       # filehandle
+         $offset,   # file offset
+         $beg,      # sequence beg
+         $len,      # sequence length
+         $swap,     # bit order swap flag - OPTIONAL
+       ) = @_;
+
+    # Returns a string.
+
+    my ( $bin, $bin_beg, $bin_end, $bin_len, $dna, $skip );
+
+    return "" if $len <= 0;
+
+    $bin_beg = int( $beg / 4 );                   # byte holding the first base
+    $bin_end = int( ( $beg + $len - 1 ) / 4 );    # byte holding the last base
+    $bin_len = $bin_end - $bin_beg + 1;
+
+    sysseek $fh, $offset + $bin_beg, 0;
+    sysread $fh, $bin, $bin_len;
+
+    # Use the number of bytes actually read, which is smaller than
+    # $bin_len if the file is truncated.
+
+    $dna = bin2dna( $bin, length $bin );
+
+    $skip = $beg - $bin_beg * 4;
+
+    return "" if $skip >= length $dna;
+
+    return substr $dna, $skip, $len;
+}
+
+
+sub fasta2twobit
+{
+    # Martin A. Hansen, March 2008.
+
+    # Converts a FASTA file to 2bit format.
+
+    # The input is traversed twice: first all entries are indexed to
+    # determine the record offsets needed for the table of contents, and
+    # then the sequences are re-read from the file positions saved in the
+    # index and written as packed records. The position bookkeeping assumes
+    # one header line followed by one sequence line per entry. Both file
+    # handles are closed when done.
+
+    my ( $fh_in,    # file handle to FASTA file
+         $fh_out,   # output file handle - OPTIONAL
+         $mask,     # preserve soft masking - OPTIONAL
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $seq_offset, $offset, $entry, $mask_index, $seq_len, $seq_name_len, $pack_len, $rec_len, $index, $bin, $seq );
+
+    $fh_out = \*STDOUT if not $fh_out;
+
+    $index = [];   # stays an empty list if the FASTA file holds no entries
+
+    # ---- Creating content index ----
+
+    $seq_offset = 0;    # offset for reading sequence from FASTA file
+    $offset     = 16;   # offset starting after header line which is 16 bytes
+
+    while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
+    {
+        $seq_len      = length $entry->[ SEQ ];
+        $seq_name_len = length $entry->[ SEQ_NAME ];
+
+        $mask_index = mask_locate( $entry->[ SEQ ], $mask );
+
+        # Must agree with the padding applied in pack_dna.
+
+        $pack_len = ( $seq_len + ( 4 - ( $seq_len ) % 4 ) ) / 4;
+
+        $rec_len = (
+              4                                # Sequence length
+            + 4                                # N blocks
+            + 4 * $mask_index->{ "N_COUNT" }   # N begins
+            + 4 * $mask_index->{ "N_COUNT" }   # N lengths
+            + 4                                # M blocks
+            + 4 * $mask_index->{ "M_COUNT" }   # M begins
+            + 4 * $mask_index->{ "M_COUNT" }   # M lengths
+            + 4                                # reserved
+            + $pack_len                        # Packed DNA - 32 bit multiplum of 2 bit/base sequence in bytes
+        );
+
+        push @{ $index }, {
+            SEQ_NAME     => $entry->[ SEQ_NAME ],
+            SEQ_NAME_LEN => $seq_name_len,
+            SEQ_BEG      => $seq_offset + $seq_name_len + 2,
+            SEQ_LEN      => $seq_len,
+            N_COUNT      => $mask_index->{ "N_COUNT" },
+            N_BEGS       => $mask_index->{ "N_BEGS" },
+            N_LENS       => $mask_index->{ "N_LENS" },
+            M_COUNT      => $mask_index->{ "M_COUNT" },
+            M_BEGS       => $mask_index->{ "M_BEGS" },
+            M_LENS       => $mask_index->{ "M_LENS" },
+            REC_LEN      => $rec_len,
+        };
+
+        $offset += (
+              1               # 1 byte SEQ_NAME size
+            + $seq_name_len   # SEQ_NAME depending on SEQ_NAME size
+            + 4               # 32 bit offset position of sequence record
+        );
+
+        $seq_offset += $seq_name_len + 2 + $seq_len + 1;
+    }
+
+    # ---- Printing Header ----
+
+    $bin = pack( "V4", 0x1A412743, 0, scalar @{ $index }, 0 );   # signature, version, sequence count and reserved
+
+    print $fh_out $bin;
+
+    # ---- Printing TOC ----
+
+    $bin = "";
+
+    foreach $entry ( @{ $index } )
+    {
+        $bin .= pack( "C", $entry->{ "SEQ_NAME_LEN" } );                           # 1 byte SEQ_NAME size
+        $bin .= pack( qq(A$entry->{ "SEQ_NAME_LEN" }), $entry->{ "SEQ_NAME" } );   # SEQ_NAME depending on SEQ_NAME size
+        $bin .= pack( "V", $offset );                                              # 32 bit offset position of sequence record
+
+        $offset += $entry->{ "REC_LEN" };
+    }
+
+    print $fh_out $bin;
+
+    # ---- Printing Records ----
+
+    foreach $entry ( @{ $index } )
+    {
+        $bin = "";
+
+        $bin .= pack( "V", $entry->{ "SEQ_LEN" } );
+        $bin .= pack( "V", $entry->{ "N_COUNT" } );
+
+        $bin .= pack( "V", $_ ) for @{ $entry->{ "N_BEGS" } };
+        $bin .= pack( "V", $_ ) for @{ $entry->{ "N_LENS" } };
+
+        $bin .= pack( "V", $entry->{ "M_COUNT" } );
+
+        $bin .= pack( "V", $_ ) for @{ $entry->{ "M_BEGS" } };
+        $bin .= pack( "V", $_ ) for @{ $entry->{ "M_LENS" } };
+
+        $bin .= pack( "V", 0 );
+
+        sysseek $fh_in, $entry->{ "SEQ_BEG" }, 0;
+        sysread $fh_in, $seq, $entry->{ "SEQ_LEN" };
+
+        # Ambiguity codes cannot be stored in 2 bits and are written as T;
+        # the N blocks recorded above mark where the real Ns are.
+
+        $seq = uc $seq;
+        $seq =~ tr/RYWSMKHDVBN/TTTTTTTTTTT/;
+
+        $bin .= pack_dna( $seq );
+
+        print $fh_out $bin;
+    }
+
+    close $fh_in;
+    close $fh_out;
+}
+
+
+sub pack_dna
+{
+    # Martin A. Hansen, March 2008.
+
+    # Packs a DNA sequence into a bit array, The DNA packed to two bits per base,
+    # represented as so: T - 00, C - 01, A - 10, G - 11. The first base is
+    # in the most significant 2-bit byte; the last base is in the least significant
+    # 2 bits. For example, the sequence TCAG is represented as 00011011.
+    # The packedDna field is padded with 0 bits as necessary to take an even
+    # multiple of 32 bits in the file.
+
+    # The sequence is padded with 1 - 4 Ts (i.e. 0 bits) to a whole number
+    # of bytes; note that a sequence whose length already is a multiple of 4
+    # gets 4 Ts, which matches the record length calculated in fasta2twobit.
+
+    my ( $dna,   # dna string to pack
+       ) = @_;
+
+    # Returns bit array
+
+    my ( $bin );
+
+    $dna .= "T" x ( 4 - ( length( $dna ) % 4 ) );
+
+    $bin = dna2bin( $dna, length $dna );
+
+    return $bin;
+}
+
+
+sub mask_locate
+{
+    # Martin A. Hansen, March 2008.
+
+    # Locate N-blocks and M-blocks in a given sequence.
+    # These blocks are continuous stretches of Ns and Ms in a string,
+    # and the begins and lengths of these blocks are saved in a
+    # hash along with the count of each block type.
+
+    # Before the search lower case a, t, c, g and u are translated to M
+    # (soft masked), and n plus all ambiguity codes in either case are
+    # translated to N. M-blocks are only located if the mask flag is set.
+
+    my ( $seq,    # Sequence
+         $mask,   # preserve soft masking flag - OPTIONAL
+       ) = @_;
+
+    # Returns a hash.
+
+    my ( $n_mask, $m_mask, $seq_len, $pos, $n_beg, $n_len, $m_beg, $m_len, @n_begs, @n_lens, @m_begs, @m_lens, %mask_hash );
+
+    $seq =~ tr/atcgunRYWSMKHDVBrywsmkhdvb/MMMMMNNNNNNNNNNNNNNNNNNNNN/;
+
+    $n_mask = 1;   # always mask Ns.
+    $m_mask = $mask || 0;
+
+    $seq_len = length $seq;
+
+    $pos = 0;
+
+    # Each round finds the next block of either type from $pos and records
+    # the one that begins first; a type is switched off once no more blocks
+    # of it can be found.
+
+    while ( $n_mask or $m_mask )
+    {
+        if ( $n_mask )
+        {
+            $n_beg = find_block_beg( $seq, "N", $pos, $seq_len );
+
+            $n_mask = 0 if $n_beg < 0;
+        }
+
+        if ( $m_mask )
+        {
+            $m_beg = find_block_beg( $seq, "M", $pos, $seq_len );
+
+            $m_mask = 0 if $m_beg < 0;
+        }
+
+        if ( $n_mask and $m_mask )
+        {
+            if ( $n_beg < $m_beg )
+            {
+                $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
+
+                push @n_begs, $n_beg;
+                push @n_lens, $n_len;
+
+                $pos = $n_beg + $n_len;
+            }
+            else
+            {
+                $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
+
+                push @m_begs, $m_beg;
+                push @m_lens, $m_len;
+
+                $pos = $m_beg + $m_len;
+            }
+        }
+        elsif ( $n_mask )
+        {
+            $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
+
+            push @n_begs, $n_beg;
+            push @n_lens, $n_len;
+
+            $pos = $n_beg + $n_len;
+        }
+        elsif ( $m_mask )
+        {
+            $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
+
+            push @m_begs, $m_beg;
+            push @m_lens, $m_len;
+
+            $pos = $m_beg + $m_len;
+        }
+        else
+        {
+            last;
+        }
+    }
+
+    %mask_hash = (
+        N_COUNT => scalar @n_begs,
+        N_BEGS  => [ @n_begs ],
+        N_LENS  => [ @n_lens ],
+        M_COUNT => scalar @m_begs,
+        M_BEGS  => [ @m_begs ],
+        M_LENS  => [ @m_lens ],
+    );
+
+    return wantarray ? %mask_hash : \%mask_hash;
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+1;
diff --git a/code_perl/Maasha/UCSC.pm b/code_perl/Maasha/UCSC.pm
new file mode 100644
index 0000000..5170725
--- /dev/null
+++ b/code_perl/Maasha/UCSC.pm
@@ -0,0 +1,1582 @@
+package Maasha::UCSC;
+
+# Copyright (C) 2007 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Stuff for interacting with UCSC genome browser + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; +use vars qw ( @ISA @EXPORT ); + +use Data::Dumper; +use Time::HiRes qw( gettimeofday ); + +use Maasha::Common; +use Maasha::Calc; +use Maasha::Matrix; + +use constant { + CHR_BEG => 0, + NEXT_CHR_BEG => 1, + CHR_END => 2, + INDEX_BEG => 3, + INDEX_LEN => 4, +}; + +@ISA = qw( Exporter ); + +my $TIME = gettimeofday(); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED + + +sub bed_get_entry +{ + # Martin A. Hansen, December 2007. + + # Reads a bed entry given a filehandle. + + my ( $fh, # file handle + $columns, # number of BED columns to read - OPTIONAL + ) = @_; + + # Returns hashref. 
+ + my ( $line, @fields, %entry ); + + $line = <$fh>; + + $line =~ s/(\n|\r)$//g; # some people have carriage returns in their BED files -> Grrrr + + return if not defined $line; + + @fields = split "\t", $line; + + $columns ||= scalar @fields; + + if ( $columns == 3 ) + { + %entry = ( + "CHR" => $fields[ 0 ], + "CHR_BEG" => $fields[ 1 ], + "CHR_END" => $fields[ 2 ] - 1, + ); + } + elsif ( $columns == 4 ) + { + %entry = ( + "CHR" => $fields[ 0 ], + "CHR_BEG" => $fields[ 1 ], + "CHR_END" => $fields[ 2 ] - 1, + "Q_ID" => $fields[ 3 ], + ); + } + elsif ( $columns == 5 ) + { + %entry = ( + "CHR" => $fields[ 0 ], + "CHR_BEG" => $fields[ 1 ], + "CHR_END" => $fields[ 2 ] - 1, + "Q_ID" => $fields[ 3 ], + "SCORE" => $fields[ 4 ], + ); + } + elsif ( $columns == 6 ) + { + %entry = ( + "CHR" => $fields[ 0 ], + "CHR_BEG" => $fields[ 1 ], + "CHR_END" => $fields[ 2 ] - 1, + "Q_ID" => $fields[ 3 ], + "SCORE" => $fields[ 4 ], + "STRAND" => $fields[ 5 ], + ); + } + elsif ( $columns == 12 ) + { + %entry = ( + "CHR" => $fields[ 0 ], + "CHR_BEG" => $fields[ 1 ], + "CHR_END" => $fields[ 2 ] - 1, + "Q_ID" => $fields[ 3 ], + "SCORE" => $fields[ 4 ], + "STRAND" => $fields[ 5 ], + "THICK_BEG" => $fields[ 6 ], + "THICK_END" => $fields[ 7 ] - 1, + "ITEMRGB" => $fields[ 8 ], + "BLOCKCOUNT" => $fields[ 9 ], + "BLOCKSIZES" => $fields[ 10 ], + "Q_BEGS" => $fields[ 11 ], + ); + } + else + { + &Maasha::Common::error( qq(Bad BED format in line->$line<-) ); + } + + $entry{ "REC_TYPE" } = "BED"; + $entry{ "BED_LEN" } = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1; + $entry{ "BED_COLS" } = $columns; + + return wantarray ? %entry : \%entry; +} + + +sub bed_get_entries +{ + # Martin A. Hansen, January 2008. + + # Given a path to a BED file, read in all entries + # and return. + + my ( $path, # full path to BED file + $columns, # number of columns in BED file - OPTIONAL (but is faster) + ) = @_; + + # Returns a list. 
+ + my ( $fh, $entry, @list ); + + $fh = &Maasha::Common::read_open( $path ); + + while ( $entry = &bed_get_entry( $fh ) ) { + push @list, $entry; + } + + close $fh; + + return wantarray ? @list : \@list; +} + + +sub bed_put_entry +{ + # Martin A. Hansen, Septermber 2007. + + # Writes a BED entry to file. + + # NB, this could really be more robust!? + + my ( $record, # hashref + $fh, # file handle - OPTIONAL + $columns, # number of columns in BED file - OPTIONAL (but is faster) + ) = @_; + + # Returns nothing. + + my ( @fields ); + + $columns ||= 12; # max number of columns possible + + if ( $columns == 3 ) + { + push @fields, $record->{ "CHR" }; + push @fields, $record->{ "CHR_BEG" }; + push @fields, $record->{ "CHR_END" } + 1; + } + elsif ( $columns == 4 ) + { + $record->{ "Q_ID" } =~ s/\s+/_/g; + + push @fields, $record->{ "CHR" }; + push @fields, $record->{ "CHR_BEG" }; + push @fields, $record->{ "CHR_END" } + 1; + push @fields, $record->{ "Q_ID" }; + } + elsif ( $columns == 5 ) + { + $record->{ "Q_ID" } =~ s/\s+/_/g; + $record->{ "SCORE" } =~ s/\.\d*//; + + push @fields, $record->{ "CHR" }; + push @fields, $record->{ "CHR_BEG" }; + push @fields, $record->{ "CHR_END" } + 1; + push @fields, $record->{ "Q_ID" }; + push @fields, $record->{ "SCORE" }; + } + elsif ( $columns == 6 ) + { + $record->{ "Q_ID" } =~ s/\s+/_/g; + $record->{ "SCORE" } =~ s/\.\d*//; + + push @fields, $record->{ "CHR" }; + push @fields, $record->{ "CHR_BEG" }; + push @fields, $record->{ "CHR_END" } + 1; + push @fields, $record->{ "Q_ID" }; + push @fields, $record->{ "SCORE" }; + push @fields, $record->{ "STRAND" }; + } + else + { + $record->{ "Q_ID" } =~ s/\s+/_/g; + $record->{ "SCORE" } =~ s/\.\d*//; + + push @fields, $record->{ "CHR" }; + push @fields, $record->{ "CHR_BEG" }; + push @fields, $record->{ "CHR_END" } + 1; + push @fields, $record->{ "Q_ID" }; + push @fields, $record->{ "SCORE" }; + push @fields, $record->{ "STRAND" }; + push @fields, $record->{ "THICK_BEG" } if defined 
$record->{ "THICK_BEG" }; + push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" }; + push @fields, $record->{ "ITEMRGB" } if defined $record->{ "ITEMRGB" }; + push @fields, $record->{ "BLOCKCOUNT" } if defined $record->{ "BLOCKCOUNT" }; + push @fields, $record->{ "BLOCKSIZES" } if defined $record->{ "BLOCKSIZES" }; + push @fields, $record->{ "Q_BEGS" } if defined $record->{ "Q_BEGS" }; + } + + if ( $fh ) { + print $fh join( "\t", @fields ), "\n"; + } else { + print join( "\t", @fields ), "\n"; + } +} + + +sub bed_put_entries +{ + # Martin A. Hansen, January 2008. + + # Write a list of BED entries. + + my ( $entries, # list of entries, + $fh, # file handle - OPTIONAL + ) = @_; + + # Returns nothing. + + map { &bed_put_entry( $_, $fh ) } @{ $entries }; +} + + +sub bed_analyze +{ + # Martin A. Hansen, March 2008. + + # Given a bed record, analysis this to give information + # about intron/exon sizes. + + my ( $entry, # BED entry + ) = @_; + + # Returns hashref. + + my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot ); + + $exon_max = 0; + $exon_min = 9999999999; + $intron_max = 0; + $intron_min = 9999999999; + + $entry->{ "EXONS" } = $entry->{ "BLOCKCOUNT" }; + + @begs = split /,/, $entry->{ "Q_BEGS" }; + @lens = split /,/, $entry->{ "BLOCKSIZES" }; + + for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ ) + { + $exon_len = @lens[ $i ]; + + $entry->{ "EXON_LEN_$i" } = $exon_len; + + $exon_max = $exon_len if $exon_len > $exon_max; + $exon_min = $exon_len if $exon_len < $exon_min; + + $exon_tot += $exon_len; + } + + $entry->{ "EXON_LEN_-1" } = $exon_len; + $entry->{ "EXON_MAX_LEN" } = $exon_max; + $entry->{ "EXON_MIN_LEN" } = $exon_min; + $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } ); + + $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1; + $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0; + + if ( $entry->{ "INTRONS" } ) + { + for ( $i = 1; $i < 
$entry->{ "BLOCKCOUNT" }; $i++ ) + { + $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] ); + + $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len; + + $intron_max = $intron_len if $intron_len > $intron_max; + $intron_min = $intron_len if $intron_len < $intron_min; + + $intron_tot += $intron_len; + } + + $entry->{ "INTRON_LEN_-1" } = $intron_len; + $entry->{ "INTRON_MAX_LEN" } = $intron_max; + $entry->{ "INTRON_MIN_LEN" } = $intron_min; + $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } ); + } + + return wantarray ? %{ $entry } : $entry; +} + + +sub bed_sort +{ + # Martin A. Hansen, March 2008. + + # Sort a potential huge BED file according to + # CHR, CHR_BEG and optionally STRAND. + + my ( $tmp_dir, # temporary directory used for sorting + $file, # BED file to sort + $strand, # flag to sort on strand - OPTIONAL + ) = @_; + + # Returns nothing. + + my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries ); + + $fh_in = &Maasha::Common::read_open( $file ); + + while ( $entry = &bed_get_entry( $fh_in ) ) + { + if ( $strand ) { + $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" }; + } else { + $key = $entry->{ "CHR" }; + } + + $fh_hash{ $key } = &Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key }; + + &bed_put_entry( $entry, $fh_hash{ $key } ); + } + + close $fh_in; + + map { close $_ } keys %fh_hash; + + $fh_out = &Maasha::Common::write_open( "$tmp_dir/temp.sort" ); + + foreach $part_file ( sort keys %fh_hash ) + { + $entries = &bed_get_entries( "$tmp_dir/$part_file.sort" ); + + @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries }; + + map { &bed_put_entry( $_, $fh_out ) } @{ $entries }; + + unlink "$tmp_dir/$part_file.sort"; + } + + close $fh_out; + + rename "$tmp_dir/temp.sort", $file; +} + + +sub bed_merge_entries +{ + # Martin A. Hansen, February 2008. + + # Merge a list of given BED entries in one big entry. 
+ + my ( $entries, # list of BED entries to be merged + ) = @_; + + # Returns hash. + + my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry ); + + @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries }; + + for ( $i = 0; $i < @{ $entries }; $i++ ) + { + &Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" } ne $entries->[ $i ]->{ "CHR" }; + &Maasha::Common::error( qq(Attempted merge of BED entries from different strands) ) if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" }; + + push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i ); + + if ( exists $entries->[ $i ]->{ "Q_BEGS" } ) + { + @q_begs = split ",", $entries->[ $i ]->{ "Q_BEGS" }; + @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" }; + } + else + { + @q_begs = 0; + @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1; + } + + map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs; + + push @new_q_begs, @q_begs; + push @new_blocksizes, @blocksizes; + } + + map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs; + + %new_entry = ( + CHR => $entries->[ 0 ]->{ "CHR" }, + CHR_BEG => $entries->[ 0 ]->{ "CHR_BEG" }, + CHR_END => $entries->[ -1 ]->{ "CHR_END" }, + REC_TYPE => "BED", + BED_LEN => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1, + BED_COLS => 12, + Q_ID => join( ":", @q_ids ), + SCORE => 999, + STRAND => $entries->[ 0 ]->{ "STRAND" } || "+", + THICK_BEG => $entries->[ 0 ]->{ "THICK_BEG" } || $entries->[ 0 ]->{ "CHR_BEG" }, + THICK_END => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" }, + ITEMRGB => "0,0,0", + BLOCKCOUNT => scalar @new_q_begs, + BLOCKSIZES => join( ",", @new_blocksizes ), + Q_BEGS => join( ",", @new_q_begs ), + ); + + return wantarray ? %new_entry : \%new_entry; +} + + +sub bed_split_entry +{ + # Martin A. Hansen, February 2008. 
+ + # Splits a given BED entry into a list of blocks, + # which are returned. A list of 6-column BED entries is returned. + + my ( $entry, # BED entry hashref + ) = @_; + + # Returns a list. + + my ( @q_begs, @blocksizes, $block, @blocks, $i ); + + if ( exists $entry->{ "BLOCKCOUNT" } ) + { + @q_begs = split ",", $entry->{ "Q_BEGS" }; + @blocksizes = split ",", $entry->{ "BLOCKSIZES" }; + + for ( $i = 0; $i < @q_begs; $i++ ) + { + undef $block; + + $block->{ "CHR" } = $entry->{ "CHR" }; + $block->{ "CHR_BEG" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ]; + $block->{ "CHR_END" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1; + $block->{ "Q_ID" } = $entry->{ "Q_ID" } . sprintf( "_%03d", $i ); + $block->{ "SCORE" } = $entry->{ "SCORE" }; + $block->{ "STRAND" } = $entry->{ "STRAND" }; + $block->{ "BED_LEN" } = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1, + $block->{ "BED_COLS" } = 6; + $block->{ "REC_TYPE" } = "BED"; + + push @blocks, $block; + } + } + else + { + @blocks = @{ $entry }; + } + + return wantarray ? @blocks : \@blocks; +} + + + +sub bed_overlap +{ + # Martin A. Hansen, February 2008. + + # Checks if two BED entries overlap and + # returns 1 if so - else 0; + + my ( $entry1, # hashref + $entry2, # hashref + $no_strand, # don't check strand flag - OPTIONAL + ) = @_; + + # Returns boolean. + + return 0 if $entry1->{ "CHR" } ne $entry2->{ "CHR" }; + return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" }; + + if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) { + return 0; + } else { + return 1; + } +} + + +sub bed_upload_to_ucsc +{ + # Martin A. Hansen, September 2007. + + # Upload a BED file to the UCSC database. + + my ( $tmp_dir, # temporary directory + $file, # file to upload, + $options, # argument hashref + $append, # flag indicating table should be appended + ) = @_; + + # Returns nothing.
+ + my ( $args, $table, $sql_file, $fh_out, $fh_in ); + + if ( $append ) { + $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file; + } else { + $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file; + } + + if ( $options->{ "sec_struct" } ) + { + $table = $options->{ "table" }; + + &Maasha::Common::error( "Attempt to load secondary structure track without 'rnaSecStr' in table name" ) if not $table =~ /rnaSecStr/; + + $sql_file = "$tmp_dir/upload_RNA_SS.sql"; + + $fh_out = &Maasha::Common::write_open( $sql_file ); + + print $fh_out qq( +CREATE TABLE $table ( + bin smallint not null, # Bin number for browser speedup + chrom varchar(255) not null, # Chromosome or FPC contig + chromStart int unsigned not null, # Start position in chromosome + chromEnd int unsigned not null, # End position in chromosome + name varchar(255) not null, # Name of item + score int unsigned not null, # Score from 0-1000 + strand char(1) not null, # + or - + size int unsigned not null, # Size of element. + secStr longblob not null, # Parentheses and '.'s which define the secondary structure + conf longblob not null, # Confidence of secondary-structure annotation per position (0.0-1.0). + #Indices + INDEX(name(16)), + INDEX(chrom(8), bin), + INDEX(chrom(8), chromStart) +); + ); + + close $fh_out; + + &Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" ); + + unlink $sql_file; + } + else + { + &Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" ); + } +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub psl_get_entries +{ + # Martin A. Hansen, February 2008. + + # Reads PSL entries and returns a record. + + my ( $path, # full path to PSL file + ) = @_; + + # Returns hashref. 
+ + my ( $fh, @lines, @fields, $i, %record, @records ); + + $fh = &Maasha::Common::read_open( $path ); + + @lines = <$fh>; + + close $fh; + + chomp @lines; + + for ( $i = 5; $i < @lines; $i++ ) + { + @fields = split "\t", $lines[ $i ]; + + &Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21; + + undef %record; + + %record = ( + REC_TYPE => "PSL", + MATCHES => $fields[ 0 ], + MISMATCHES => $fields[ 1 ], + REPMATCHES => $fields[ 2 ], + NCOUNT => $fields[ 3 ], + QNUMINSERT => $fields[ 4 ], + QBASEINSERT => $fields[ 5 ], + SNUMINSERT => $fields[ 6 ], + SBASEINSERT => $fields[ 7 ], + STRAND => $fields[ 8 ], + Q_ID => $fields[ 9 ], + Q_LEN => $fields[ 10 ], + Q_BEG => $fields[ 11 ], + Q_END => $fields[ 12 ] - 1, + S_ID => $fields[ 13 ], + S_LEN => $fields[ 14 ], + S_BEG => $fields[ 15 ], + S_END => $fields[ 16 ] - 1, + BLOCKCOUNT => $fields[ 17 ], + BLOCKSIZES => $fields[ 18 ], + Q_BEGS => $fields[ 19 ], + S_BEGS => $fields[ 20 ], + ); + + $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" }; + + push @records, { %record }; + } + + return wantarray ? @records : \@records; +} + + +sub psl_put_header +{ + # Martin A. Hansen, September 2007. + + # Write a PSL header to file. + + my ( $fh, # file handle - OPTIONAL + ) = @_; + + # Returns nothing. + + $fh = \*STDOUT if not $fh; + + print $fh qq(psLayout version 3 +match mis- rep. N's Q gap Q gap T gap T gap strand Q Q Q Q T T T T block blockSizes qStart match match count bases count bases name size start end name size start end count +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +); +} + + +sub psl_put_entry +{ + # Martin A. Hansen, September 2007. + + # Write a PSL entry to file. + + my ( $record, # hashref + $fh, # file handle - OPTIONAL + ) = @_; + + # Returns nothing. 
+ + $fh = \*STDOUT if not $fh; + + my @output; + + push @output, $record->{ "MATCHES" }; + push @output, $record->{ "MISMATCHES" }; + push @output, $record->{ "REPMATCHES" }; + push @output, $record->{ "NCOUNT" }; + push @output, $record->{ "QNUMINSERT" }; + push @output, $record->{ "QBASEINSERT" }; + push @output, $record->{ "SNUMINSERT" }; + push @output, $record->{ "SBASEINSERT" }; + push @output, $record->{ "STRAND" }; + push @output, $record->{ "Q_ID" }; + push @output, $record->{ "Q_LEN" }; + push @output, $record->{ "Q_BEG" }; + push @output, $record->{ "Q_END" } + 1; + push @output, $record->{ "S_ID" }; + push @output, $record->{ "S_LEN" }; + push @output, $record->{ "S_BEG" }; + push @output, $record->{ "S_END" } + 1; + push @output, $record->{ "BLOCKCOUNT" }; + push @output, $record->{ "BLOCKSIZES" }; + push @output, $record->{ "Q_BEGS" }; + push @output, $record->{ "S_BEGS" }; + + print $fh join( "\t", @output ), "\n"; +} + + +sub psl_upload_to_ucsc +{ + # Martin A. Hansen, September 2007. + + # Upload a PSL file to the UCSC database. + + my ( $file, # file to upload, + $options, # argument hashref + $append, # flag indicating table should be appended + ) = @_; + + # Returns nothing. + + my ( $args ); + + if ( $append ) { + $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file; + } else { + $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file; + } + + &Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub update_my_tracks +{ + # Martin A. Hansen, September 2007. + + # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl + + my ( $options, # hashref + $type, # track type + ) = @_; + + # Returns nothing. + + my ( $file, $fh_in, $fh_out, $line, $time ); + + $file = $ENV{ "HOME" } . 
"/ucsc/my_tracks.ra"; + + # ---- create a backup ---- + + $fh_in = &Maasha::Common::read_open( $file ); + $fh_out = &Maasha::Common::write_open( "$file~" ); + + while ( $line = <$fh_in> ) { + print $fh_out $line; + } + + close $fh_in; + close $fh_out; + + # ---- append track ---- + + $time = &Maasha::Common::time_stamp(); + + $fh_out = &Maasha::Common::append_open( $file ); + + if ( $type eq "sec_struct" ) + { + print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; + + print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n"; + + print $fh_out "# Database $options->{ 'database' }\n\n"; + + print $fh_out "track $options->{ 'table' }\n"; + print $fh_out "shortLabel $options->{ 'short_label' }\n"; + print $fh_out "longLabel $options->{ 'long_label' }\n"; + print $fh_out "group $options->{ 'group' }\n"; + print $fh_out "priority $options->{ 'priority' }\n"; + print $fh_out "visibility $options->{ 'visibility' }\n"; + print $fh_out "color $options->{ 'color' }\n"; + print $fh_out "type bed 6 +\n"; + print $fh_out "mafTrack multiz17way\n"; + + print $fh_out "\n# //\n"; + } + else + { + print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; + + print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n"; + + print $fh_out "# Database $options->{ 'database' }\n\n"; + + print $fh_out "track $options->{ 'table' }\n"; + print $fh_out "shortLabel $options->{ 'short_label' }\n"; + print $fh_out "longLabel $options->{ 'long_label' }\n"; + print $fh_out "group $options->{ 'group' }\n"; + print $fh_out "priority $options->{ 'priority' }\n"; + print $fh_out "useScore 1\n" if $options->{ 'use_score' }; + print $fh_out "visibility $options->{ 'visibility' }\n"; + print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0"; + print $fh_out "color $options->{ 'color' }\n"; + print $fh_out "type $type\n"; + + print $fh_out "\n# //\n"; + } + + close $fh_out; + + &Maasha::Common::run( 
"ucscMakeTracks.pl", "-b > /dev/null 2>&1" ); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub phastcons_get_entry +{ + # Martin A. Hansen, December 2007. + + # Given a file handle to a PhastCons file get the + # next entry which is all the lines after a "fixedStep" + # line and until the next "fixedStep" line or EOF. + + my ( $fh, # filehandle + ) = @_; + + # Returns a list of lines + + my ( $entry, @lines ); + + local $/ = "\nfixedStep "; + + $entry = <$fh>; + + chomp $entry; + + @lines = split "\n", $entry; + + return if @lines == 0; + + $lines[ 0 ] =~ s/fixedStep?\s*//; + + return wantarray ? @lines : \@lines; +} + + +sub phastcons_parse_entry +{ + # Martin A. Hansen, December 2007. + + # Given a PhastCons entry converts this to a + # list of super blocks. + + my ( $lines, # list of lines + $args, # argument hash + ) = @_; + + # Returns + + my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs ); + + $info = shift @{ $lines }; + + if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ ) + { + $chr = $1; + $beg = $2; + $step = $3; + + die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ... 
+ } + + $i = 0; + + while ( $i < @{ $lines } ) + { + if ( $lines->[ $i ] >= $args->{ "threshold" } ) + { + $c = $i + 1; + + while ( $c < @{ $lines } ) + { + if ( $lines->[ $c ] < $args->{ "threshold" } ) + { + $j = $c + 1; + + while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) { + $j++; + } + + if ( $j - $c > $args->{ "gap" } ) + { + if ( $c - $i >= $args->{ "min" } ) + { + push @blocks, { + CHR => $chr, + CHR_BEG => $beg + $i - 1, + CHR_END => $beg + $c - 2, + CHR_LEN => $c - $i, + }; + } + + $i = $j; + + last; + } + + $c = $j + } + else + { + $c++; + } + } + + if ( $c - $i >= $args->{ "min" } ) + { + push @blocks, { + CHR => $chr, + CHR_BEG => $beg + $i - 1, + CHR_END => $beg + $c - 2, + CHR_LEN => $c - $i, + }; + } + + $i = $c; + } + else + { + $i++; + } + } + + $i = 0; + + while ( $i < @blocks ) + { + $c = $i + 1; + + while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } ) + { + $c++; + } + + push @super_blocks, [ @blocks[ $i .. $c - 1 ] ]; + + $i = $c; + } + + foreach $super_block ( @super_blocks ) + { + foreach $block ( @{ $super_block } ) + { + push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" }; + push @lens, $block->{ "CHR_LEN" } - 1; + } + + $lens[ -1 ]++; + + push @entries, { + CHR => $super_block->[ 0 ]->{ "CHR" }, + CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, + CHR_END => $super_block->[ -1 ]->{ "CHR_END" }, + Q_ID => "Q_ID", + SCORE => 100, + STRAND => "+", + THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" }, + THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1, + ITEMRGB => "0,200,100", + BLOCKCOUNT => scalar @{ $super_block }, + BLOCKSIZES => join( ",", @lens ), + Q_BEGS => join( ",", @begs ), + }; + + undef @begs; + undef @lens; + } + + return wantarray ? @entries : \@entries; +} + + +sub phastcons_index_create +{ + # Martin A. Hansen, January 2008. + + # Indexes a concatenated PhastCons file. 
+ # The index consists of a hash with chromosomes as keys, + # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values. + + my ( $path, # path to PhastCons file + ) = @_; + + # Returns a hashref + + my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i ); + + $fh = &Maasha::Common::read_open( $path ); + + $pos = 0; + + while ( $entry = &Maasha::UCSC::phastcons_get_entry( $fh ) ) + { + $locator = shift @{ $entry }; + + if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ ) + { + $chr = $1; + $beg = $2 - 1; # phastcons files are 1-based + $step = $3; + } + else + { + &Maasha::Common::error( qq(Could not parse PhastCons locator: $locator) ); + } + + $pos += length( $locator ) + 11; + + $index_beg = $pos; + +# map { $pos += length( $_ ) + 1 } @{ $entry }; + + $pos += 6 * scalar @{ $entry }; + + $index_len = $pos - $index_beg; + + push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ]; + } + + close $fh; + + foreach $chr ( keys %index ) + { + for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) { + $index{ $chr }->[ $i ]->[ NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ]; + } + + $index{ $chr }->[ -1 ]->[ NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ CHR_END ] + 1; + } + + return wantarray ? %index : \%index; +} + + +sub phastcons_index_store +{ + # Martin A. Hansen, January 2008. + + # Writes a PhastCons index to binary file. + + my ( $path, # full path to file + $index, # list with index + ) = @_; + + # returns nothing + + &Maasha::Common::file_store( $path, $index ); +} + + +sub phastcons_index_retrieve +{ + # Martin A. Hansen, January 2008. + + # Retrieves a PhastCons index from binary file. + + my ( $path, # full path to file + ) = @_; + + # returns list + + my $index; + + $index = &Maasha::Common::file_retrieve( $path ); + + return wantarray ? %{ $index } : $index; +} + + +sub phastcons_index_lookup +{ + # Martin A. Hansen, January 2008. 
+ + # Retrieve PhastCons scores from an indexed + # PhastCons file given a chromosome and + # begin and end positions. + + my ( $index, # data structure + $fh, # filehandle to datafile + $chr, # chromosome + $chr_beg, # chromosome beg + $chr_end, # chromosome end + $flank, # include flanking region - OPTIONAL + ) = @_; + + # Returns a list + + my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores ); + + $flank ||= 0; + + $chr_beg -= $flank; + $chr_end += $flank; + +# print "chr_beg->$chr_beg chr_end->$chr_end flank->$flank\n"; + + if ( exists $index->{ $chr } ) + { + $index_beg = &Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg ); + + if ( $index_beg < 0 ) { + &Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) ); + } + + if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] ) + { + $index_end = $index_beg; + } + else + { + $index_end = &Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end ); + + if ( $index_end < 0 ) { + &Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) ); + } + } + + map { $scores->[ $_ ] = 0 } 0 ..
$chr_end - $chr_beg; + + if ( $index_beg == $index_end ) + { + $beg = &Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ); + $end = &Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ CHR_END ] ); + + if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ) + { + @vals = split "\n", &Maasha::Common::file_read( + $fh, + $index->{ $chr }->[ $index_beg ]->[ INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ), + 6 * ( $end - $beg + 1 ), + ); + } + + for ( $c = 0; $c < @vals; $c++ ) { + $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; + } + } + else + { + $beg = &Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ); + +# print Dumper( $beg, $index->{ $chr }->[ $index_beg ] ); +# print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ NEXT_CHR_BEG ] ); + + # beg next + # v v + # |||||||||....... + + if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ CHR_END ] ) + { + @vals = split "\n", &Maasha::Common::file_read( + $fh, + $index->{ $chr }->[ $index_beg ]->[ INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ), + 6 * ( $index->{ $chr }->[ $index_beg ]->[ CHR_END ] - $beg + 1 ), + ); + + for ( $c = 0; $c < @vals; $c++ ) { + $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; + } + } + + $end = &Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ CHR_END ] ); + + if ( $end <= $index->{ $chr }->[ $index_end ]->[ CHR_END ] ) + { + @vals = split "\n", &Maasha::Common::file_read( + $fh, + $index->{ $chr }->[ $index_end ]->[ INDEX_BEG ], + 6 * ( $end - $index->{ $chr }->[ $index_end ]->[ CHR_BEG ] + 1 ), + ); + + for ( $c = 0; $c < @vals; $c++ ) { + $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ CHR_BEG ] - $chr_beg ] = $vals[ $c ]; + } + } + + for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ ) + { + @vals = split "\n", &Maasha::Common::file_read( + $fh, + $index->{ $chr }->[ $i ]->[ 
INDEX_BEG ], + 6 * ( $index->{ $chr }->[ $i ]->[ CHR_END ] - $index->{ $chr }->[ $i ]->[ CHR_BEG ] + 1 ), + ); + + for ( $c = 0; $c < @vals; $c++ ) { + $scores->[ $c + $index->{ $chr }->[ $i ]->[ CHR_BEG ] - $chr_beg ] = $vals[ $c ]; + } + } + } + } + else + { + &Maasha::Common::error( qq(Chromosome "$chr" was not found in index) ); + } + + return wantarray ? @{ $scores } : $scores; +} + + +sub phastcons_normalize +{ + # Martin A. Hansen, January 2008. + + # Normalizes a list of lists with PhastCons scores, + # in such a way that each list contains the same number + # of PhastCons scores. + + my ( $AoA, # AoA with PhastCons scores + ) = @_; + + # Returns AoA. + + my ( $list, $max, $min, $mean, $diff ); + + $min = 99999999; + $max = 0; + + foreach $list ( @{ $AoA } ) + { + $min = scalar @{ $list } if scalar @{ $list } < $min; + $max = scalar @{ $list } if scalar @{ $list } > $max; + } + + $mean = int( ( $min + $max ) / 2 ); + +# print STDERR "min->$min max->$max mean->$mean\n"; + + foreach $list ( @{ $AoA } ) + { + $diff = scalar @{ $list } - $mean; + + &phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0; + &phastcons_list_deflate( $list, $diff ) if $diff > 0; + } + + return wantarray ? @{ $AoA } : $AoA; +} + + +sub phastcons_list_inflate +{ + # Martin A. Hansen, January 2008. + + # Inflates a list with a given number of elements + # in such a way that the extra elements are introduced + # evenly over the entire length of the list. The value + # of the extra elements is based on a mean of the + # adjacent elements.
+ + my ( $list, # list of elements + $diff, # number of elements to introduce + ) = @_; + + # Returns nothing + + my ( $len, $space, $i, $pos ); + + $len = scalar @{ $list }; + + $space = $len / $diff; + + for ( $i = 0; $i < $diff; $i++ ) + { + $pos = int( ( $space / 2 ) + $i * $space ); + + splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2; + # splice @{ $list }, $pos, 0, "X"; + } + + die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff; +} + + +sub phastcons_list_deflate +{ + # Martin A. Hansen, January 2008. + + # Deflates a list by removing a given number of elements + # evenly distributed over the entire list. + + my ( $list, # list of elements + $diff, # number of elements to remove + ) = @_; + + # Returns nothing + + my ( $len, $space, $i, $pos ); + + $len = scalar @{ $list }; + + $space = ( $len - $diff ) / $diff; + + for ( $i = 0; $i < $diff; $i++ ) + { + $pos = int( ( $space / 2 ) + $i * $space ); + + splice @{ $list }, $pos, 1; + } + + die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff; +} + + +sub phastcons_mean +{ + # Martin A. Hansen, January 2008. + + # Given a normalized PhastCons matrix in an AoA, + # calculate the mean for each column and return as a list. + + my ( $AoA, # AoA with normalized PhastCons scores + ) = @_; + + # Returns a list + + my ( @list ); + + $AoA = &Maasha::Matrix::matrix_flip( $AoA ); + + map { push @list, &Maasha::Calc::mean( $_ ) } @{ $AoA }; + + return wantarray ? @list : \@list; +} + + +sub phastcons_median +{ + # Martin A. Hansen, January 2008. + + # Given a normalized PhastCons matrix in an AoA, + # calculate the median for each column and return as a list. + + my ( $AoA, # AoA with normalized PhastCons scores + ) = @_; + + # Returns a list + + my ( @list ); + + $AoA = &Maasha::Matrix::matrix_flip( $AoA ); + + map { push @list, &Maasha::Calc::median( $_ ) } @{ $AoA }; + + return wantarray ? 
@list : \@list; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub maf_extract +{ + # Martin A. Hansen, April 2008. + + # Executes mafFrag to extract a subalignment from a multiz track + # in the UCSC genome browser database. + + my ( $tmp_dir, # temporary directory + $database, # genome database + $table, # table with the multiz track + $chr, # chromosome + $beg, # begin position + $end, # end position + $strand, # strand + ) = @_; + + # Returns a list of record + + my ( $tmp_file, $align ); + + $tmp_file = "$tmp_dir/maf_extract.maf"; + + &Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" ); + + $align = &maf_parse( $tmp_file ); + + unlink $tmp_file; + + return wantarray ? @{ $align } : $align; +} + + +sub maf_parse +{ + # Martin A. Hansen, April 2008. + + + my ( $path, # full path to MAF file + ) = @_; + + # Returns a list of record. + + my ( $fh, $line, @fields, @align ); + + $fh = &Maasha::Common::read_open( $path ); + + while ( $line = <$fh> ) + { + chomp $line; + + if ( $line =~ /^s/ ) + { + @fields = split / /, $line; + + push @align, { + SEQ_NAME => $fields[ 1 ], + SEQ => $fields[ -1 ], + ALIGN => 1, + ALIGN_LEN => length $fields[ -1 ], + } + } + } + + close $fh; + + return wantarray ? @align : \@align; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub fixedstep_put_entry +{ + # Martin A. Hansen, April 2008. + + # Outputs a block of fixedStep values. + # Used for outputting wiggle data. + + my ( $chr, # chromosome + $beg, # start position + $block, # list of scores + $fh, # filehandle - OPTIONAL + ) = @_; + + # Returns nothing. + + $beg += 1; # fixedStep format is 1 based. 
+ + if ( $fh ) + { + print $fh "fixedStep chrom=$chr start=$beg step=1\n"; + + map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block }; + } + else + { + print "fixedStep chrom=$chr start=$beg step=1\n"; + + map { printf( "%d\n", ( $_ + 1 ) ) } @{ $block }; + } +} + + +sub wiggle_upload_to_ucsc +{ + # Martin A. Hansen, May 2008. + + # Upload a wiggle file to the UCSC database. + + my ( $tmp_dir, # temporary directory + $wib_dir, # wib directory + $wig_file, # file to upload, + $options, # argument hashref + ) = @_; + + # Returns nothing. + + my ( $args ); + +# $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file; + +# &Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" ); + + `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub ucsc_get_user +{ + # Martin A. Hansen, May 2008 + + # Fetches the MySQL database user name from the + # .hg.conf file in the users home directory. + + # Returns a string. + + my ( $fh, $line, $user ); + + $fh = &Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" ); + + while ( $line = <$fh> ) + { + chomp $line; + + if ( $line =~ /^db\.user=(.+)/ ) + { + $user = $1; + + last; + } + } + + close $fh; + + return $user; +} + + +sub ucsc_get_password +{ + # Martin A. Hansen, May 2008 + + # Fetches the MySQL database password from the + # .hg.conf file in the users home directory. + + # Returns a string. 
+ + my ( $fh, $line, $password ); + + $fh = &Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" ); + + while ( $line = <$fh> ) + { + chomp $line; + + if ( $line =~ /^db\.password=(.+)/ ) + { + $password = $1; + + last; + } + } + + close $fh; + + return $password; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ + diff --git a/code_perl/README b/code_perl/README new file mode 100644 index 0000000..6fe1107 --- /dev/null +++ b/code_perl/README @@ -0,0 +1,18 @@ +New developers should add their own subdirectory here for their own Perl modules: + + ../biopieces/code_perl/Maasha # this is my Perl modules directory. + ../biopieces/code_perl/<your_name> # this could be your new modules directory. + +After creating your new subdirectory you can write some Perl modules and put them there: + ../biopieces/code_perl/<your_name>/<module_1>.pm + ../biopieces/code_perl/<your_name>/<module_2>.pm + ... + +Finally, add your new Perl modules directory to the PERL5LIB path in the configuration: + ../biopieces/bp_conf/bashrc # look inside this file for information on how to do that. + + +All done. + + +Martin A. Hansen, July 2008.