From aa2eb787be61d57de1fc22bf34c506b8c28485ac Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 1 Jul 2008 02:56:37 +0000 Subject: [PATCH] added tile_seq.wiki git-svn-id: http://biopieces.googlecode.com/svn/trunk@89 74ccb610-7750-0410-82ae-013aeee3265d --- bp_usage/rename_keys | 18 ---- bp_usage/rename_keys.wiki | 66 ++++++++++++++ bp_usage/tile_seq | 19 ---- bp_usage/tile_seq.wiki | 187 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 253 insertions(+), 37 deletions(-) delete mode 100644 bp_usage/rename_keys create mode 100644 bp_usage/rename_keys.wiki delete mode 100644 bp_usage/tile_seq create mode 100644 bp_usage/tile_seq.wiki diff --git a/bp_usage/rename_keys b/bp_usage/rename_keys deleted file mode 100644 index 15c2e17..0000000 --- a/bp_usage/rename_keys +++ /dev/null @@ -1,18 +0,0 @@ -Author: Martin Asser Hansen - Copyright (C) - All rights reserved - -Contact: mail@maasha.dk - -Date: August 2007 - -License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) - -Description: Rename keys in stream. - -Usage: ... | rename_keys [options] - -Options: [-k | --keys=] - Keys to find and replace. -Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN -Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT - -Examples: ... | rename_keys -k PATTERN_LEN,HIT_LEN - Renames PATTERN_LEN key to HIT_LEN. - diff --git a/bp_usage/rename_keys.wiki b/bp_usage/rename_keys.wiki new file mode 100644 index 0000000..0edf138 --- /dev/null +++ b/bp_usage/rename_keys.wiki @@ -0,0 +1,66 @@ +=Biopiece: rename_keys= + +==Synopsis== + +Rename keys of records in stream. + +==Description== + +Sometimes it is necessary to rename record keys to allow biopieces, which require +specific record keys, to operate on the records. This could e.g. be BLAST records +where there is both a subject ID (S_ID) and a subject sequence (S_SEQ) as well as +a query ID (Q_ID) and a query sequence (Q_SEQ). 
If you want to write either the +query sequence or the subject sequence as FASTA output, you will have to rename the +record keys accordingly. + +==Usage== + +{{{ +... | rename_keys [options] +}}} + +==Options== + +{{{ +[-k | --keys=] - Keys to find and replace. +[-I | --stream_in=] - Read input from stream file - Default=STDIN +[-O | --stream_out=] - Write output to stream file - Default=STDOUT +}}} + +==Examples== + +To rename all Q_ID record keys to SEQ_NAME do: + +{{{ +... | rename_keys -k Q_ID,SEQ_NAME +}}} + +If you need to rename more than one key, then pipe the stream through [rename_keys] twice: + +{{{ +... | rename_keys -k Q_ID,SEQ_NAME | rename_keys -k Q_SEQ,SEQ +}}} + +==See also== + +[blast_seq] + +==Author== + +Martin Asser Hansen - Copyright (C) - All rights reserved. + +mail@maasha.dk + +August 2007 + +==License== + +GNU General Public License version 2 + +http://www.gnu.org/copyleft/gpl.html + +==Help== + +[rename_keys] is part of the Biopieces framework. + +http://code.google.com/p/biopieces/ diff --git a/bp_usage/tile_seq b/bp_usage/tile_seq deleted file mode 100644 index 0c48126..0000000 --- a/bp_usage/tile_seq +++ /dev/null @@ -1,19 +0,0 @@ -Author: Martin Asser Hansen - Copyright (C) - All rights reserved - -Contact: mail@maasha.dk - -Date: February 2008 - -License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) - -Description: Using the first sequence in stream as reference, tile all subsequent sequences based on pairwise alignments. - -Usage: ... | $script [options] - -Options: [-i | --identity=] - Minimum identity (%) for pairwise alignment - Default=70 -Options: [-s | --supress_indels] - Supress insertions in query sequence. -Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN -Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT - -Examples: ... | $script -i 75 - Tile all sequences in stream that have a similarity higher than 75%. 
- diff --git a/bp_usage/tile_seq.wiki b/bp_usage/tile_seq.wiki new file mode 100644 index 0000000..6d4cbf6 --- /dev/null +++ b/bp_usage/tile_seq.wiki @@ -0,0 +1,187 @@ +=Biopiece: tile_seq= + +==Synopsis== + +Using the first sequence in the stream as reference, tile all subsequent sequences +based on pairwise alignments. + +==Description== + +[tile_seq] can create an alignment of several sequences based on pairwise alignments. +This is useful for e.g. matching short sequences such as ESTs or deep sequencing reads +against a reference sequence. [tile_seq] is more precise than a multiple alignment, where +the introduction of indels in the reference sequence will most likely ruin the alignment. +Also, [tile_seq] is capable of dealing with thousands of sequences. + +[tile_seq] currently uses Muscle as alignment engine, and Muscle must be installed in +order for [tile_seq] to work. + +For more about Muscle: + +http://www.drive5.com/muscle/ + +==Usage== + +{{{ +... | tile_seq [options] +}}} + +==Options== + +{{{ +[-i | --identity=] - Minimum identity (%) for pairwise alignment - Default=70 +[-s | --supress_indels] - Suppress insertions in query sequence. +[-I | --stream_in=] - Read input from stream file - Default=STDIN +[-O | --stream_out=] - Write output to stream file - Default=STDOUT +}}} + +==Examples== + +Consider the following file `test.fna` containing these FASTA entries: + +{{{ +>ref +ACGACTAGCATCGACTGACA +>test1 +CTAGCTTCGACT +>test2 +GAATCGACT +>test3 +ACGAAACTAGCATC +>test4 +AGCATCGACT +>test5 +TAACAGGCACT +}}} + +In order to tile the test1, test2 ... 
test5 sequences against the reference sequence, +first read in the sequences using [read_fasta] and then pipe through [tile_seq]: + +{{{ +read_fasta -i test.fna | tile_seq + +SEQ: ACGACTAGCATCGACTGACA +SEQ_NAME: ref +--- +SEQ: ACGAAACTAGCATC------ +SEQ_NAME: test3_+_85.71 +--- +SEQ: ----CTAGCTTCGACT---- +SEQ_NAME: test1_+_91.67 +--- +SEQ: ------AGCATCGACT---- +SEQ_NAME: test4_+_100.00 +--- +SEQ: -------GAATCGACT---- +SEQ_NAME: test2_+_88.89 +--- +}}} + +The resulting tiled sequences show the reference sequence as the first sequence, and then +the subsequent sequences sorted alphabetically by the sequence itself, thus giving the +tiled output. Two pieces of information are added to the SEQ_NAME key, namely the orientation +of the pairwise alignment that gave the highest similarity, and a global identity score that +is calculated as the number of matches over the length of the shortest sequence in the pairwise +alignment. Use the `-i` switch to change the identity cutoff for the inclusion of alignments: + +{{{ +read_fasta -i test.fna | tile_seq -i 60 + +SEQ: ACGACTAGCATCGACTGACA +SEQ_NAME: ref +--- +SEQ: ACGAAACTAGCATC------ +SEQ_NAME: test3_+_85.71 +--- +SEQ: ----CTAGCTTCGACT---- +SEQ_NAME: test1_+_91.67 +--- +SEQ: -----TAACAGGCACT---- +SEQ_NAME: test5_+_63.64 +--- +SEQ: ------AGCATCGACT---- +SEQ_NAME: test4_+_100.00 +--- +SEQ: -------GAATCGACT---- +SEQ_NAME: test2_+_88.89 +--- +}}} + +Now test5 is part of the alignment, and the tiled sequences can be written using [write_align]: + +{{{ +read_fasta -i test.fna | tile_seq -i 60 | write_align -x + + . . +ref ACGACTAGCATCGACTGACA +test3_+_85.71 ACGAAACTAGCATC------ +test1_+_91.67 ----CTAGCTTCGACT---- +test5_+_63.64 -----TAACAGGCACT---- +test4_+_100.00 ------AGCATCGACT---- +test2_+_88.89 -------GAATCGACT---- +Consensus: 50% -------------ACT---- +}}} + +To better illustrate mismatches in the alignment use [invert_align]: + +{{{ +read_fasta -i test.fna | tile_seq -i 60 | invert_align | write_align -x + + . . 
+ref ACGACTAGCATCGACTGACA +test3_+_85.71 ----AACTAGCATC______ +test1_+_91.67 ____-----T------____ +test5_+_63.64 _____--A--GGC---____ +test4_+_100.00 ______----------____ +test2_+_88.89 _______-A-------____ +Consensus: 50% -------------------- +}}} + +Now we clearly see that an insertion in test3 offsets the alignment. This behaviour can be +suppressed using the `-s` switch to [tile_seq]: + +{{{ +read_fasta -i test.fna | tile_seq -i 60 -s | invert_align | write_align -x + + . . +ref ACGACTAGCATCGACTGACA +test3_+_100.00 ------------________ +test1_+_91.67 ____-----T------____ +test5_+_63.64 _____--A--GGC---____ +test4_+_100.00 ______----------____ +test2_+_88.89 _______-A-------____ +Consensus: 50% -------------------- +}}} + +Note that the identity score of test3 changes dramatically with the use of the `-s` switch. + +==See also== + +[read_fasta] + +[invert_align] + +[write_align] + +[align_seq] + +[write_fasta] + +==Author== + +Martin Asser Hansen - Copyright (C) - All rights reserved. + +mail@maasha.dk + +August 2007 + +==License== + +GNU General Public License version 2 + +http://www.gnu.org/copyleft/gpl.html + +==Help== + +[tile_seq] is part of the Biopieces framework. + +http://code.google.com/p/biopieces/ -- 2.39.5