From 0d89b8deac1cfe90a5246ef634d217ebcf15d3f7 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 1 Jul 2008 06:23:56 +0000 Subject: [PATCH] more wikis git-svn-id: http://biopieces.googlecode.com/svn/trunk@93 74ccb610-7750-0410-82ae-013aeee3265d --- bp_usage/analyze_seq | 26 -------- bp_usage/analyze_seq.wiki | 132 +++++++++++++++++++++++++++++++++++++ bp_usage/count_vals | 18 ----- bp_usage/count_vals.wiki | 120 +++++++++++++++++++++++++++++++++ bp_usage/uniq_vals | 19 ------ bp_usage/uniq_vals.wiki | 135 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 387 insertions(+), 63 deletions(-) delete mode 100644 bp_usage/analyze_seq create mode 100644 bp_usage/analyze_seq.wiki delete mode 100644 bp_usage/count_vals create mode 100644 bp_usage/count_vals.wiki delete mode 100644 bp_usage/uniq_vals create mode 100644 bp_usage/uniq_vals.wiki diff --git a/bp_usage/analyze_seq b/bp_usage/analyze_seq deleted file mode 100644 index 55b45f8..0000000 --- a/bp_usage/analyze_seq +++ /dev/null @@ -1,26 +0,0 @@ -Author: Martin Asser Hansen - Copyright (C) - All rights reserved - -Contact: mail@maasha.dk - -Date: August 2007 - -License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) - -Description: Analysis the residue composition of each sequence in stream. - -Usage: ... | $script [options] - -Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN -Options: [-O | --stream_out=] - Write output to file - Default=STDOUT - -Examples: ... | $script - Analyzes all sequences in stream. - -Keys out: SEQ_TYPE - Guessed Sequence type. -Keys out: SEQ_LEN - Sequence length. -Keys out: RES - Residue count. -Keys out: RES_SUM - Sum of all non-indel residues. -Keys out: GC% - GC content in percent for DNA/RNA sequences. -Keys out: HARD_MASK% - Percentage of sequence hard-masked with N's. -Keys out: SOFT_MASK% - Percentage of sequence soft-masked with lower case letters. 
-Keys out: MIX_INDEX - Sequence composition index: most common residue over the sequence length. -Keys out: MELT_TEMP - Melting temperature of DNA/RNA sequence: 4 degrees per GC pair, 2 degrees per AT/U pair. diff --git a/bp_usage/analyze_seq.wiki b/bp_usage/analyze_seq.wiki new file mode 100644 index 0000000..2350dfe --- /dev/null +++ b/bp_usage/analyze_seq.wiki @@ -0,0 +1,132 @@ +=Biopiece: analyze_seq= + +==Synopsis== + +Analyzes the sequence composition of sequences in the stream. + +==Description== + +[analyze_seq] analyzes the sequence specified by the SEQ key in each record. This is +done by automagically guessing the sequence type based on the first 100 residues, and +then determining the frequency of relevant residues and indels. Furthermore, GC% is +determined for nucleotide sequence as well as SOFT_MASK% (soft masked sequence is +indicated by lower case letters), and HARD_MASK% (hard masked sequence consists of N's). +Also, a MIX_INDEX is calculated that indicates the complexity of the sequence defined as +the most common residue over the sequence length (that [analyze_seq] also determines). +Finally, for nucleotide sequences, a melting temperature is calculated (4 degrees per GC-pair, +2 degrees per AT/U-pair). + +==Usage== + +{{{ +... 
| analyze_seq [options] +}}} + +==Options== + +{{{ +[-I | --stream_in=] - Read input from stream file - Default=STDIN +[-O | --stream_out=] - Write output to file - Default=STDOUT +}}} + +==Examples== + +Consider the file `test.fna` containing the single entry: + +{{{ +>test +ACGACGCATNNNNNNactgatcga +}}} + +To analyze this sequence, read the file using [read_fasta] and then pipe the stream through [analyze_seq]: + +{{{ +read_fasta -i test.fna | analyze_seq + +RES:D: 0 +MIX_INDEX: 0.25 +RES:W: 0 +RES:G: 4 +RES:B: 0 +SOFT_MASK%: 37.50 +RES:V: 0 +SEQ_NAME: test +RES_SUM: 24 +HARD_MASK%: 25.00 +RES:H: 0 +RES:S: 0 +RES:.: 0 +RES:N: 6 +RES:A: 6 +GC%: 37.50 +MELT_TEMP: 54.00 +RES:Y: 0 +RES:M: 0 +RES:T: 3 +RES:_: 0 +RES:K: 0 +SEQ_TYPE: DNA +RES:~: 0 +SEQ: ACGACGCATNNNNNNactgatcga +RES:R: 0 +SEQ_LEN: 24 +RES:C: 5 +RES:-: 0 +RES:U: 0 +--- +}}} + +If you have a stack of sequences in one file and you want to determine the mean GC content +of all the sequences, you can do it using the [mean_vals] biopiece: + +{{{ +read_fasta -i test.fna | analyze_seq | mean_vals -k GC% -x + +GC%_MEAN: 37.50 +--- +}}} + +Similarly, if you want the total count of Ns in all sequences use the biopiece [sum_vals]: + +{{{ +read_fasta -i test.fna | analyze_seq | sum_vals -k RES:N + +RES:N_SUM: 6 +--- +}}} + +Finally, if you want to remove low complexity sequence from the stream, [grab] is your friend: + +{{{ +read_fasta -i test.fna | analyze_seq | grab -e 'MIX_INDEX<0.85' +}}} + +==See also== + +[read_fasta] + +[grab] + +[mean_vals] + +[sum_vals] + +==Author== + +Martin Asser Hansen - Copyright (C) - All rights reserved. + +mail@maasha.dk + +August 2007 + +==License== + +GNU General Public License version 2 + +http://www.gnu.org/copyleft/gpl.html + +==Help== + +[analyze_seq] is part of the Biopieces framework. 
+ +http://code.google.com/p/biopieces/ diff --git a/bp_usage/count_vals b/bp_usage/count_vals deleted file mode 100644 index 8a18d0f..0000000 --- a/bp_usage/count_vals +++ /dev/null @@ -1,18 +0,0 @@ -Author: Martin Asser Hansen - Copyright (C) - All rights reserved - -Contact: mail@maasha.dk - -Date: August 2007 - -License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) - -Description: Count the number of times values of given keys exists in stream. - -Usage: ... | count_vals [options] - -Options: [-k | --keys=] - Comma separeted list of keys -Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN -Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT - -Examples: ... | count_vals -k SEQ - Count occurence of each SEQ in stream. - diff --git a/bp_usage/count_vals.wiki b/bp_usage/count_vals.wiki new file mode 100644 index 0000000..afb5bb0 --- /dev/null +++ b/bp_usage/count_vals.wiki @@ -0,0 +1,120 @@ +=Biopiece: count_vals= + +==Synopsis== + +Count the number of times values of given keys exist in the stream. + +==Description== + +Given a comma separated list of keys, [count_vals] counts, for each of these keys, the +number of identical values. Since the count is basically dependent on one hash per key, +[count_vals] has the potential to exhaust the memory quite easily. This is countered by +caching the count to disk for every 5 million records; however, the disk caching may be slow. + +==Usage== + +{{{ +... | count_vals [options] +}}} + +==Options== + +{{{ +[-k | --keys=] - Comma separated list of keys. 
+[-I | --stream_in=] - Read input from stream file - Default=STDIN +[-O | --stream_out=] - Write output to stream file - Default=STDOUT +}}} + +==Examples== + +Consider the following two column table in the file `test.tab`: + +{{{ +Human H2 +Human H3 +Dog D1 +Dog D2 +Mouse M1 +}}} + +To count the values of both columns we first read the table with [read_tab] and then pipe the result to [count_vals]: + +{{{ +read_tab -i test.tab | count_vals -k V0,V1 + +V0: Human +V1_COUNT: 1 +V1: H1 +V0_COUNT: 3 +--- +V0: Human +V1_COUNT: 1 +V1: H2 +V0_COUNT: 3 +--- +V0: Human +V1_COUNT: 1 +V1: H3 +V0_COUNT: 3 +--- +V0: Dog +V1_COUNT: 1 +V1: D1 +V0_COUNT: 2 +--- +V0: Dog +V1_COUNT: 1 +V1: D2 +V0_COUNT: 2 +--- +V0: Mouse +V1_COUNT: 1 +V1: M1 +V0_COUNT: 1 +--- +}}} + +The result is that for each of the specified keys (V0 and V1) a new key with the suffix _COUNT +is added where the value is the global count. The result is better displayed after piping through [write_tab]: + +{{{ +read_tab -i test.tab | count_vals -k V0,V1 | write_tab -xck V0,V0_COUNT,V1,V1_COUNT + +#V0 V0_COUNT V1 V1_COUNT +Human 3 H1 1 +Human 3 H2 1 +Human 3 H3 1 +Dog 2 D1 1 +Dog 2 D2 1 +Mouse 1 M1 1 +}}} + +==See also== + +[read_tab] + +[write_tab] + +[uniq_vals] + +[grab] + +==Author== + +Martin Asser Hansen - Copyright (C) - All rights reserved. + +mail@maasha.dk + +August 2007 + +==License== + +GNU General Public License version 2 + +http://www.gnu.org/copyleft/gpl.html + +==Help== + +[count_vals] is part of the Biopieces framework. 
+ +http://code.google.com/p/biopieces/ diff --git a/bp_usage/uniq_vals b/bp_usage/uniq_vals deleted file mode 100644 index dd0d53a..0000000 --- a/bp_usage/uniq_vals +++ /dev/null @@ -1,19 +0,0 @@ -Author: Martin Asser Hansen - Copyright (C) - All rights reserved - -Contact: mail@maasha.dk - -Date: August 2007 - -License: GNU General Public License version 2 (http://www.gnu.org/copyleft/gpl.html) - -Description: Locate records in stream where the values for a given key is unique or non-unique. - -Usage: ... | uniq_vals [options] - -Options: [-k | --key=] - Key for which the value is checked for uniqueness. -Options: [-i | --invert] - Display non-unique records. -Options: [-I | --stream_in=] - Read input from stream file - Default=STDIN -Options: [-O | --stream_out=] - Write output to stream file - Default=STDOUT - -Examples: ... | uniq_vals -k SEQ_NAME - Locate records with unique SEQ_NAME value. - diff --git a/bp_usage/uniq_vals.wiki b/bp_usage/uniq_vals.wiki new file mode 100644 index 0000000..c0fe889 --- /dev/null +++ b/bp_usage/uniq_vals.wiki @@ -0,0 +1,135 @@ +=Biopiece: uniq_vals= + +==Synopsis== + +Select unique or non-unique records from the stream based on the value of a given key. + +==Description== + +[uniq_vals] selects records from the stream by checking values of a given key. If a duplicate +record exists, it will only be output once. Thus, [uniq_vals] does _not_ locate records +where the value of the specified key occurs only once (see [count_vals]). If the `-i` switch +is used, then non-unique records are located. + +==Usage== + +{{{ +... | uniq_vals [options] +}}} + +==Options== + +{{{ +[-k | --key=] - Key for which the value is checked for uniqueness. +[-i | --invert] - Display non-unique records. 
+[-I | --stream_in=] - Read input from stream file - Default=STDIN +[-O | --stream_out=] - Write output to stream file - Default=STDOUT +}}} + +==Examples== + +Consider the following two column table in the file `test.tab`: + +{{{ +Human H2 +Human H3 +Dog D1 +Dog D2 +Mouse M1 +}}} + +To locate all unique values of the first column we use [read_tab] and pipe the result to [uniq_vals]: + +{{{ +read_tab -i test.tab | uniq_vals -k V0 + +V0: Human +V1: H1 +--- +V0: Dog +V1: D1 +--- +V0: Mouse +V1: M1 +--- +}}} + +The result is three records, one _unique_ for each V0. + +If we instead want the non-unique records we use the `-i` switch with [uniq_vals]: + +{{{ +read_tab -i test.tab | uniq_vals -k V0 -i + +V0: Human +V1: H2 +--- +V0: Human +V1: H3 +--- +V0: Dog +V1: D2 +--- +}}} + +... and the result shows those records that have duplicate values for V0. + +So, how do we get the non-duplicated record with the `Mouse`? That is in fact not a job +for [uniq_vals], but rather for [count_vals] and [grab]. + +{{{ +read_tab -i test.tab | count_vals -k V0 | grab -e 'V0_COUNT=1' + +V0: Mouse +V1: M1 +V0_COUNT: 1 +--- +}}} + +However, if we use both [count_vals] and [uniq_vals] we can obtain a list of how many times +each of the records was duplicated based on the first column: + +{{{ +read_tab -i test.tab | count_vals -k V0 | uniq_vals -k 'V0_COUNT' + +V0: Human +V1: H1 +V0_COUNT: 3 +--- +V0: Dog +V1: D1 +V0_COUNT: 2 +--- +V0: Mouse +V1: M1 +V0_COUNT: 1 +--- +}}} + +==See also== + +[read_tab] + +[count_vals] + +[grab] + +==Author== + +Martin Asser Hansen - Copyright (C) - All rights reserved. + +mail@maasha.dk + +August 2007 + +==License== + +GNU General Public License version 2 + +http://www.gnu.org/copyleft/gpl.html + +==Help== + +[uniq_vals] is part of the Biopieces framework. + +http://code.google.com/p/biopieces/ -- 2.39.5