# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-# Join sequences in the stream.
+# Slice aligned sequences in the stream to obtain subsequences.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
raise "either --beg/--end or --forward/--reverse|--reverse_rc must be specified"
end
+# Optionally load a template sequence: when options[:template_file] is set,
+# open that file with Fasta.open and read one entry from it via get_entry.
+# Without the option, template is left as nil.
+# NOTE(review): the object returned by Fasta.open is not explicitly closed
+# here -- confirm that this is acceptable for this script.
+template = Fasta.open(options[:template_file]).get_entry if options[:template_file]
+
Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
input.each_record do |record|
if record[:SEQ]
entry = Seq.new(seq: record[:SEQ])
unless options[:beg]
- compact = Seq.new(seq: entry.seq.dup)
+ compact = template ? template : Seq.new(seq: entry.seq.dup)
compact.seq.delete! "-.~"
fmatch = compact.patmatch(options[:forward],
--- /dev/null
+SEQ: A-TGAC-GCTGGCGGCATGCTTTACACATGCAAGTCGAACG-GCAGCGG-----GGGCTTCGGCCT----GCC-G--GCGAG-TGGCGAACGGGTGAGTA
+SEQ_NAME: ID00000000
+SEQ_LEN: 100
+---
+SEQ: ACG-AC-GCTGGCGGCATGCTTA-CACATGCAAGTCGCACGAAC------------CTTTCGGG-------------GTT-GGTGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000001
+SEQ_LEN: 100
+---
+SEQ: ACG-AC-GTT-GCGATGCGTCTTAAGCATGCAAGTCGAGCGGGC--TTA----TTCGGGCAACTGGA----TA-A--GTTAG-CGGCGAACTGGTGAGTA
+SEQ_NAME: ID00000002
+SEQ_LEN: 100
+---
+SEQ: ACGA-C-GCTGGCGGCAGGCCTAATACATGCAAGTCGAGCTGCA------------CCTTCGGG-------------TGAGC-TGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000003
+SEQ_LEN: 100
+---
+SEQ: ACGA-C-GCTGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACGGGTGAGTA
+SEQ_NAME: ID00000004
+SEQ_LEN: 100
+---
+SEQ: ACGA-CAGCTGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACGGGTGAGTA
+SEQ_NAME: ID00000005
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGCGAA-AA-------CACTTCGGTG-------T-GAGTAGAG-CGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000006
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCCTAATACATGCAAGTCGAGCGGTC------------CTTTCGGG-------------GGCAG-CGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000007
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCTTAACACATGCAAGTCGAACG-ATGACTC----TCTAGCTTGCTAGA----GAAG--ATTAG-TGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000008
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCTTAACACATGCAAGTCGAACGGGC-----------A-CTTCGG-T------------GCTAG-TGGCAGACGGGTGAGTA
+SEQ_NAME: ID00000009
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------ATAGCAATAT------------GTCAG-CGGCAGACGGGTGAGTA
+SEQ_NAME: ID00000010
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACGGGTGAGTA
+SEQ_NAME: ID00000011
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCATGCTTAACACATGCAAGTCGCACGGTC-------------AGCAAT--------------GGCAG-TGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000012
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCATGCTTAACACATGCAAGTCGCGCGGTC-------------AGCAAT--------------GGCAG-CGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000013
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCGCGCCTAACACATGCAAGTCGAACGAGC--GAG--A-GAGAGCTTGCTTTCT---CG-A--GCGAG-TGGCGAACGGGTGAGTA
+SEQ_NAME: ID00000014
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGGAC--CGA--C-GGGAGCTTGCTCCCT---TA-G--GTCAG-CGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000015
+SEQ_LEN: 100
+---
+SEQ: ACGAAC-GCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGGAC--CGA--C-GGGAGCTTGCTCCCT---TA-G--GTCAG-CGGCGGACGGGTGAGTA
+SEQ_NAME: ID00000016
+SEQ_LEN: 100
+---
+SEQ: ACGACC-GCT-GCGGCGTGCCTAACACATGCAAGTCCGACGTGA--AAG-----GGGAGCAATCCC----CCG-G--TAGGG-TGGCAAACGGGTGAGTA
+SEQ_NAME: ID00000017
+SEQ_LEN: 100
+---
+SEQ: AT---C--AGGGCGGGATGCCTAACACATGCAAGTCGAACG-GCAGCACAG-GGAGAGCTTGCTCTC-TGGGT-G--GCGAG-TGGCGGACGGGTGAGGA
+SEQ_NAME: ID00000018
+SEQ_LEN: 100
+---
+SEQ: AT--AC--CTGGCGG-AGGCCT-ACACATGCAAGTCGTACG-GT-AGAC----AGAAACTTGCTTCT----CT-T--GAGAT-CCGCGGACGGGTGAGTA
+SEQ_NAME: ID00000019
+SEQ_LEN: 100
+---
--- /dev/null
+>template
+AT--AC-GCT-GCGG-AGGC-TAA-ACATGCA-GTCGGGCG-GAAACGA--T-GGTAGCTTGCTACCA--GGC-G--TCGAG-CGGCGGACGGGTGAGTA
--- /dev/null
+SEQ: AC-GCT
+SEQ_NAME: ID00000000
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000001
+SEQ_LEN: 6
+---
+SEQ: AC-GTT
+SEQ_NAME: ID00000002
+SEQ_LEN: 6
+---
+SEQ: -C-GCT
+SEQ_NAME: ID00000003
+SEQ_LEN: 6
+---
+SEQ: -C-GCT
+SEQ_NAME: ID00000004
+SEQ_LEN: 6
+---
+SEQ: -CAGCT
+SEQ_NAME: ID00000005
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000006
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000007
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000008
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000009
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000010
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000011
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000012
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000013
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000014
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000015
+SEQ_LEN: 6
+---
+SEQ: AC-GCT
+SEQ_NAME: ID00000016
+SEQ_LEN: 6
+---
+SEQ: CC-GCT
+SEQ_NAME: ID00000017
+SEQ_LEN: 6
+---
+SEQ: -C--AG
+SEQ_NAME: ID00000018
+SEQ_LEN: 6
+---
+SEQ: AC--CT
+SEQ_NAME: ID00000019
+SEQ_LEN: 6
+---
--- /dev/null
+SEQ: TGGCGGCATGCTTTACACATGCAAGTCGAACG-GCAGCGG-----GGGCTTCGGCCT----GCC-G--GCGAG-TGGCGAACGGGT
+SEQ_NAME: ID00000000
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCATGCTTA-CACATGCAAGTCGCACGAAC------------CTTTCGGG-------------GTT-GGTGGCGGACGGGT
+SEQ_NAME: ID00000001
+SEQ_LEN: 86
+---
+SEQ: T-GCGATGCGTCTTAAGCATGCAAGTCGAGCGGGC--TTA----TTCGGGCAACTGGA----TA-A--GTTAG-CGGCGAACTGGT
+SEQ_NAME: ID00000002
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCCTAATACATGCAAGTCGAGCTGCA------------CCTTCGGG-------------TGAGC-TGGCGGACGGGT
+SEQ_NAME: ID00000003
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACGGGT
+SEQ_NAME: ID00000004
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACGGGT
+SEQ_NAME: ID00000005
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCCTAACACATGCAAGTCGAGCGCGAA-AA-------CACTTCGGTG-------T-GAGTAGAG-CGGCGGACGGGT
+SEQ_NAME: ID00000006
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCCTAATACATGCAAGTCGAGCGGTC------------CTTTCGGG-------------GGCAG-CGGCGGACGGGT
+SEQ_NAME: ID00000007
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCTTAACACATGCAAGTCGAACG-ATGACTC----TCTAGCTTGCTAGA----GAAG--ATTAG-TGGCGGACGGGT
+SEQ_NAME: ID00000008
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCTTAACACATGCAAGTCGAACGGGC-----------A-CTTCGG-T------------GCTAG-TGGCAGACGGGT
+SEQ_NAME: ID00000009
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------ATAGCAATAT------------GTCAG-CGGCAGACGGGT
+SEQ_NAME: ID00000010
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACGGGT
+SEQ_NAME: ID00000011
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCATGCTTAACACATGCAAGTCGCACGGTC-------------AGCAAT--------------GGCAG-TGGCGGACGGGT
+SEQ_NAME: ID00000012
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCATGCTTAACACATGCAAGTCGCGCGGTC-------------AGCAAT--------------GGCAG-CGGCGGACGGGT
+SEQ_NAME: ID00000013
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCGCGCCTAACACATGCAAGTCGAACGAGC--GAG--A-GAGAGCTTGCTTTCT---CG-A--GCGAG-TGGCGAACGGGT
+SEQ_NAME: ID00000014
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCGTGCCTAATACATGCAAGTCGAGCGGAC--CGA--C-GGGAGCTTGCTCCCT---TA-G--GTCAG-CGGCGGACGGGT
+SEQ_NAME: ID00000015
+SEQ_LEN: 86
+---
+SEQ: TGGCGGCGTGCCTAATACATGCAAGTCGAGCGGAC--CGA--C-GGGAGCTTGCTCCCT---TA-G--GTCAG-CGGCGGACGGGT
+SEQ_NAME: ID00000016
+SEQ_LEN: 86
+---
+SEQ: T-GCGGCGTGCCTAACACATGCAAGTCCGACGTGA--AAG-----GGGAGCAATCCC----CCG-G--TAGGG-TGGCAAACGGGT
+SEQ_NAME: ID00000017
+SEQ_LEN: 86
+---
+SEQ: GGGCGGGATGCCTAACACATGCAAGTCGAACG-GCAGCACAG-GGAGAGCTTGCTCTC-TGGGT-G--GCGAG-TGGCGGACGGGT
+SEQ_NAME: ID00000018
+SEQ_LEN: 86
+---
+SEQ: TGGCGG-AGGCCT-ACACATGCAAGTCGTACG-GT-AGAC----AGAAACTTGCTTCT----CT-T--GAGAT-CCGCGGACGGGT
+SEQ_NAME: ID00000019
+SEQ_LEN: 86
+---
--- /dev/null
+SEQ: A-TGAC-GCTGGCGGCATGCTTTACACATGCAAGTCGAACG-GCAGCGG-----GGGCTTCGGCCT----GCC-G--GCGAG-TGGCGAACG
+SEQ_NAME: ID00000000
+SEQ_LEN: 92
+---
+SEQ: ACG-AC-GCTGGCGGCATGCTTA-CACATGCAAGTCGCACGAAC------------CTTTCGGG-------------GTT-GGTGGCGGACG
+SEQ_NAME: ID00000001
+SEQ_LEN: 92
+---
+SEQ: ACG-AC-GTT-GCGATGCGTCTTAAGCATGCAAGTCGAGCGGGC--TTA----TTCGGGCAACTGGA----TA-A--GTTAG-CGGCGAACT
+SEQ_NAME: ID00000002
+SEQ_LEN: 92
+---
+SEQ: ACGA-C-GCTGGCGGCAGGCCTAATACATGCAAGTCGAGCTGCA------------CCTTCGGG-------------TGAGC-TGGCGGACG
+SEQ_NAME: ID00000003
+SEQ_LEN: 92
+---
+SEQ: ACGA-C-GCTGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACG
+SEQ_NAME: ID00000004
+SEQ_LEN: 92
+---
+SEQ: ACGA-CAGCTGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACG
+SEQ_NAME: ID00000005
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGCGAA-AA-------CACTTCGGTG-------T-GAGTAGAG-CGGCGGACG
+SEQ_NAME: ID00000006
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCCTAATACATGCAAGTCGAGCGGTC------------CTTTCGGG-------------GGCAG-CGGCGGACG
+SEQ_NAME: ID00000007
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCTTAACACATGCAAGTCGAACG-ATGACTC----TCTAGCTTGCTAGA----GAAG--ATTAG-TGGCGGACG
+SEQ_NAME: ID00000008
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCTTAACACATGCAAGTCGAACGGGC-----------A-CTTCGG-T------------GCTAG-TGGCAGACG
+SEQ_NAME: ID00000009
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------ATAGCAATAT------------GTCAG-CGGCAGACG
+SEQ_NAME: ID00000010
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCAGGCTTAACACATGCAAGTCGAGCGGGC-----------GTAGCAATAC------------GTCAG-CGGCAGACG
+SEQ_NAME: ID00000011
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCATGCTTAACACATGCAAGTCGCACGGTC-------------AGCAAT--------------GGCAG-TGGCGGACG
+SEQ_NAME: ID00000012
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCATGCTTAACACATGCAAGTCGCGCGGTC-------------AGCAAT--------------GGCAG-CGGCGGACG
+SEQ_NAME: ID00000013
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCGCGCCTAACACATGCAAGTCGAACGAGC--GAG--A-GAGAGCTTGCTTTCT---CG-A--GCGAG-TGGCGAACG
+SEQ_NAME: ID00000014
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGGAC--CGA--C-GGGAGCTTGCTCCCT---TA-G--GTCAG-CGGCGGACG
+SEQ_NAME: ID00000015
+SEQ_LEN: 92
+---
+SEQ: ACGAAC-GCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGGAC--CGA--C-GGGAGCTTGCTCCCT---TA-G--GTCAG-CGGCGGACG
+SEQ_NAME: ID00000016
+SEQ_LEN: 92
+---
+SEQ: ACGACC-GCT-GCGGCGTGCCTAACACATGCAAGTCCGACGTGA--AAG-----GGGAGCAATCCC----CCG-G--TAGGG-TGGCAAACG
+SEQ_NAME: ID00000017
+SEQ_LEN: 92
+---
+SEQ: AT---C--AGGGCGGGATGCCTAACACATGCAAGTCGAACG-GCAGCACAG-GGAGAGCTTGCTCTC-TGGGT-G--GCGAG-TGGCGGACG
+SEQ_NAME: ID00000018
+SEQ_LEN: 92
+---
+SEQ: AT--AC--CTGGCGG-AGGCCT-ACACATGCAAGTCGTACG-GT-AGAC----AGAAACTTGCTTCT----CT-T--GAGAT-CCGCGGACG
+SEQ_NAME: ID00000019
+SEQ_LEN: 92
+---
--- /dev/null
+#!/bin/bash
+
+source "$BP_DIR/bp_test/lib/test.sh"
+
+run "$bp -I $in.1 -b 5 -e 10 -O $tmp"
+assert_no_diff $tmp $out.1
+clean
+
+run "$bp -I $in.1 -f TGGCGGCATG -r GCGAACGGGT -m 0 -i 0 -d 0 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in.1 -f TGGCGGCATG -R ACCCGTTCGC -m 0 -i 0 -d 0 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in.1 -f TGGgGGCATG -r GCGAtCGGGT -m 1 -i 0 -d 0 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in.1 -f TGGCGcGCATG -r GCGAtACGGGT -m 0 -i 1 -d 0 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in.1 -f TGGCGCATG -r GCGAAGGGT -m 0 -i 0 -d 1 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in.1 -f ATACGCTGCGGAGGCTA -r TTGCTACCAGGCGTCGAGCGGCGGACGGG -t $in.2 -m 0 -d 0 -i 0 -O $tmp"
+assert_no_diff $tmp $out.3
+clean
+