From fb0d4231f99ab2f078cfaf8c58dc49272e502658 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 7 Nov 2008 10:42:23 +0000 Subject: [PATCH] finished assemble_tag_contigs git-svn-id: http://biopieces.googlecode.com/svn/trunk@297 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/assemble_tag_contigs | 6 ++ code_perl/Maasha/Biopieces.pm | 149 ++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100755 bp_bin/assemble_tag_contigs diff --git a/bp_bin/assemble_tag_contigs b/bp_bin/assemble_tag_contigs new file mode 100755 index 0000000..4cd1d44 --- /dev/null +++ b/bp_bin/assemble_tag_contigs @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biopieces; diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 51821ac..c5c3c9d 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -185,6 +185,7 @@ sub run_script elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) } elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) } elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) } + elsif ( $script eq "assemble_tag_contigs" ) { script_assemble_tag_contigs( $in, $out, $options ) } elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) } elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) } elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) } @@ -2076,6 +2077,154 @@ sub script_read_mysql } +sub script_assemble_tag_contigs +{ + # Martin A. Hansen, November 2008. + + # Assemble tags from the stream into + # tag contigs. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $new_record, %fh_hash, $fh_out, $chr, $array, $pos, $beg, $end, $score, $file, $id ); + + while ( $record = get_record( $in ) ) + { + $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" }; + $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" }; + $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" }; + + if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) + { + $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } }; + + $fh_out = $fh_hash{ $record->{ "CHR" } }; + + Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 ); + } + } + + map { close $_ } keys %fh_hash; + + foreach $chr ( sort keys %fh_hash ) + { + $array = tag_contigs_assemble( "$BP_TMP/$chr" ); + + $pos = 0; + $id = 0; + + while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg ) + { + $new_record = { + CHR => $chr, + CHR_BEG => $beg, + CHR_END => $end - 1, + Q_ID => sprintf( "TC%06d", $id ), + SCORE => $score, + STRAND => $record->{ 'STRAND' } || '+', + }; + + put_record( $new_record, $out ); + + $pos = $end; + $id++; + } + + unlink "$BP_TMP/$chr"; + } +} + + +sub tag_contigs_assemble +{ + # Martin A. Hansen, November 2008. + + # Given a BED file with entries from only one + # chromosome assembles tag contigs from these + # ignoring strand information. Only tags with + # a score higher than the clone count over + # genomic loci (the SCORE field) is included + # in the tag contigs. + + # ----------- tags + # ------------- + # ---------- + # ---------- + # ======================== tag contig + + + my ( $path, # full path to BED file + ) = @_; + + # Returns an arrayref. + + my ( $fh, $entry, $clones, $score, @array ); + + $fh = Maasha::Common::read_open( $path ); + + while ( $entry = Maasha::UCSC::bed_get_entry( $fh ) ) + { + if ( $entry->{ 'Q_ID' } =~ /(\d+)$/ ) + { + $clones = $1; + + $score = int( $clones / $entry->{ 'SCORE' } ); + + map { $array[ $_ ] += $score } $entry->{ 'CHR_BEG' } .. $entry->{ 'CHR_END' } if $score >= 1; + } + } + + close $fh; + + return wantarray ? @array : \@array; +} + + +sub tag_contigs_scan +{ + # Martin A. Hansen, November 2008. + + # Scans an array with tag contigs and locates + # the next contig from a given position. The + # score of the tag contig is determined as the + # maximum value of the tag contig. If a tag contig + # is found a triple is returned with beg, end and score + # otherwise an empty triple is returned. + + my ( $array, # array to scan + $beg, # position to start scanning from + ) = @_; + + # Returns an arrayref. + + my ( $end, $score ); + + $score = 0; + + while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ } + + $end = $beg; + + while ( $array->[ $end ] ) + { + $score = Maasha::Calc::max( $score, $array->[ $end ] ); + + $end++ + } + + if ( $score > 0 ) { + return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ]; + } else { + return wantarray ? () : []; + } +} + + sub script_format_genome { # Martin A. Hansen, Juli 2008. -- 2.39.5