From a1f5e35425fe8b9582aed69da721c5c640d65870 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Sat, 14 Feb 2009 02:30:57 +0000 Subject: [PATCH] add latexdiff and txt2xls --- latexdiff | 3344 ++++++++++++++++++++++++++++++++++++++++++++++ latexize_invoice | 66 +- txt2xls | 146 ++ 3 files changed, 3548 insertions(+), 8 deletions(-) create mode 100755 latexdiff create mode 100755 txt2xls diff --git a/latexdiff b/latexdiff new file mode 100755 index 0000000..ee6c741 --- /dev/null +++ b/latexdiff @@ -0,0 +1,3344 @@ +#!/usr/bin/perl -w +# latexdiff - differences two latex files on the word level +# and produces a latex file with the differences marked up. +# +# Copyright (C) 2004-2007 F J Tilmann (tilmann@esc.cam.ac.uk) +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License Version 2 as published by +# the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Detailed usage information at the end of the file +# +# Version 0.5 A number of minor improvements based on feedback +# Deleted blocks are now shown before added blocks +# Package specific processing +# +# Version 0.43 unreleased typo in list of styles at the end +# Add protect to all \cbstart, \cbend commands +# More robust substitution of deleted math commands +# +# Version 0.42 November 06 Bug fixes only +# +# Version 0.4 March 06 option for fast differencing using UNIX diff command, several minor bug fixes (\par bug, improved highlighting of textcmds) +# +# Version 0.3 August 05 improved parsing of displayed math, --allow-spaces +# option, several minor bug fixes +# +# Version 0.25 October 04 Fix bug with deleted equations, add math mode commands to safecmd, add | to allowed interpunctuation signs +# Version 0.2 September 04 extension to utf-8 and variable encodings +# Version 0.1 August 04 First public release + +# Inserted block for differencing +# use Algorithm::Diff qw(traverse_sequences); +# in standard version +# The following BEGIN block contains a verbatim copy of +# Ned Konz' Algorithm::Diff package version 1.15 except +# that subroutine _longestCommonSubsequence has been replaced by +# a routine which internally uses the UNIX diff command for +# the differencing rather than the Perl routines if the +# length of the sequences exceeds some threshold. +# Also, all POD documentation has been stripped out. +# +# (the distribution on which this modification is based is available +# from http://search.cpan.org/~nedkonz/Algorithm-Diff-1.15 +# the most recent version can be found via http://search.cpan.org/search?module=Algorithm::Diff ) +# Please note that the LICENSE for Algorithm::Diff : +# "Copyright (c) 2000-2002 Ned Konz. 
All rights reserved. +# This program is free software; +# you can redistribute it and/or modify it under the same terms +# as Perl itself." +# The fast-differencing version of latexdiff is provided as a convenience +# for latex users under Unix-like systems which have a 'diff' command. +# If you believe +# the inlining of Algorithm::Diff violates its license please contact +# me and I will modify the latexdiff distribution accordingly. +# Frederik Tilmann (tilmann@esc.cam.ac.uk) +# Jonathan Paisley is acknowledged for the idea of using the system diff +# command to achieve shorter running times +BEGIN { +package Algorithm::Diff; +use strict; +use vars qw($VERSION @EXPORT_OK @ISA @EXPORT); +use integer; # see below in _replaceNextLargerWith() for mod to make + # if you don't use this +require Exporter; +@ISA = qw(Exporter); +@EXPORT = qw(); +@EXPORT_OK = qw(LCS diff traverse_sequences traverse_balanced sdiff); +$VERSION = sprintf('%d.%02d fast', (q$Revision: 1.15 $ =~ /\d+/g)); + +# Global parameters + +use File::Temp qw/tempfile/; +# if larger number of elements in longestCommonSubsequence smaller than +# this number, then use internal algorithm, otherwise use UNIX diff +use constant THRESHOLD => 100 ; +# Detect whether diff --minimal option is available +# if yes we use it +use constant MINIMAL => ( system('diff','--minimal','/dev/null','/dev/null') >> 8 ==0 ? "--minimal" : "" ) ; + + + +# McIlroy-Hunt diff algorithm +# Adapted from the Smalltalk code of Mario I. Wolczko, +# by Ned Konz, perl@bike-nomad.com + + +# Create a hash that maps each element of $aCollection to the set of positions +# it occupies in $aCollection, restricted to the elements within the range of +# indexes specified by $start and $end. +# The fourth parameter is a subroutine reference that will be called to +# generate a string to use as a key. +# Additional parameters, if any, will be passed to this subroutine. 
#
# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
#
# Each hash value is an array ref holding the matching indexes, highest
# index first.  In list context the hash itself is returned rather than a
# reference to it.

sub _withPositionsOfInInterval
{
    # Peel off the four fixed arguments; whatever remains in @_ is handed
    # on, untouched, to the key generator.
    my ( $aCollection, $start, $end, $keyGen ) = splice( @_, 0, 4 );

    my %positions;
    for my $index ( $start .. $end )
    {
        my $element = $aCollection->[$index];
        my $key     = $keyGen->( $element, @_ );

        # Prepending keeps each list in descending index order; the slot is
        # autovivified the first time a key is seen.
        unshift @{ $positions{$key} }, $index;
    }

    return wantarray ? %positions : \%positions;
}

# Locate the slot in the (numerically ascending) array where aValue belongs.
#   * aValue already sits in that slot  -> nothing changes, undef is returned.
#   * the slot is past the end          -> aValue is appended.
#   * otherwise                         -> the next larger element is
#                                          overwritten with aValue.
# The optional third argument is an upper index bound for the search; a false
# value (undef or 0) means "search up to the last element".
# The index that received aValue is returned.
# Roughly three quarters of the module's run time is spent here, so the body
# is kept deliberately lean.

sub _replaceNextLargerWith
{
    my ( $array, $aValue, $high ) = @_;
    $high = $#$array unless $high;

    # Empty array, or aValue is larger than everything stored so far: append.
    if ( $high == -1 or $aValue > $array->[-1] )
    {
        push @$array, $aValue;
        return $high + 1;
    }

    # Binary search between $lower and $upper for the insertion point.
    my ( $lower, $upper ) = ( 0, $high );
    until ( $lower > $upper )
    {
        # Integer division courtesy of the enclosing 'use integer';
        # wrap the expression in int() if that pragma is ever dropped.
        my $middle = ( $upper + $lower ) / 2;
        my $probe  = $array->[$middle];

        return undef if $aValue == $probe;

        if ( $aValue > $probe )
        {
            $lower = $middle + 1;
        }
        else
        {
            $upper = $middle - 1;
        }
    }

    # $lower now indexes the next larger element; replace it.
    $array->[$lower] = $aValue;
    return $lower;
}

# This method computes the longest common subsequence in $a and $b.

# Result is array or ref, whose contents is such that
# $a->[ $i ] == $b->[ $result[ $i ] ]
# foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
+ +# An additional argument may be passed; this is a hash or key generating +# function that should return a string that uniquely identifies the given +# element. It should be the case that if the key is the same, the elements +# will compare the same. If this parameter is undef or missing, the key +# will be the element as a string. + +# By default, comparisons will use "eq" and elements will be turned into keys +# using the default stringizing operator '""'. + +# Additional parameters, if any, will be passed to the key generation routine. + +sub _longestCommonSubsequence +{ + my $a = shift; # array ref + my $b = shift; # array ref + my $keyGen = shift; # code ref + my $compare; # code ref + + # set up code refs + # Note that these are optimized. + if ( !defined($keyGen) ) # optimize for strings + { + $keyGen = sub { $_[0] }; + $compare = sub { my ( $a, $b ) = @_; $a eq $b }; + } + else + { + $compare = sub { + my $a = shift; + my $b = shift; + &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ ); + }; + } + + my ( $aStart, $aFinish, $bStart, $bFinish, $matchVector ) = + ( 0, $#$a, 0, $#$b, [] ); + + # Check whether to use internal routine (small number of elements) + # or use it as a wrapper for UNIX diff + if ( ( $#$a > $#$b ? 
$#$a : $#$b) < THRESHOLD ) { + ### print STDERR "DEBUG: regular longestCommonSubsequence\n"; + # First we prune off any common elements at the beginning + while ( $aStart <= $aFinish + and $bStart <= $bFinish + and &$compare( $a->[$aStart], $b->[$bStart], @_ ) ) + { + $matchVector->[ $aStart++ ] = $bStart++; + } + + # now the end + while ( $aStart <= $aFinish + and $bStart <= $bFinish + and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) ) + { + $matchVector->[ $aFinish-- ] = $bFinish--; + } + + # Now compute the equivalence classes of positions of elements + my $bMatches = + _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ ); + my $thresh = []; + my $links = []; + + my ( $i, $ai, $j, $k ); + for ( $i = $aStart ; $i <= $aFinish ; $i++ ) + { + $ai = &$keyGen( $a->[$i], @_ ); + if ( exists( $bMatches->{$ai} ) ) + { + $k = 0; + for $j ( @{ $bMatches->{$ai} } ) + { + + # optimization: most of the time this will be true + if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j ) + { + $thresh->[$k] = $j; + } + else + { + $k = _replaceNextLargerWith( $thresh, $j, $k ); + } + + # oddly, it's faster to always test this (CPU cache?). + if ( defined($k) ) + { + $links->[$k] = + [ ( $k ? 
$links->[ $k - 1 ] : undef ), $i, $j ]; + } + } + } + } + + if (@$thresh) + { + for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] ) + { + $matchVector->[ $link->[1] ] = $link->[2]; + } + } + } + else { + my ($fha,$fhb,$fna,$fnb,$ele,$key); + my ($alines,$blines,$alb,$alf,$blb,$blf); + my ($minimal)=MINIMAL; + # large number of elements, use system diff + ### print STDERR "DEBUG: fast (diff) longestCommonSubsequence\n"; + + ($fha,$fna)=tempfile("DiffA-XXXX") or die "_longestCommonSubsequence: Cannot open tempfile for sequence A"; + ($fhb,$fnb)=tempfile("DiffB-XXXX") or die "_longestCommonSubsequence: Cannot open tempfile for sequence B"; + # prepare sequence A + foreach $ele ( @$a ) { + $key=&$keyGen( $ele, @_ ); + $key =~ s/\\/\\\\/g ; + $key =~ s/\n/\\n/sg ; + print $fha "$key\n" ; + } + close($fha); + # prepare sequence B + foreach $ele ( @$b ) { + $key=&$keyGen( $ele, @_ ); + $key =~ s/\\/\\\\/g ; + $key =~ s/\n/\\n/sg ; + print $fhb "$key\n" ; + } + close($fhb); + + open(DIFFPIPE, "diff $minimal $fna $fnb |") or die "_longestCommonSubsequence: Cannot launch diff process. $!" 
; + # The diff line numbering begins with 1, but Perl subscripts start with 0 + # We follow the diff numbering but substract 1 when assigning to matchVector + $aStart++; $bStart++ ; $aFinish++ ; $bFinish++ ; + while( ) { + if ( ($alines,$blines) = ( m/^(\d*(?:,\d*)?)?c(\d*(?:,\d*)?)?$/ ) ) { + ($alb,$alf)=split(/,/,$alines); + ($blb,$blf)=split(/,/,$blines); + $alf=$alb unless defined($alf); + $blf=$blb unless defined($blf); + while($aStart < $alb ) { + $matchVector->[ -1 + $aStart++ ] = -1 + $bStart++ ; + } + # check for consistency + $bStart==$blb or die "_longestCommonSubsequence: Fatal error in interpreting diff output: Inconsistency in changed sequence"; + $aStart=$alf+1; + $bStart=$blf+1; + } + elsif ( ($alb,$blines) = ( m/^(\d*)a(\d*(?:,\d*)?)$/ ) ) { + ($blb,$blf)=split(/,/,$blines); + $blf=$blb unless defined($blf); + while ( $bStart < $blb ) { + $matchVector->[ -1 + $aStart++ ] = -1 + $bStart++ ; + } + $aStart==$alb+1 or die "_longestCommonSubsequence: Fatal error in interpreting diff output: Inconsistency in appended sequence near elements $aStart and $bStart"; + $bStart=$blf+1; + } + elsif ( ($alines,$blb) = ( m/^(\d*(?:,\d*)?)d(\d*)$/ ) ) { + ($alb,$alf)=split(/,/,$alines); + $alf=$alb unless defined($alf); + while ( $aStart < $alb ) { + $matchVector->[ -1 + $aStart++ ] = -1 + $bStart++ ; + } + $bStart==$blb+1 or die "_longestCommonSubsequence: Fatal error in interpreting diff output: Inconsistency in deleted sequence near elements $aStart and $bStart"; + $aStart=$alf+1; + } + elsif ( m/^Binary files/ ) { + # if diff reports it is a binary file force --text mode. I do not like + # to always use this option because it is probably only available in GNU diff + open(DIFFPIPE, "diff --text $fna $fnb |") or die "Cannot launch diff process. $!" 
; + } + # Default: just skip line + } + while ($aStart <= $aFinish ) { + $matchVector->[ -1 + $aStart++ ] = -1 + $bStart++ ; + } + $bStart==$bFinish+1 or die "_longestCommonSubsequence: Fatal error in interpreting diff output: Inconsistency at end"; + close DIFFPIPE; + # check whether a system error has occurred or return status is greater than or equal to 5 + if ( $! || ($? >> 8) > 5) { + print STDERR "diff process failed with exit code ", ($? >> 8), " $!\n"; + die; + } + unlink $fna,$fnb ; + } + return wantarray ? @$matchVector : $matchVector; +} + +sub traverse_sequences +{ + my $a = shift; # array ref + my $b = shift; # array ref + my $callbacks = shift || {}; + my $keyGen = shift; + my $matchCallback = $callbacks->{'MATCH'} || sub { }; + my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; + my $finishedACallback = $callbacks->{'A_FINISHED'}; + my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; + my $finishedBCallback = $callbacks->{'B_FINISHED'}; + my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ ); + + # Process all the lines in @$matchVector + my $lastA = $#$a; + my $lastB = $#$b; + my $bi = 0; + my $ai; + + for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ ) + { + my $bLine = $matchVector->[$ai]; + if ( defined($bLine) ) # matched + { + &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine; + &$matchCallback( $ai, $bi++, @_ ); + } + else + { + &$discardACallback( $ai, $bi, @_ ); + } + } + + # The last entry (if any) processed was a match. + # $ai and $bi point just past the last matching lines in their sequences. + + while ( $ai <= $lastA or $bi <= $lastB ) + { + + # last A? + if ( $ai == $lastA + 1 and $bi <= $lastB ) + { + if ( defined($finishedACallback) ) + { + &$finishedACallback( $lastA, @_ ); + $finishedACallback = undef; + } + else + { + &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB; + } + } + + # last B? 
+ if ( $bi == $lastB + 1 and $ai <= $lastA ) + { + if ( defined($finishedBCallback) ) + { + &$finishedBCallback( $lastB, @_ ); + $finishedBCallback = undef; + } + else + { + &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA; + } + } + + &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA; + &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB; + } + + return 1; +} + +sub traverse_balanced +{ + my $a = shift; # array ref + my $b = shift; # array ref + my $callbacks = shift || {}; + my $keyGen = shift; + my $matchCallback = $callbacks->{'MATCH'} || sub { }; + my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; + my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; + my $changeCallback = $callbacks->{'CHANGE'}; + my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ ); + + # Process all the lines in match vector + my $lastA = $#$a; + my $lastB = $#$b; + my $bi = 0; + my $ai = 0; + my $ma = -1; + my $mb; + + while (1) + { + + # Find next match indices $ma and $mb + do { $ma++ } while ( $ma <= $#$matchVector && !defined $matchVector->[$ma] ); + + last if $ma > $#$matchVector; # end of matchVector? 
+ $mb = $matchVector->[$ma]; + + # Proceed with discard a/b or change events until + # next match + while ( $ai < $ma || $bi < $mb ) + { + + if ( $ai < $ma && $bi < $mb ) + { + + # Change + if ( defined $changeCallback ) + { + &$changeCallback( $ai++, $bi++, @_ ); + } + else + { + &$discardACallback( $ai++, $bi, @_ ); + &$discardBCallback( $ai, $bi++, @_ ); + } + } + elsif ( $ai < $ma ) + { + &$discardACallback( $ai++, $bi, @_ ); + } + else + { + + # $bi < $mb + &$discardBCallback( $ai, $bi++, @_ ); + } + } + + # Match + &$matchCallback( $ai++, $bi++, @_ ); + } + + while ( $ai <= $lastA || $bi <= $lastB ) + { + if ( $ai <= $lastA && $bi <= $lastB ) + { + + # Change + if ( defined $changeCallback ) + { + &$changeCallback( $ai++, $bi++, @_ ); + } + else + { + &$discardACallback( $ai++, $bi, @_ ); + &$discardBCallback( $ai, $bi++, @_ ); + } + } + elsif ( $ai <= $lastA ) + { + &$discardACallback( $ai++, $bi, @_ ); + } + else + { + + # $bi <= $lastB + &$discardBCallback( $ai, $bi++, @_ ); + } + } + + return 1; +} + +sub LCS +{ + my $a = shift; # array ref + my $matchVector = _longestCommonSubsequence( $a, @_ ); + my @retval; + my $i; + for ( $i = 0 ; $i <= $#$matchVector ; $i++ ) + { + if ( defined( $matchVector->[$i] ) ) + { + push ( @retval, $a->[$i] ); + } + } + return wantarray ? @retval : \@retval; +} + +sub diff +{ + my $a = shift; # array ref + my $b = shift; # array ref + my $retval = []; + my $hunk = []; + my $discard = sub { push ( @$hunk, [ '-', $_[0], $a->[ $_[0] ] ] ) }; + my $add = sub { push ( @$hunk, [ '+', $_[1], $b->[ $_[1] ] ] ) }; + my $match = sub { push ( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] }; + traverse_sequences( $a, $b, + { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ ); + &$match(); + return wantarray ? 
@$retval : $retval; +} + +sub sdiff +{ + my $a = shift; # array ref + my $b = shift; # array ref + my $retval = []; + my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) }; + my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) }; + my $change = sub { + push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] ); + }; + my $match = sub { + push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] ); + }; + traverse_balanced( + $a, + $b, + { + MATCH => $match, + DISCARD_A => $discard, + DISCARD_B => $add, + CHANGE => $change, + }, + @_ + ); + return wantarray ? @$retval : $retval; +} + +1; +} +import Algorithm::Diff qw(traverse_sequences); +# End of inserted block for stand-alone version + + +use Getopt::Long ; +use strict ; +use utf8 ; + +my ($algodiffversion)=split(/ /,$Algorithm::Diff::VERSION); + + +my ($versionstring)=< \$type, + 'subtype|s=s' => \$subtype, + 'floattype|f=s' => \$floattype, + 'config|c=s' => \@configlist, + 'preamble|p=s' => \$preamblefile, + 'encoding|e=s' => \$encoding, + 'exclude-safecmd|A=s' => \@excludesafelist, + 'replace-safecmd=s' => \$replacesafe, + 'append-safecmd|a=s' => \@appendsafelist, + 'exclude-textcmd|X=s' => \@excludetextlist, + 'replace-textcmd=s' => \$replacetext, + 'append-textcmd|x=s' => \@appendtextlist, + 'replace-context1cmd=s' => \$replacecontext1, + 'append-context1cmd=s' => \@appendcontext1list, + 'replace-context2cmd=s' => \$replacecontext2, + 'append-context2cmd=s' => \@appendcontext2list, + 'show-preamble' => \$showpreamble, + 'show-safecmd' => \$showsafe, + 'show-textcmd' => \$showtext, + 'show-config' => \$showconfig, + 'show-all' => \$showall, + 'packages=s' => \@packagelist, + 'verbose|V' => \$verbose, + 'ignore-warnings' => \$ignorewarnings, + 'allow-spaces' => \$allowspaces, + 'flatten' => \$flatten, + 'version' => \$version, + 'help|h|H' => \$help); + +if ( $help ) { + usage() ; +} + + +if ( $version ) { + die $versionstring ; +} + +print STDERR $versionstring if $verbose; + +if 
(defined($showall)){ + $showpreamble=$showsafe=$showtext=$showconfig=1; +} + +# setting extra preamble commands +if (defined($preamblefile)) { + $latexdiffpreamble=join "\n",(extrapream($preamblefile),""); +} else { + $latexdiffpreamble=join "\n",(extrapream($type,$subtype,$floattype),""); +} + +# setting up @SAFECMDLIST and @SAFECMDEXCL +if (defined($replacesafe)) { + init_regex_arr_ext(\@SAFECMDLIST,$replacesafe); +} else { + init_regex_arr_data(\@SAFECMDLIST, "SAFE COMMANDS"); +} +foreach $appendsafe ( @appendsafelist ) { + init_regex_arr_ext(\@SAFECMDLIST, $appendsafe); +} +foreach $excludesafe ( @excludesafelist ) { + init_regex_arr_ext(\@SAFECMDEXCL, $excludesafe); +} + +# Special: treat all cite commands as safe except in UNDERLINE and FONTSTRIKE mode +# (there is a conflict between citation and ulem package, see +# package documentation) +if ( uc($type) ne "UNDERLINE" && uc($type) ne "FONTSTRIKE" && uc($type) ne "CULINECHBAR" ) { + push (@SAFECMDLIST, qr/^cite.*$/); +} + +# setting up @TEXTCMDLIST and @TEXTCMDEXCL +if (defined($replacetext)) { + init_regex_arr_ext(\@TEXTCMDLIST,$replacetext); +} else { + init_regex_arr_data(\@TEXTCMDLIST, "TEXT COMMANDS"); +} +foreach $appendtext ( @appendtextlist ) { + init_regex_arr_ext(\@TEXTCMDLIST, $appendtext); +} +foreach $excludetext ( @excludetextlist ) { + init_regex_arr_ext(\@TEXTCMDEXCL, $excludetext); +} + + +# setting up @CONTEXT1CMDLIST ( @CONTEXT1CMDEXCL exist but is always empty ) +if (defined($replacecontext1)) { + init_regex_arr_ext(\@CONTEXT1CMDLIST,$replacecontext1); +} else { + init_regex_arr_data(\@CONTEXT1CMDLIST, "CONTEXT1 COMMANDS"); +} +foreach $appendcontext1 ( @appendcontext1list ) { + init_regex_arr_ext(\@CONTEXT1CMDLIST, $appendcontext1); +} + + +# setting up @CONTEXT2CMDLIST ( @CONTEXT2CMDEXCL exist but is always empty ) +if (defined($replacecontext2)) { + init_regex_arr_ext(\@CONTEXT2CMDLIST,$replacecontext2); +} else { + init_regex_arr_data(\@CONTEXT2CMDLIST, "CONTEXT2 COMMANDS"); +} 
+foreach $appendcontext2 ( @appendcontext2list ) { + init_regex_arr_ext(\@CONTEXT2CMDLIST, $appendcontext2); +} + + + + +# setting configuration variables +@config=(); +foreach $config ( @configlist ) { + if (-f $config ) { + open(FILE,$config) or die ("Couldn't open configuration file $config: $!"); + while () { + chomp; + next if /^\s*#/ || /^\s*%/ || /^\s*$/ ; + push (@config,$_); + } + close(FILE); + } + else { +# foreach ( split(",",$config) ) { +# push @config,$_; +# } + push @config,split(",",$config) + } +} +foreach $assign ( @config ) { + $assign=~ m/\s*(\w*)\s*=\s*(\S*)\s*$/ or die "Illegal assignment $assign in configuration list (must be variable=value)"; + if ( $1 eq "MINWORDSBLOCK" ) { $MINWORDSBLOCK = $2; } + elsif ( $1 eq "FLOATENV" ) { $FLOATENV = $2 ; } + elsif ( $1 eq "PICTUREENV" ) { $PICTUREENV = $2 ; } + elsif ( $1 eq "MATHENV" ) { $MATHENV = $2 ; } + elsif ( $1 eq "MATHREPL" ) { $MATHREPL = $2 ; } + elsif ( $1 eq "MATHARRENV" ) { $MATHARRENV = $2 ; } + elsif ( $1 eq "MATHARRREPL" ) { $MATHARRREPL = $2 ; } + elsif ( $1 eq "ARRENV" ) { $ARRENV = $2 ; } + elsif ( $1 eq "COUNTERCMD" ) { $COUNTERCMD = $2 ; } + else { die "Unknown variable $1 in assignment.";} +} + + +foreach $pkg ( @packagelist ) { + map { $packages{$_}="" } split(/,/,$pkg) ; +} + +if ($showpreamble) { + print "\nPreamble commands:\n"; + print $latexdiffpreamble ; +} + +if ($showsafe) { + print "\nCommands safe within scope of $ADDOPEN $ADDCLOSE and $DELOPEN $DELCLOSE (unless excluded):\n"; + print_regex_arr(@SAFECMDLIST); + print "\nCommands not safe within scope of $ADDOPEN $ADDCLOSE and $DELOPEN $DELCLOSE :\n"; + print_regex_arr(@SAFECMDEXCL); +} + +if ($showtext) { + print "\nCommands with last argument textual (unless excluded) and safe in every context:\n"; + print_regex_arr(@TEXTCMDLIST); + print "\nContext1 commands (last argument textual, command will be disabled in deleted passages, last argument will be shown as plain text):\n"; + print_regex_arr(@CONTEXT1CMDLIST); + 
print "\nContext2 commands (last argument textual, command ant its argument will be disabled in deleted passages):\n"; + print_regex_arr(@CONTEXT2CMDLIST); + print "\nExclude list of Commands with last argument not textual (overrides patterns above):\n"; + print_regex_arr(@TEXTCMDEXCL); +} + +if ($showconfig) { + print "Configuration variables:\n"; + print "MINWORDSBLOCK=$MINWORDSBLOCK\n"; + print "FLOATENV=$FLOATENV\n"; + print "PICTUREENV=$PICTUREENV\n"; + print "MATHENV=$MATHENV\n"; + print "MATHREPL=$MATHREPL\n"; + print "MATHARRENV=$MATHARRENV\n"; + print "MATHARRREPL=$MATHARRREPL\n"; + print "ARRENV=$ARRENV\n"; + print "COUNTERCMD=$COUNTERCMD\n"; +} +if ($showconfig || $showtext || $showsafe || $showpreamble) { + exit 0; } +if ( @ARGV != 2 ) { + print STDERR "2 and only 2 non-option arguments required. Write latexdiff -h to get help\n"; + exit(2); +} + +# Are extra spaces between command arguments permissible? +my $extraspace; +if ($allowspaces) { + $extraspace='\s*'; +} else { + $extraspace=''; +} + +# append context lists to text lists (as text property is implied) +push @TEXTCMDLIST, @CONTEXT1CMDLIST; +push @TEXTCMDLIST, @CONTEXT2CMDLIST; + + +# Patterns. 
These are used by some of the subroutines, too +# I can only define them down here because value of extraspace depends on an option + my $pat0 = '(?:[^{}]|\\\{|\\\})*'; + my $pat1 = '(?:[^{}]|\\\{|\\\}|\{'.$pat0.'\})*'; + my $pat2 = '(?:[^{}]|\\\{|\\\}|\{'.$pat1.'\})*'; # + my $pat3 = '(?:[^{}]|\\\{|\\\}|\{'.$pat2.'\})*'; + my $pat4 = '(?:[^{}]|\\\{|\\\}|\{'.$pat3.'\})*'; + my $brat0 = '(?:[^\[\]]|\\\[|\\\])*'; + + my $quotemarks = '(?:\'\')|(?:\`\`)'; + my $punct='[0.,\/\'\`:;\"\?\(\)\[\]!~\p{IsNonAsciiPunct}\p{IsNonAsciiS}]'; + my $number='-?\d*\.\d*'; + my $mathpunct='[+=<>\-\|]'; + my $and = '&'; + my $coords= '[\-.,\s\d]*'; +# word: sequence of letters or accents followed by letter + my $word='(?:[-\w\d*]|\\\\[\"\'\`~^][A-Za-z\*])+'; + my $cmdleftright='\\\\(?:left|right)\s*(?:[()\[\]|]|\\\\(?:[|{}]|\w+))'; + + my $cmdoptseq='\\\\[\w\d\*]+'.$extraspace.'(?:(?:\['.$brat0.'\]|\{'. $pat4 . '\}|\(' . $coords .'\))'.$extraspace.')*'; + my $oneletcmd='\\\\.(?:\['.$brat0.'\]|\{'. $pat4 . '\})*'; + my $math='\$(?:[^$]|\\\$)*?\$|\\\\[(].*?\\\\[)]'; +# my $math='\$(?:[^$]|\\\$)*\$'; + my $comment='%.*?\n'; + my $pat=qr/(?:\A\s*)?(?:${and}|${quotemarks}|${number}|${word}|$cmdleftright|${cmdoptseq}|${math}|${oneletcmd}|${comment}|${punct}|${mathpunct}|\{|\})\s*/ ; + + + +# now we are done setting up and can start working +my ($oldfile, $newfile) = @ARGV; + +$encoding=guess_encoding($newfile) unless defined($encoding); + +$encoding = "utf8" if $encoding =~ m/^utf8/i ; +if (lc($encoding) eq "utf8" ) { + binmode(STDOUT, ":utf8"); + binmode(STDERR, ":utf8"); +} + +$old=read_file_with_encoding($oldfile,$encoding); +$new=read_file_with_encoding($newfile,$encoding); + + + + +# reset time +exetime(1); +($oldpreamble,$oldbody,$oldpost)=splitdoc($old,'\\\\begin\{document\}','\\\\end\{document\}'); + + +($newpreamble,$newbody,$newpost)=splitdoc($new,'\\\\begin\{document\}','\\\\end\{document\}'); + + +if ($flatten) { + $oldbody=flatten($oldbody,$oldpreamble,$oldfile,$encoding); + 
$newbody=flatten($newbody,$newpreamble,$newfile,$encoding); +} + +if ( length $oldpreamble && length $newpreamble ) { + @oldpreamble = split /\n/, $oldpreamble; + @newpreamble = split /\n/, $newpreamble; + + %packages=list_packages(@newpreamble) unless %packages; + if (defined $packages{"hyperref"} ) { + print STDERR "hyperref package detected.\n" if $verbose ; + $latexdiffpreamble =~ s/\{\\DIFadd\}/{\\DIFaddtex}/g; + $latexdiffpreamble =~ s/\{\\DIFdel\}/{\\DIFdeltex}/g; + $latexdiffpreamble .= join "\n",(extrapream("HYPERREF"),""); + } + # insert dummy first line such that line count begins with line 1 (rather than perl's line 0) - just so that line numbers inserted by linediff are correct + unshift @newpreamble,''; + unshift @oldpreamble,''; + print STDERR "Differencing preamble.\n" if $verbose; + @diffpreamble = linediff(\@oldpreamble, \@newpreamble); + # remove dummy line again + shift @diffpreamble; + push @diffpreamble,$latexdiffpreamble; + push @diffpreamble,'\begin{document}'; +} +elsif ( !length $oldpreamble && !length $newpreamble ) { + @diffpreamble=(); +} else { + print STDERR "Either both texts must have preamble or neither text must have the preamble.\n"; + exit(2); +} + +if (defined $packages{"amsmath"} ) { + print STDERR "amsmath package detected.\n" if $verbose ; + $MATHARRREPL='align*'; +} + +print STDERR "Preprocessing body. " if $verbose; +my ($oldleadin,$newleadin)=preprocess($oldbody,$newbody); + + +# run difference algorithm +@diffbody=bodydiff($oldbody, $newbody); +$diffbo=join("",@diffbody); +print STDERR "(",exetime()," s)\n","Postprocessing body. 
\n " if $verbose; +postprocess($diffbo); +$diffall =join("\n",@diffpreamble) ; +$diffall .= "$newleadin$diffbo" ; +$diffall .= "\\end{document}$newpost" if length $newpreamble ; +if ( lc($encoding) ne "utf8" && lc($encoding) ne "ascii" ) { + print STDERR "Encoding output file to $encoding\n" if $verbose; + $diffall=Encode::encode($encoding,$diffall); + binmode STDOUT; +} +print $diffall; + + +print STDERR "(",exetime()," s)\n","Done.\n" if $verbose; + + + +## guess_encoding(filename) +## reads the first 20 lines of filename and looks for call of inputenc package +## if found, return the option of this package (encoding), otherwise return ascii +sub guess_encoding { + my ($filename)=@_; + my ($i,$enc); + open (FH, $filename) or die("Couldn't open $filename: $!"); + $i=0; + while () { + next if /^\s*%/; # skip comment lines + if (m/\\usepackage\[(\w*?)\]\{inputenc\}/) { + close(FH); + return($1); + } + last if (++$i > 20 ); # scan at most 20 non-comment lines + } + close(FH); + return("ascii"); +} + + +sub read_file_with_encoding { + my ($output); + my ($filename, $encoding) = @_; + + if (lc($encoding) eq "utf8" ) { + open (FILE, "<:utf8",$filename) or die("Couldn't open $filename: $!"); + local $/ ; # locally set record operator to undefined, ie. enable whole-file mode + $output=; + } elsif ( lc($encoding) eq "ascii") { + open (FILE, $filename) or die("Couldn't open $filename: $!"); + local $/ ; # locally set record operator to undefined, ie. enable whole-file mode + $output=; + } else { + require Encode; + open (FILE, "<",$filename) or die("Couldn't open $filename: $!"); + local $/ ; # locally set record operator to undefined, ie. 
enable whole-file mode + $output=; + print STDERR "Converting $filename from $encoding to utf8\n" if $verbose; + $output=Encode::decode($encoding,$output); + } + close FILE; + return $output; +} + +# %packages=list_packages(@preamble) +# scans the arguments for \documentclass and \usepackage statements and constructs a hash +# whose keys are the included packages, and whose values are the associated optional arguments +sub list_packages { + my (@preamble)=@_; + my %packages=(); + foreach $line ( @preamble ) { + # get rid of comments + $line=~s/(?catfile($dirname,$fname), "\n"; + # content of file becomes replacement value, add \newpage if the command was include + $replacement=read_file_with_encoding(File::Spec->catfile($dirname,$fname), $encoding) or die + "Couldn't find file ",File::Spec->catfile($dirname,$fname),": $!"; + $newpage=(defined($2)? " \\newpage " : "") ; + "$newpage$replacement$newpage"; + }/exg; + + return($text); +} + + +# print_regex_arr(@arr) +# prints regex array without x-ism expansion put in by pearl to stdout +sub print_regex_arr { + my $dumstring; + $dumstring = join(" ",@_); # PERL generates string (?-xism:^ref$) for quoted refex ^ref$ + $dumstring =~ s/\(\?-xism:\^(.*?)\$\)/$1/g; # remove string and ^,$ marks before output + print $dumstring,"\n"; +} + + +# @lines=extrapream($type) +# reads line from appendix (end of file after __END__ token) +sub extrapream { + my $type; + my @retval=("%DIF PREAMBLE EXTENSION ADDED BY LATEXDIFF") ; + my ($copy); + + while (@_) { + $copy=0; + $type=shift ; + if ( -f $type ) { + open (FILE,$type) or die "Cannot open preamble file $type: $!"; + print STDERR "Reading preamble file $type\n" if $verbose ; + while () { + chomp ; + if ( $_ =~ m/%DIF PREAMBLE/ ) { + push (@retval,"$_"); + } else { + push (@retval,"$_ %DIF PREAMBLE"); + } + } + } + else { # not (-f $type) + $type=uc($type); # upcase argument + print STDERR "Preamble Internal Type $type\n" if $verbose; + while () { + if ( m/^%DIF $type/ ) { + 
$copy=1; } + elsif ( m/^%DIF END $type/ ) { + last; } + chomp; + push (@retval,"$_ %DIF PREAMBLE") if $copy; + } + if ( $copy == 0 ) { + print STDERR "\nPreamble style $type not implemented.\n"; + print STDERR "Write latexdiff -h to get help with available styles\n"; + exit(2); + } + seek DATA,0,0; # rewind DATA handle to file begin + } + } + push (@retval,"%DIF END PREAMBLE EXTENSION ADDED BY LATEXDIFF") ; + return @retval; +} + + +# ($part1,$part2,$part3)=splitdoc($text,$word1,$word2) +# splits $text into 3 parts at $word1 and $word2. +# if neither $word1 nor $word2 exist, $part1 and $part3 are empty, $part2 is $text +# If only $word1 or $word2 exist but not the other, output an error message. + +# NB this version avoids $` and $' for performance reason although it only makes a tiny difference +# (in one test gain a tenth of a second for a 30s run) +sub splitdoc { + my ($text,$word1,$word2)=@_; + my ($part1,$part2,$part3)=("","",""); + my ($rest,$pos); + + if ( $text =~ m/(^[^%]*)($word1)/mg ) { + $pos=pos $text; + $part1=substr($text,0,$pos-length($2)); + $rest=substr($text,$pos); + if ( $rest =~ m/(^[^%]*)($word2)/mg ) { + $pos=pos $rest; + $part2=substr($rest,0,$pos-length($2)); + $part3=substr($rest,$pos); + } + else { + die "$word1 and $word2 not in the correct order or not present as a pair." ; + } + } else { + $part2=$text; + die "$word2 present but not $word1." 
if ( $text =~ m/(^[^%]*)$word2/ms ); + } + return ($part1,$part2,$part3); +} + + + + + +# bodydiff($old,$new) +sub bodydiff { + my ($oldwords, $newwords) = @_; + my @retwords; + + print STDERR "(",exetime()," s)\n","Splitting into latex tokens \n" if $verbose; + print STDERR "Parsing $oldfile \n" if $verbose; + my @oldwords = splitlatex($oldwords); + print STDERR "Parsing $newfile \n" if $verbose; + my @newwords = splitlatex($newwords); + my $token; + + print STDERR "(",exetime()," s)\n","Pass 1: Expanding text commands and merging isolated identities with changed blocks " if $verbose; + pass1(\@oldwords, \@newwords); + + + print STDERR "(",exetime()," s)\n","Pass 2: inserting DIF tokens and mark up. " if $verbose; + + @retwords=pass2(\@oldwords, \@newwords); + + return(@retwords); +} + + + + +# @words=splitlatex($string) +# split string according to latex rules +# Each element of words is either +# a word (including trailing spaces and punctuation) +# a latex command +sub splitlatex { + my ($string) = @_ ; + my @retval=($string =~ m/$pat/osg); + + if (length($string) != length(join("",@retval))) { + print STDERR "\nWARNING: Inconsistency in length of input string and parsed string:\n This often indicates faulty or non-standard latex code.\n In many cases you can ignore this and the following warning messages.\n Note that character numbers in the following are counted beginning after \\begin{document} and are only approximate." 
unless $ignorewarnings; + print STDERR "DEBUG Original length ",length($string)," Parsed length ",length(join("",@retval)),"\n" if $debug; + print STDERR "DEBUG Input string: |$string|\n" if (length($string)<500) && $debug; + print STDERR "DEBUG Token parsing: |",join("+",@retval),"|\n" if (length($string)<500) && $debug ; + @retval=(); + # slow way only do this if other m//sg method fails + my $last = 0; + while ( $string =~ m/$pat/osg ) { + my $match=$&; + if ($last + length $& != pos $string ) { + my $pos=pos($string); + my $offset=30<$last ? 30 : $last; + my $dum=substr($string,$last-$offset,$pos-$last+2*$offset); + my $dum1=$dum; + my $cnt=$#retval; + my $i; + $dum1 =~ s/\n/ /g; + unless ($ignorewarnings) { + print STDERR "\n$dum1\n"; + print STDERR " " x 30,"^" x ($pos-$last)," " x 30,"\n"; + print STDERR "Missing characters near word " . (scalar @retval) . " character index: " . $last . "-" . pos($string) . " Length: " . length($match) . " Match: |$match| (expected match marked above).\n"; + } + # put in missing characters `by hand' + push (@retval, substr($dum,$offset,$pos-$last-length($match))); +# Note: there seems to be a bug in substr with utf8 that made the following line output substr which were too long, +# using dum instead appears to work +# push (@retval, substr($string,$last, pos($string)-$last-length($match))); + } + push (@retval, $match); + $last=pos $string; + } + + } + return @retval; +} + + +# pass1( \@seq1,\@seq2) +# Look for differences between seq1 and seq2. +# Where an common-subsequence block is flanked by deleted or appended blocks, +# and is shorter than $MINWORDSBLOCK words it is appended +# to the last deleted or appended word. If the block contains tokens other than words +# or punctuation it is not merged. +# Deleted or appended block consisting of words and safe commands only are +# also merged, to prevent break-up in pass2 (after previous isolated words have been removed) +# If there are commands with textual arguments (e.g. 
# \caption) both in corresponding
# appended and deleted blocks split them such that the command and opening bracket
# are one token, then the rest is split up following standard rules, and the closing
# bracket is a separate token, ie. turn
# "\caption{This is a textual argument}" into
# ("\caption{","This ","is ","a ","textual ","argument","}")
# No return value. Destructively changes sequences
#
# Arguments: references to the old (\@seq1) and new (\@seq2) token arrays.
# Reads the package globals $MINWORDSBLOCK, $brat0, $pat4, @SAFECMDLIST,
# @SAFECMDEXCL, $algodiffversion and $verbose.
# requires: traverse_sequences, Algorithm::Diff::_longestCommonSubsequence,
#           extracttextblocks, extractcommands, splitlatex, iscmd
sub pass1 {
  my ($seq1,$seq2)=@_;

  # Lengths before any modification. The deferred instructions address
  # elements as (index - length), i.e. relative to the end of the sequence.
  my $len1 = scalar @$seq1;
  my $len2 = scalar @$seq2;
  my $wpat=qr/^(?:[a-zA-Z.,'`:;?()!]*)[\s~]*$/;   #'

  my ($last1,$last2)=(-1,-1) ;   # index of the last discarded / appended token
  my $cnt=0;                     # length of the current run of matching tokens
  my $block=[];                  # tokens of the current run of matching tokens
  my $addblock=[];               # [ token, index ] pairs of the current appended block
  my $delblock=[];               # [ token, index ] pairs of the current deleted block
  my $todo=[];                   # deferred merge/split instructions, executed at the end
  my (@delmid,@addmid,@dummy);

  my ($addcmds,$delcmds,$matchindex);
  my ($addtextblocks,$deltextblocks);
  my ($addtokcnt,$deltokcnt,$mattokcnt)=(0,0,0);
  my ($addblkcnt,$delblkcnt,$matblkcnt)=(0,0,0);

  # Called for every changed token: if it directly follows a run of unchanged
  # tokens, decide whether that run is to be merged with the preceding change.
  my $adddiscard = sub {
    return unless $cnt > 0;
    $matblkcnt++;
    # just after an unchanged block
#    print STDERR "Unchanged block $cnt, $last1,$last2 \n";
    if ($cnt < $MINWORDSBLOCK
        && $cnt==scalar (
             grep { /^$wpat/ || ( /^\\([\w\d\*]+)((?:\[$brat0\]|\{$pat4\})*)/o
                                  && iscmd($1,\@SAFECMDLIST,\@SAFECMDEXCL)
                                  && scalar(@dummy=split(" ",$2))<3 ) }
             @$block) ) {
      # merge identical blocks shorter than $MINWORDSBLOCK
      # and only containing ordinary words
      # with preceding different word
      # We cannot carry out this merging immediately as this
      # would change the index numbers of seq1 and seq2 and confuse
      # the algorithm, instead we store in @$todo where we have to merge
      push(@$todo, [ $last1,$last2,$cnt,@$block ]);
    }
    $block = [];
    $cnt=0; $last1=-1; $last2=-1;
  };

  # token only in old sequence; $_[0] is its index in @$seq1
  my $discard = sub {
    $deltokcnt++;
    $adddiscard->();
    push(@$delblock,[ $seq1->[$_[0]],$_[0] ]);
    $last1=$_[0];
  };

  # token only in new sequence; $_[1] is its index in @$seq2
  my $add = sub {
    $addtokcnt++;
    $adddiscard->();
    push(@$addblock,[ $seq2->[$_[1]],$_[1] ]);
    $last2=$_[1];
  };

  my $match = sub {
    $mattokcnt++;
    if ($cnt==0) {   # first word of matching sequence after changed sequence or at beginning of word sequence
      $deltextblocks = extracttextblocks($delblock);
      $delblkcnt++ if scalar @$delblock;
      $addtextblocks = extracttextblocks($addblock);
      $addblkcnt++ if scalar @$addblock;

      $delcmds = extractcommands($delblock);
      $addcmds = extractcommands($addblock);
      # keygen(third argument of _longestCommonSubsequence) implies to sort on command (0th elements of $addcmd elements)
      # the calling format for longestCommonSubsequence has changed between versions of
      # Algorithm::Diff so we need to check which one we are using
      if ( $algodiffversion > 1.15 ) {
        ### Algorithm::Diff 1.19
        $matchindex=Algorithm::Diff::_longestCommonSubsequence($delcmds,$addcmds, 0, sub { $_[0]->[0] } );
      } else {
        ### Algorithm::Diff 1.15
        $matchindex=Algorithm::Diff::_longestCommonSubsequence($delcmds,$addcmds, sub { $_[0]->[0] } );
      }

      for my $i (0 .. $#$matchindex) {
        next unless defined($matchindex->[$i]);
        # lexical on purpose: the index used to be written to a package global $j
        my $j=$matchindex->[$i];
        @delmid=splitlatex($delcmds->[$i][3]);
        @addmid=splitlatex($addcmds->[$j][3]);
        # text blocks located before the command are scheduled first
        while (scalar(@$deltextblocks) && $deltextblocks->[0][0]<$delcmds->[$i][1]) {
          my ($index,$words,$n)=@{ shift(@$deltextblocks) };
          push(@$todo, [$index,-1,$n,@$words]);
        }
        push(@$todo, [ $delcmds->[$i][1],-1,-1,$delcmds->[$i][2],@delmid,$delcmds->[$i][4]]);

        while (scalar(@$addtextblocks) && $addtextblocks->[0][0]<$addcmds->[$j][1]) {
          my ($index,$words,$n)=@{ shift(@$addtextblocks) };
          push(@$todo, [-1,$index,$n,@$words]);
        }
        push(@$todo, [ -1,$addcmds->[$j][1],-1,$addcmds->[$j][2],@addmid,$addcmds->[$j][4]]);
      }
      # mop up remaining textblocks
      while (scalar(@$deltextblocks)) {
        my ($index,$words,$n)=@{ shift(@$deltextblocks) } ;
        push(@$todo, [$index,-1,$n,@$words]);
      }
      while (scalar(@$addtextblocks)) {
        my ($index,$words,$n)=@{ shift(@$addtextblocks) };
        push(@$todo, [-1,$index,$n,@$words]);
      }

      $addblock=[];
      $delblock=[];
    }
    push(@$block,$seq2->[$_[1]]);
    $cnt++;
  };

  # tokens are compared after normalising runs of white space
  my $keyfunc = sub { join(" ",split(" ",shift())) };

  traverse_sequences($seq1,$seq2, { MATCH=>$match, DISCARD_A=>$discard, DISCARD_B=>$add }, $keyfunc );


  # now carry out the merging/splitting. Refer to elements relative from
  # the end (with negative indices) as these offsets don't change before the instruction is executed
  # cnt>0: merged small unchanged groups with previous changed blocks
  # cnt==-1: split textual commands into components
  foreach my $instruction ( @$todo ) {
    my ($pos1,$pos2,$n,@words)=@$instruction ;
    if ($n>=0) {
      splice(@$seq1,$pos1-$len1,1+$n,join("",$seq1->[$pos1-$len1],@words)) if $pos1>=0;
      splice(@$seq2,$pos2-$len2,1+$n,join("",$seq2->[$pos2-$len2],@words)) if $pos2>=0;
    } else {
      splice(@$seq1,$pos1-$len1,1,@words) if $pos1>=0;
      splice(@$seq2,$pos2-$len2,1,@words) if $pos2>=0;
    }
  }

  if ($verbose) {
    print STDERR "\n";
    print STDERR " $mattokcnt matching tokens in $matblkcnt blocks.\n";
    print STDERR " $deltokcnt discarded tokens in $delblkcnt blocks.\n";
    print STDERR " $addtokcnt appended tokens in $addblkcnt blocks.\n";
  }
}


# extracttextblocks(\@blockindex)
# $blockindex has the following format
# [ [ token1, index1 ], [token2, index2],.. ]
# where index refers to the index in the original old or new word sequence
# Returns: reference to an array of the form
# [[ $index, $textblock, $cnt ], ..
# where $index index of block to be merged
# $textblock contains all the words to be merged with the word at $index (but does not contain this word)
# $cnt is length of block
#
# requires: iscmd
#
# A token counts as text if it is an ordinary word, or a command that is in
# the safe-command list but not in the text-command list. The first token of
# a run of text tokens only supplies $index; a run of a single token therefore
# produces no entry.
sub extracttextblocks {
  my $block=shift;
  my $wpat=qr/^(?:[a-zA-Z.,'`:;?()!]*)[\s~]*$/;  #'
  my $retval=[];
  my $pending=[];    # words following the anchor token of the current run
  my $anchor=-1;     # index of the first token of the current run, -1 if none

  # store the current run (if it has any followers)
  my $flush = sub {
    push(@$retval,[ $anchor, $pending, scalar(@$pending) ]) if scalar(@$pending);
  };

  foreach my $entry (@$block) {
    my ($token,$index)=@$entry;
    # store pure text blocks
    if ($token =~ /$wpat/ || ( $token =~/^\\([\w\d\*]+)((?:${extraspace}\[$brat0\]${extraspace}|${extraspace}\{$pat4\})*)/o
                               && iscmd($1,\@SAFECMDLIST,\@SAFECMDEXCL)
                               && !iscmd($1,\@TEXTCMDLIST,\@TEXTCMDEXCL))) {
      # we have text or a command which can be treated as text
      if ($anchor<0) {
        # new pure-text block
        $anchor=$index;
      } else {
        # add to pure-text block
        push(@$pending, $token);
      }
      next;
    }
    # it is not text: close the current run
    $flush->();
    $pending=[];
    $anchor=-1;
  }
  # finish processing a possibly unfinished block before returning
  $flush->();
  return($retval)
}



# extractcommands( \@blockindex )
# $blockindex has the following format
# [ [ token1, index1 ], [token2, index2],.. ]
# where index refers to the index in the original old or new word sequence
# Returns: reference to an array of the form
# [ [ "\cmd1", index, "\cmd1[optarg]{arg1}{", "arg2" ,"} " ],..
# where index is just taken from input array
# command must have a textual argument as last argument
#
# requires: iscmd
#
# Only tokens whose command name is accepted by
# iscmd(...,\@TEXTCMDLIST,\@TEXTCMDEXCL) are returned. The first closing brace
# of the trailing part is replaced by the placeholder \RIGHTBRACE.
sub extractcommands {
  my $block=shift;
  my $retval=[];

  foreach my $entry (@$block) {
    my ($token,$index)=@$entry;
    # check if token is an alphanumeric command sequence with at least one non-optional argument
    # \cmd[...]{...}{last argument}
    # Capturing in the following results in these associations
    # $1: \cmd[...]{...}{
    # $2: cmd (command name without the leading backslash)
    # $3: last argument
    # $4: } + trailing spaces
    if ( ( $token =~ m/^(\\([\w\d\*]+)(?:${extraspace}\[$brat0\]|${extraspace}\{$pat4\})*${extraspace}\{)($pat4)(\}\s*)$/so )
         && iscmd($2,\@TEXTCMDLIST,\@TEXTCMDEXCL) ) {
      my ($cmd,$open,$mid,$closing) = ($2,$1,$3,$4) ;
      $closing =~ s/\}/\\RIGHTBRACE/ ;
      push(@$retval,[ $cmd,$index,$open,$mid,$closing ]);
    }
  }
  return $retval;
}

# iscmd($cmd,\@regexarray,\@regexexcl) checks
# return 1 if $cmd matches any of the patterns in the
# array $@regexarray, and none of the patterns in \@regexexcl, otherwise return 0
# Every pattern has to match the whole of $cmd.
sub iscmd {
  my ($cmd,$regexar,$regexexcl)=@_;
  my $found=0;
  # lexical loop variables: the package global $pat is the token pattern
  # used by splitlatex and must not be aliased here
  foreach my $incl ( @$regexar ) {
    if ( $cmd =~ m/^${incl}$/ ) {
      $found=1 ;
      last;
    }
  }
  return 0 unless $found;
  foreach my $excl ( @$regexexcl ) {
    return 0 if ( $cmd =~ m/^${excl}$/ );
  }
  return 1;
}


# pass2( \@seq1,\@seq2)
# Look for differences between seq1 and seq2.
# Mark begin and end of deleted and appended sequences with tags $DELOPEN and $DELCLOSE
# and $ADDOPEN and $ADDCLOSE, respectively, however exclude { } & and all commands, unless
# they match an element of the whitelist (SAFECMD)
# For words in TEXTCMD but not in SAFECMD, enclose interior with $ADDOPEN and $ADDCLOSE brackets
# Deleted comment lines are marked with %DIF <
# Added comment lines are marked with %DIF >
#
# Returns the list of tokens of the marked-up document. Within a changed
# passage the deleted hunk is emitted before the added hunk.
# requires: traverse_sequences, marktags
sub pass2 {
  my ($seq1,$seq2)=@_;

  my ($addtokcnt,$deltokcnt,$mattokcnt)=(0,0,0);
  my ($addblkcnt,$delblkcnt)=(0,0);

  my $retval = [];
  my $delhunk = [];   # tokens deleted since the last match
  my $addhunk = [];   # tokens added since the last match

  # Write out (and count) the pending hunks. Used both when a matching token
  # ends a changed passage and after the traversal, so that hunks at the very
  # end of the sequences are included in the block statistics as well.
  my $flush = sub {
    if ( scalar @$delhunk ) {
      $delblkcnt++;
      # mark up changes, but comment out commands
      push @$retval,marktags($DELMARKOPEN,$DELMARKCLOSE,$DELOPEN,$DELCLOSE,$DELCMDOPEN,$DELCMDCLOSE,$DELCOMMENT,$delhunk);
      $delhunk = [];
    }
    if ( scalar @$addhunk ) {
      $addblkcnt++;
      # we mark up changes, but simply quote commands
      push @$retval,marktags($ADDMARKOPEN,$ADDMARKCLOSE,$ADDOPEN,$ADDCLOSE,"","",$ADDCOMMENT,$addhunk);
      $addhunk = [];
    }
  };

  my $discard = sub { $deltokcnt++;
                      push ( @$delhunk, $seq1->[$_[0]]) };

  my $add = sub { $addtokcnt++;
                  push ( @$addhunk, $seq2->[$_[1]]) };

  my $match = sub { $mattokcnt++;
                    $flush->();
                    push(@$retval,$seq2->[$_[1]]) };

  # tokens are compared after normalising runs of white space
  my $keyfunc = sub { join(" ",split(" ",shift())) };

  traverse_sequences($seq1,$seq2, { MATCH=>$match, DISCARD_A=>$discard, DISCARD_B=>$add }, $keyfunc );
  # clear up unprocessed hunks
  $flush->();


  if ($verbose) {
    print STDERR "\n";
    print STDERR " $mattokcnt matching tokens. \n";
    print STDERR " $deltokcnt discarded tokens in $delblkcnt blocks.\n";
    print STDERR " $addtokcnt appended tokens in $addblkcnt blocks.\n";
  }

  return(@$retval);
}

# marktags($openmark,$closemark,$open,$close,$opencmd,$closecmd,$comment,\@block)
# returns ($openmark,$open,$block,$close,$closemark) if @block only contains no commands (except white-listed ones),
# braces, ampersands, or comments
# mark comments with $comment
# exclude all other exceptions from scope of open, close like this
# ($openmark, $open,...,$close, $opencomd,command, command,$closecmd, $open, ..., $close, $closemark)
# If $opencmd begins with "%" marktags assumes it is operating on a deleted block, otherwise on an added block
#
# Side effect: @$block is replaced by its re-tokenised content.
# requires: splitlatex, iscmd
sub marktags {
  my ($openmark,$closemark,$open,$close,$opencmd,$closecmd,$comment,$block)=@_;
  my $retval=[];
  my $noncomment=0;   # set once a token other than a comment has been seen
  my $cmd=-1;    # -1 at beginning 0: last token written is a ordinary word
                 # 1: last token written is a command
                 # for keeping track whether we are just in a command sequence or in a word sequence
  my $cmdcomment= ($opencmd =~ m/^%/);  # Flag to indicate whether opencmd is a comment (i.e. if we intend to simply comment out changed commands)

  # split this block to flatten out sequences joined in pass1
  @$block=splitlatex(join "",@$block);
  foreach my $token (@$block) {
    my $word=$token;   # work on a copy, the elements of @$block stay untouched
    if ( $word =~ s/^%/%$comment/ ) {
      # a comment
      if ($cmd==1) {
        push (@$retval,$closecmd) ;
        $cmd=-1;
      }
      push (@$retval,$word);
      next;
    }
    if (! $noncomment) {
      push (@$retval,$openmark);
      $noncomment=1;
    }
    # negative lookahead pattern (?!) in second clause is put in to avoid matching \( .. \) patterns
    if ( $word =~ /^[&{}\[\]]/ || ( $word =~ /^\\(?!\()([\w*@]*)/ && !iscmd($1,\@SAFECMDLIST,\@SAFECMDEXCL)) ) {
      # word is a command or other significant token (not in SAFECMDLIST)
      ## same conditions as in subroutine extractcommand:
      # check if token is an alphanumeric command sequence with at least one non-optional argument
      # \cmd[...]{...}{last argument}
      # Capturing in the following results in these associations
      # $1: \cmd[...]{...}{
      # $2: cmd (command name without the leading backslash)
      # $3: last argument
      # $4: } + trailing spaces
      ### pre-0.3 if ( ( $token =~ m/^(\\([\w\d\*]+)(?:\[$brat0\]|\{$pat4\})*\{)($pat4)(\}\s*)$/so )
      if ( ( $word =~ m/^(\\([\w\d\*]+)(?:${extraspace}\[$brat0\]|${extraspace}\{$pat4\})*${extraspace}\{)($pat4)(\}\s*)$/so )
           && iscmd($2,\@TEXTCMDLIST,\@TEXTCMDEXCL) && ( !$cmdcomment || !iscmd($2,\@CONTEXT2CMDLIST, \@CONTEXT2CMDEXCL) ) ) {
        # word is a text command - we mark up the interior of the word. But if we are in a deleted block ($cmdcomment=1) and
        # $2 (the command) is in context2, just treat it as an ordinary command (i.e. comment it open with $opencmd)
        # Because we do not want to disable this command
        # here we do not use $opencmd and $closecmd($opencmd is empty)
        my ($command,$commandword,$argument,$closingbracket)=($1,$2,$3,$4);
        if ($cmd==1) {
          push (@$retval,$closecmd) ;
        } elsif ($cmd==0) {
          push (@$retval,$close) ;
        }
        my @argtext=splitlatex($argument);   # split textual argument into tokens
        # and mark it up (but we do not need openmark and closemark)
        # insert command with initial arguments, marked-up final argument, and closing bracket
        if ( $cmdcomment && iscmd($commandword,\@CONTEXT1CMDLIST, \@CONTEXT1CMDEXCL) ) {
          # context1cmd in a deleted environment; delete command itself but keep last argument, marked up
          push (@$retval,$opencmd);
          $command =~ s/\n/\n${opencmd}/sg ; # repeat opencmd at the beginning of each line
          # argument, note that the additional comment character is included
          # to suppress linebreak after opening parentheses, which is important
          # for latexrevise
          push (@$retval,$command,"%\n{$AUXCMD\n",marktags("","",$open,$close,$opencmd,$closecmd,$comment,\@argtext),$closingbracket);
        } else {
          # normal textcmd or context1cmd in an added block
          push (@$retval,$command,marktags("","",$open,$close,$opencmd,$closecmd,$comment,\@argtext),$closingbracket);
        }
        push (@$retval,$AUXCMD,"\n") if $cmdcomment ;
        $cmd=-1 ;
      } else {
        # ordinary command
        push (@$retval,$opencmd) if $cmd==-1 ;
        push (@$retval,$close,$opencmd) if $cmd==0 ;
        $word =~ s/\n/\n${opencmd}/sg if $cmdcomment ;   # if opencmd is a comment, repeat this at the beginning of every line
        push (@$retval,$word);
        $cmd=1;
      }
    } else {
      # just an ordinary word or word in SAFECMD
      push (@$retval,$open) if $cmd==-1 ;
      push (@$retval,$closecmd,$open) if $cmd==1 ;
      push (@$retval,$word);
      $cmd=0;
    }
  }
  push (@$retval,$close) if $cmd==0;
  push (@$retval,$closecmd) if $cmd==1;

  push (@$retval,$closemark) if ($noncomment);
  return @$retval;
}

# preprocess($string, ..)
+# carry out the following pre-processing steps for all arguments: +# 1. Remove leading white-space +# 2. mark all first empty line (in block of several) with \PAR tokens +# 3. Convert all '\%' into '\PERCENTAGE ' to make parsing regular expressions easier +# 4. Convert all \verb|some verbatim text| commands (where | can be an arbitrary character) +# into \verb{hash} +# 5. Convert \begin{verbatim} some verbatim text \end{verbatim} into \verbatim{hash} +# 6. Convert _n into \SUBSCRIPTNB{n} and _{nnn} into \SUBSCRIPT{nn} +# 7. Convert ^n into \SUPERSCRIPTNB{n} and ^{nnn} into \SUPERSCRIPT{nn} +# 8. a. Convert $$ $$ into \begin{DOLLARDOLLAR} \end{DOLLARDOLLAR} +# b. Convert \[ \] into \begin{SQUAREBRACKET} \end{SQUAREBRACKET} + +# 9. Add final token STOP to the very end. This is put in because the algorithm works better if the last token is identical. This is removed again in postprocessing. +# +# NB: step 6 and 7 is likely to convert some "_" inappropriately, e.g. in file +# names or labels but it does not matter because they are converted back in the postprocessing step +# Returns: leading white space removed in step 1 +sub preprocess { + my @leadin=() ; + for (@_) { + s/^(\s*)//s; + push(@leadin,$1); + s/\n(\s*?)\n((?:\s*\n)*)/\n$1\\PAR\n$2/g ; + s/(?{$hstr}) && $string ne $hash->{$hstr}) { + warn "Repeated hash value for verbatim mode in spite of different content."; + $hstr="-$hstr"; + } + $hash->{$hstr}=$string; + return($hstr); +} + +#string=fromhash(\%hash,$fromstring) +# restores string value stored in hash +#string=fromhash(\%hash,$fromstring,$prependstring) +# additionally begins each line with prependstring +sub fromhash { + my ($hash,$hstr)=($_[0],$_[1]); + my $retstr=$hash->{$hstr}; + if ( $#_ >= 2) { + $retstr =~ s/^/$_[2]/mg; + } + return $retstr; +} + + +# postprocess($string, ..) +# carry out the following post-processing steps for all arguments: +# * Remove STOP token from the end +# * Replace \RIGHTBRACE by } +# 1. 
Check all deleted blocks: +# a.where a deleted block contains a matching \begin and +# \end environment (these will be disabled by %DIFDELCMD statements), for selected environments enable +# these commands again (such that for example displayed math in a deleted equation +# is properly within math mode). For math mode environments replace numbered equation +# environments with their display only variety (so that equation numbers in new file and +# diff file are identical). Where the correct type of math environment cannot be determined +# use a place holder MATHMODE +# b.where one of the commands matching $COUNTERCMD is used as a DIFAUXCMD, add a statement +# subtracting one from the respective counter to keep numbering consistent with new file +# Replace all MATHMODE environment commands by the correct environment to achieve matching +# pairs +# c. If in-line math mode contains array environment, enclose the whole environment in \mbox'es +# 2. Remove DIFadd, DIFdel, DIFaddbegin , ... from picture environments +# 3. Convert DIFadd, DIFdel, DIFaddbegin , ... into FL varieties +# within floats (currently recognised float environments: plate,table,figure +# plus starred varieties). +# 4. Remove empty %DIFDELCMD < lines +# 4. Convert \begin{SQUAREBRACKET} \end{SQUAREBRACKET} into \[ \] +# Convert \begin{DOLLARDOLLAR} \end{DOLLARDOLLAR} into $$ $$ +# 5. Convert \SUPERSCRIPTNB{n} into ^n and \SUPERSCRIPT{nnn} into ^{nnn} +# 6. Convert \SUBSCRIPTNB{n} into _n and \SUBSCRIPT{nnn} into _{nnn} +# 7. Expand hashes of verb and verbatim environments +# 8. Convert '\PERCENTAGE ' back into '\%' +# 9. remove all \PAR tokens +# 10. package specific processing: endfloat: make sure \begin{figure} and \end{figure} are always +# on a line by themselves, similarly for table environment +# 4, undo renaming of the \begin and \end in comments + +# +# Note: have to manually synchronize substitution commands below and +# DIF..
command names in the header +sub postprocess { + my ($begin,$len,$cnt,$float,$delblock,$addblock); + # second level blocks + my ($begin2,$cnt2,$len2,$eqarrayblock,$mathblock); + + for (@_) { + + # change $'s in comments to something harmless + 1 while s/(%.*)\$/$1DOLLARDIF/mg ; + + # Remove final STOP token + s/ STOP$//; + # Replace \RIGHTBRACE by } + s/\\RIGHTBRACE/}/g; + # Check all deleted blocks: where a deleted block contains a matching \begin and + # \end environment (these will be disabled by a %DIFDELCMD statements), enable + # these commands again (such that for example displayed math in a deleted equation + # is properly within math mode. For math mode environments replace numbered equation + # environments with their display only variety (so that equation numbers in new file and + # diff file are identical + while ( m/\\DIFdelbegin.*?\\DIFdelend/sg ) { + $cnt=0; + $len=length($&); + $begin=pos($_) - $len; + $delblock=$&; + + + ### (.*?[^\n]?)\n? construct is necessary to avoid empty lines in math mode, which result in + ### an error + # displayed math environments + $delblock=~ s/(\%DIFDELCMD < \s*\\begin\{((?:$MATHENV)|SQUAREBRACKET)\}\s*?(?:$DELCMDCLOSE|\n))(.*?[^\n]?)\n?(\%DIFDELCMD < \s*\\end\{\2\})/\\begin{$MATHREPL}$AUXCMD\n$1$3\n\\end{$MATHREPL}$AUXCMD\n$4/sg; + # also transform the opposite pair \end{displaymath} .. \begin{displaymath} but we have to be careful not to interfere with the results of the transformation in the line directly above + $delblock=~ s/(?[$_[0]]) }; + my $add = sub { if (! scalar @$block) { + @begin=('a',$_[0],$_[1]) ;} + elsif ( $begin[0] eq 'd' ) { + $begin[0]='c'; $begin[2]=$_[1]; + push(@$block, "%DIF -------") } + push(@$block, $seq2->[$_[1]] . 
" %DIF > " ) }; + my $match = sub { if ( scalar @$block ) { + if ( $begin[0] eq 'd' && $begin[1]!=$_[0]-1) { + $instring = sprintf "%%DIF %d-%dd%d",$begin[1],$_[0]-1,$begin[2]; } + elsif ( $begin[0] eq 'a' && $begin[2]!=$_[1]-1) { + $instring = sprintf "%%DIF %da%d-%d",$begin[1],$begin[2],$_[1]-1; } + elsif ( $begin[0] eq 'c' ) { + $instring = sprintf "%%DIF %sc%s", + ($begin[1]==$_[0]-1) ? "$begin[1]" : $begin[1]."-".($_[0]-1) , + ($begin[2]==$_[1]-1) ? "$begin[2]" : $begin[2]."-".($_[1]-1) ; } + else { + $instring = sprintf "%%DIF %d%s%d",$begin[1],$begin[0],$begin[2]; } + push @$retseq, $instring,@$block, "%DIF -------" ; + $block = []; + } + push @$retseq, $seq2->[$_[1]] + }; + # key function: remove multiple spaces (such that insertion or deletion of redundant white space is not reported) + my $keyfunc = sub { join(" ",split(" ",shift())) }; + + traverse_sequences($seq1,$seq2, { MATCH=>$match, DISCARD_A=>$discard, DISCARD_B=>$add }, $keyfunc ); + push @$retseq, @$block if scalar @$block; + + return wantarray ? @$retseq : $retseq ; +} + + + +# init_regex_arr_data(\@array,"TOKEN INIT") +# scans DATA file handel for line "%% TOKEN INIT" line +# then appends each line not beginning with % into array (as a quoted regex) +sub init_regex_arr_data { + my ($arr,$token)=@_; + my ($copy); + while () { + if ( m/^%%BEGIN $token\s*$/ ) { + $copy=1; } + elsif ( m/^%%END $token\s*/ ) { + last; } + chomp; + push (@$arr,qr/^$_$/) if ( $copy && !/^%/ ) ; + } + seek DATA,0,0; # rewind DATA handle to file begin +} + + +# init_regex_arr_ext(\@array,$arg) +# fills array with regular expressions. 
+# if arg is a file name, then read in list of regular expressions from that file +# (one expression per line) +# Otherwise treat arg as a comma separated list of regular expressions +sub init_regex_arr_ext { + my ($arr,$arg)=@_; + my $regex; + if ( -f $ arg ) { + open(FILE,"$arg") or die ("Couldn't open $arg: $!"); + while () { + chomp; + next if /^\s*#/ || /^\s*%/ || /^\s*$/ ; + push (@$arr,qr/^$_$/); + } + close(FILE); + } + else { + # assume it is a comma-separated list of reg-ex + foreach $regex (split(qr/(?=1) { + $reset=shift; + } + if ($reset) { + $lasttime=times(); + } + else { + $retval=times()-$lasttime; + $lasttime=$lasttime+$retval; + return($retval); + } +} + + +sub usage { + die <<"EOF"; +Usage: $0 [options] old.tex new.tex > diff.tex + +Compares two latex files and writes tex code to stdout, which has the same +format as new.tex but has all changes relative to old.tex marked up or commented. + +--type=markupstyle +-t markupstyle Add code to preamble for selected markup style + Available styles: UNDERLINE CTRADITIONAL TRADITIONAL CFONT FONTSTRIKE INVISIBLE + CHANGEBAR CCHANGEBAR CULINECHBAR CFONTCBHBAR + [ Default: UNDERLINE ] + +--subtype=markstyle +-s markstyle Add code to preamble for selected style for bracketing + commands (e.g. to mark changes in margin) + Available styles: SAFE MARGINAL DVIPSCOL + [ Default: SAFE ] + +--floattype=markstyle +-f markstyle Add code to preamble for selected style which + replace standard marking and markup commands within floats + (e.g., marginal remarks cause an error within floats + so marginal marking can be disabled thus) + Available styles: FLOATSAFE IDENTICAL + [ Default: FLOATSAFE ] + +--encoding=enc +-e enc Specify encoding of old.tex and new.tex. Typical encodings are + ascii, utf8, latin1, latin9. 
A list of available encodings can be + obtained by executing + perl -MEncode -e 'print join ("\\n",Encode->encodings( ":all" )) ;' + [Default encoding is utf8 unless the first few lines of the preamble contain + an invocation "\\usepackage[..]{inputenc} in which case the + encoding chosen by this command is asssumed. Note that ASCII (standard + latex) is a subset of utf8] + +--preamble=file +-p file Insert file at end of preamble instead of auto-generating + preamble. The preamble must define the following commands + \\DIFaddbegin,\\DIFaddend,\\DIFadd{..}, + \\DIFdelbegin,\\DIFdelend,\\DIFdel{..}, + and varieties for use within floats + \\DIFaddbeginFL,\\DIFaddendFL,\\DIFaddFL{..}, + \\DIFdelbeginFL,\\DIFdelendFL,\\DIFdelFL{..} + (If this option is set -t, -s, and -f options + are ignored.) + +--exclude-safecmd=exclude-file +--exclude-safecmd="cmd1,cmd2,..." +-A exclude-file +--replace-safecmd=replace-file +--append-safecmd=append-file +--append-safecmd="cmd1,cmd2,..." +-a append-file Exclude from, replace or append to the list of regex + matching commands which are safe to use within the + scope of a \\DIFadd or \\DIFdel command. The file must contain + one Perl-RegEx per line (Comment lines beginning with # or % are + ignored). A literal comma within the comma-separated list must be + escaped thus "\\,", Note that the RegEx needs to match the whole of + the token, i.e., /^regex\$/ is implied and that the initial + "\\" of the command is not included. The --exclude-safecmd + and --append-safecmd options can be combined with the --replace-safecmd + option and can be used repeatedly to add cumulatively to the lists. + +--exclude-textcmd=exclude-file +--exclude-textcmd="cmd1,cmd2,..." +-X exclude-file +--replace-textcmd=replace-file +--append-textcmd=append-file +--append-textcmd="cmd1,cmd2,..." +-x append-file Exclude from, replace or append to the list of regex + matching commands whose last argument is text. 
See + entry for --exclude-safecmd directly above for further details. + +--replace-context1cmd=replace-file +--append-context1cmd=append-file +--append-context1cmd="cmd1,cmd2,..." + Replace or append to the list of regex matching commands + whose last argument is text but which require a particular + context to work, e.g. \\caption will only work within a figure + or table. These commands behave like text commands, except when + they occur in a deleted section, when they are disabled, but their + argument is shown as deleted text. + +--replace-context2cmd=replace-file +--append-context2cmd=append-file +--append-context2cmd="cmd1,cmd2,..." + As corresponding commands for context1. The only difference is that + context2 commands are completely disabled in deleted sections, including + their arguments. + + +--config var1=val1,var2=val2,... +-c var1=val1,.. Set configuration variables. +-c configfile Available variables: + MINWORDSBLOCK (integer) + FLOATENV (RegEx) + PICTUREENV (RegEx) + MATHENV (RegEx) + MATHREPL (String) + MATHARRENV (RegEx) + MATHARRREPL (String) + ARRENV (RegEx) + COUNTERCMD (RegEx) + This option can be repeated. + + +--packages=pkg1,pkg2,.. + Tell latexdiff that .tex file is processed with the packages in list + loaded. This is normally not necessary if the .tex file includes the + preamble, as the preamble is automatically scanned for \\usepackage commands. + Use of the --packages option disables automatic scanning, so if for any + reason package specific parsing needs to be switched off, use --packages=none. + The following packages trigger special behaviour: + endfloat hyperref amsmath + [ Default: scan the preamble for \\usepackage commands to determine + loaded packages.] + +--show-preamble Print generated or included preamble commands to stdout. + +--show-safecmd Print list of regex matching and excluding safe commands. + +--show-textcmd Print list of regex matching and excluding commands with text argument. 
+ +--show-config Show values of configuration variables + +--show-all Show all of the above + + NB For all --show commands, no old.tex or new.tex file needs to be given, and no + differencing takes place. + +--allow-spaces Allow spaces between bracketed or braced arguments to commands + [Default requires arguments to directly follow each other without + intervening spaces] + +--flatten Replace \\input and \\include commands within body by the content + of the files in their argument. If \\includeonly is present in the + preamble, only those files are expanded into the document. However, + no recursion is done, i.e. \\input and \\include commands within + included sections are not expanded. The included files are assumed to + be located in the same directories as the old and new master files, + respectively, making it possible to organise files into old and new directories. + +--help +-h Show this help text. + +--ignore-warnings Suppress warnings about inconsistencies in length between input + and parsed strings and missing characters. + +--verbose +-V Output various status information to stderr during processing. + Default is to work silently. + +--version Show version number. +EOF +} + +=head1 NAME + +latexdiff - determine and markup differences between two latex files + +=head1 SYNOPSIS + +B [ B ] F F > F + +=head1 DESCRIPTION + +Briefly, I is a utility program to aid in the management of +revisions of latex documents. It compares two valid latex files, here +called C and C, finds significant differences +between them (i.e., ignoring the number of white spaces and position +of line breaks), and adds special commands to highlight the +differences. Where visual highlighting is not possible, e.g. for changes +in the formatting, the differences are +nevertheless marked up in the source. + +The program treats the preamble differently from the main document. 
+Differences between the preambles are found using line-based +differencing (similarly to the Unix diff command, but ignoring white +spaces). A comment, "S>>" is appended to each added line, i.e. a +line present in C but not in C. Discarded lines + are deactivated by prepending "S>>". Changed blocks are preceded by +comment lines giving information about line numbers in the original files. Where there are insignificant +differences, the resulting file C will be similar to +C. At the end of the preamble, the definitions for I markup commands are inserted. +In differencing the main body of the text, I attempts to +satisfy the following guidelines (in order of priority): + +=over 3 + +=item 1 + +If both C and C are valid LaTeX, then the resulting +C should also be valid LateX. (NB If a few plain TeX commands +are used within C or C then C is not +guaranteed to work but usually will). + +=item 2 + +Significant differences are determined on the level of +individual words. All significant differences, including differences +between comments should be clearly marked in the resulting source code +C. + +=item 3 + +If a changed passage contains text or text-producing commands, then +running C through LateX should produce output where added +and discarded passages are highlighted. + +=item 4 + +Where there are insignificant differences, e.g. in the positioning of +line breaks, C should follow the formatting of C + +=back + +For differencing the same algorithm as I is used but words +instead of lines are compared. An attempt is made to recognize +blocks which are completely changed such that they can be marked up as a unit. +Comments are differenced line by line +but the number of spaces within comments is ignored. Commands including +all their arguments are generally compared as one unit, i.e., no mark-up +is inserted into the arguments of commands. 
However, for a selected +number of commands (for example, C<\caption> and all sectioning +commands) the last argument is known to be text. This text is +split into words and differenced just as ordinary text (use options to +show and change the list of text commands, see below). As the +algorithm has no detailed knowledge of LaTeX, it assumes all pairs of +curly braces immediately following a command (i.e. a sequence of +letters beginning with a backslash) are arguments for that command. +As a restriction to condition 1 above it is thus necessary to surround +all arguments with curly braces, and to not insert +extraneous spaces. For example, write + + \section{\textem{This is an emphasized section title}} + +and not + + \section {\textem{This is an emphasized section title}} + +or + + \section\textem{This is an emphasized section title} + +even though all varieties are the same to LaTeX (but see +B<--allow-spaces> option which allows the second variety). + +For environments whose content does not conform to standard LaTeX or +where graphical markup does not make sense all markup commands can be +removed by setting the PICTUREENV configuration variable, set by +default to C and C environments; see B<--config> +option). The latter environment (C) can be used to +protect parts of the latex file where the markup results in illegal +markup. You have to surround the offending passage in both the old and +new file by C<\begin{DIFnomarkup}> and C<\end{DIFnomarkup}>. You must +define the environment in the preambles of both old and new +documents. I prefer to define it as a null-environment, + +C<\newenvironment{DIFnomarkup}{}{}> + +but the choice is yours. Any markup within the environment will be +removed, and generally everything within the environment will just be +taken from the new file. + +It is also possible to difference files which do not have a preamble. 
+ In this case, the file is processed in the main document +mode, but the definitions of the markup commands are not inserted. + +All markup commands inserted by I begin with "C<\DIF>". Added +blocks containing words, commands or comments which are in C +but not in C are marked by C<\DIFaddbegin> and C<\DIFaddend>. +Discarded blocks are marked by C<\DIFdelbegin> and C<\DIFdelend>. +Within added blocks all text is highlighted with C<\DIFadd> like this: +C<\DIFadd{Added text block}> +Selected `safe' commands can be contained in these text blocks as well +(use options to show and change the list of safe commands, see below). +All other commands as well as braces "{" and "}" are never put within +the scope of C<\DIFadd>. Added comments are marked by prepending +"S >>". + +Within deleted blocks text is highlighted with C<\DIFdel>. Deleted +comments are marked by prepending "S >>". Non-safe command +and curly braces within deleted blocks are commented out with +"S >>". + + + +=head1 OPTIONS + +=head2 Preamble + +The following options determine the visual markup style by adding the appropriate +command definitions to the preamble. See the end of this section for a description of +available styles. + +=over 4 + +=item B<--type=markupstyle> or +B<-t markupstyle> + +Add code to preamble for selected markup style. This option defines +C<\DIFadd> and C<\DIFdel> commands. +Available styles: + +C + +[ Default: C ] + +=item B<--subtype=markstyle> or +B<-s markstyle> + +Add code to preamble for selected style for bracketing +commands (e.g. to mark changes in margin). This option defines +C<\DIFaddbegin>, C<\DIFaddend>, C<\DIFdelbegin> and C<\DIFdelend> commands. +Available styles: C + +[ Default: C ] + +=item B<--floattype=markstyle> or +B<-f markstyle> + +Add code to preamble for selected style which +replace standard marking and markup commands within floats +(e.g., marginal remarks cause an error within floats +so marginal marking can be disabled thus). 
This option defines all +C<\DIF...FL> commands. +Available styles: C + +[ Default: C ] + +=item B<--encoding=enc> or +B<-e enc> + +Specify encoding of old.tex and new.tex. Typical encodings are +C, C, C, C. A list of available encodings can be +obtained by executing + +Cencodings( ":all" )) ;' > + +[Default encoding is utf8 unless the first few lines of the preamble contain +an invocation C<\usepackage[..]{inputenc}> in which case the +encoding chosen by this command is assumed. Note that ASCII (standard +latex) is a subset of utf8] + +=item B<--preamble=file> or +B<-p file> + +Insert file at end of preamble instead of generating +preamble. The preamble must define the following commands +C<\DIFaddbegin, \DIFaddend, \DIFadd{..}, +\DIFdelbegin,\DIFdelend,\DIFdel{..},> +and varieties for use within floats +C<\DIFaddbeginFL, \DIFaddendFL, \DIFaddFL{..}, +\DIFdelbeginFL, \DIFdelendFL, \DIFdelFL{..}> +(If this option is set B<-t>, B<-s>, and B<-f> options +are ignored.) + +=item B<--packages=pkg1,pkg2,..> + +Tell latexdiff that .tex file is processed with the packages in list +loaded. This is normally not necessary if the .tex file includes the +preamble, as the preamble is automatically scanned for C<\usepackage> commands. +Use of the B<--packages> option disables automatic scanning, so if for any +reason package specific parsing needs to be switched off, use B<--packages=none>. +The following packages trigger special behaviour: + +=over 8 + +=item C + +Configuration variable amsmath is set to C (Default: C) + +=item C + +Ensure that C<\begin{figure}> and C<\end{figure}> always appear by themselves on a line. + +=item C + +Change name of C<\DIFadd> and C<\DIFdel> commands to C<\DIFaddtex> and C<\DIFdeltex> and +define new C<\DIFadd> and C<\DIFdel> commands, which provide a wrapper for these commands, +using them for the text but not for the link defining command (where any markup would cause +errors).
+ +=back + +[ Default: scan the preamble for C<\\usepackage> commands to determine + loaded packages.] + + + +=item B<--show-preamble> + +Print generated or included preamble commands to stdout. + +=back + +=head2 Configuration + +=over 4 + +=item B<--exclude-safecmd=exclude-file> or +B<-A exclude-file> or B<--exclude-safecmd="cmd1,cmd2,..."> + +=item B<--replace-safecmd=replace-file> + +=item B<--append-safecmd=append-file> or +B<-a append-file> or B<--append-safecmd="cmd1,cmd2,..."> + +Exclude from, replace or append to the list of regular expressions (RegEx) +matching commands which are safe to use within the +scope of a C<\DIFadd> or C<\DIFdel> command. The file must contain +one Perl-RegEx per line (Comment lines beginning with # or % are +ignored). Note that the RegEx needs to match the whole of +the token, i.e., /^regex$/ is implied and that the initial +"\" of the command is not included. +The B<--exclude-safecmd> and B<--append-safecmd> options can be combined with the -B<--replace-safecmd> +option and can be used repeatedly to add cumulatively to the lists. + B<--exclude-safecmd> +and B<--append-safecmd> can also take a comma separated list as input. If a +comma for one of the regex is required, escape it thus "\,". In most cases it +will be necessary to protect the comma-separated list from the shell by putting +it in quotation marks. + +=item B<--exclude-textcmd=exclude-file> or +B<-X exclude-file> or B<--exclude-textcmd="cmd1,cmd2,..."> + +=item B<--replace-textcmd=replace-file> + +=item B<--append-textcmd=append-file> or +B<-x append-file> or B<--append-textcmd="cmd1,cmd2,..."> + +Exclude from, replace or append to the list of regular expressions +matching commands whose last argument is text. See +entry for B<--exclude-safecmd> directly above for further details. 
+ + +=item B<--replace-context1cmd=replace-file> + +=item B<--append-context1cmd=append-file> or +=item B<--append-context1cmd="cmd1,cmd2,..."> + +Replace or append to the list of regex matching commands +whose last argument is text but which require a particular +context to work, e.g. \caption will only work within a figure +or table. These commands behave like text commands, except when +they occur in a deleted section, when they are disabled, but their +argument is shown as deleted text. + +=item B<--replace-context2cmd=replace-file> + +=item B<--append-context2cmd=append-file> or +=item B<--append-context2cmd="cmd1,cmd2,..."> +As corresponding commands for context1. The only difference is that +context2 commands are completely disabled in deleted sections, including +their arguments. + + + +=item B<--config var1=val1,var2=val2,...> or B<-c var1=val1,..> + +=item B<-c configfile> + +Set configuration variables. The option can be repeated to set different +variables (as an alternative to the comma-separated list). +Available variables (see below for further explanations): + +C (integer) + +C (RegEx) + +C (RegEx) + +C (RegEx) + +C (String) + +C (RegEx) + +C (String) + +C (RegEx) + +C (RegEx) + +=item B<--show-safecmd> + +Print list of RegEx matching and excluding safe commands. + +=item B<--show-textcmd> + +Print list of RegEx matching and excluding commands with text argument. + +=item B<--show-config> + +Show values of configuration variables. + +=item B<--show-all> + +Combine all --show commands. + +NB For all --show commands, no C or C file needs to be specified, and no +differencing takes place. + + +=back + +=head2 Miscellaneous + +=over 4 + +=item B<--verbose> or B<-V> + +Output various status information to stderr during processing. +Default is to work silently. + +=item B<--ignore-warnings> + +Suppress warnings about inconsistencies in length between input and +parsed strings and missing characters.
These warning messages are +often related to non-standard latex or latex constructions with a +syntax unknown to C but the resulting difference argument +is often fully functional anyway, particularly if the non-standard +latex only occurs in parts of the text which have not changed. + +=item B<--allow-spaces> + +Allow spaces between bracketed or braced arguments to commands. Note +that this option might have undesirable side effects (unrelated scope +might get lumped with preceding commands) so should only be used if the +default produces erroneous results. (Default requires arguments to +directly follow each other without intervening spaces). + +=item B<--flatten> + +Replace C<\input> and C<\include> commands within body by the content +of the files in their argument. If C<\includeonly> is present in the +preamble, only those files are expanded into the document. However, +no recursion is done, i.e. C<\input> and C<\include> commands within +included sections are not expanded. The included files are assumed to + be located in the same directories as the old and new master files, +respectively, making it possible to organise files into old and new directories. + +Use of this option is not recommended, +primarily because the processing time for large documents is prohibitive, and +the resulting difference document no longer reflects the structure of the +input documents. + +=item B<--help> or +B<-h> + +Show help text + +=item B<--version> + +Show version number + +=back + +=head2 Predefined styles + +=head2 Major types + +The major type determines the markup of plain text and some selected latex commands outside floats by defining the markup commands C<\DIFadd{...}> and C<\DIFdel{...}> . + +=over 10 + +=item C + +Added text is wavy-underlined and blue, discarded text is struck out and red +(Requires color and ulem packages).
Overstriking does not work in displayed math equations such that deleted parts of equation are underlined, not struck out (this is a shortcoming inherent to the ulem package). + +=item C + +Added text is blue and set in sans-serif, and a red footnote is created for each discarded +piece of text. (Requires color package) + +=item C + +Like C but without the use of color. + +=item C + +Added text is blue and set in sans-serif, and discarded text is red and very small size. + +=item C + +Added text is set in sans-serif, discarded text small and struck out + +=item C + +Added text is blue, and discarded text is red. Additionally, the changed text is marked with a bar in the margin (Requires color and changebar packages). + +=item C + +Like C but with additional changebars (Requires color and changebar packages). + +=item C + +Like C but with additional changebars (Requires color, ulem and changebar packages). + +=item C + +No mark up of text, but mark margins with changebars (Requires changebar package). + +=item C + +No visible markup (but generic markup commands will still be inserted). + +=back + +=head2 Subtypes + +The subtype defines the commands that are inserted at the begin and end of added or discarded blocks, irrespectively of whether these blocks contain text or commands (Defined commands: C<\DIFaddbegin, \DIFaddend, \DIFdelbegin, \DIFdelend>) + +=over 10 + +=item C + +No additional markup (Recommended choice) + +=item C + +Mark beginning and end of changed blocks with symbols in the margin nearby (using +the standard C<\marginpar> command - note that this sometimes moves somewhat +from the intended position). + +=item C + +An alternative way of marking added passages in blue, and deleted ones in red. Note +that C only works with the dvips converter, e.g. not pdflatex. +(it is recommended to use instead the main types to effect colored markup, +although in some cases coloring with dvipscol can be more complete).
+ +=back + +=head2 Float Types + +Some of the markup used in the main text might cause problems when used within +floats (e.g. figures or tables). For this reason alternative versions of all +markup commands are used within floats. The float type defines these alternative commands. + +=over 10 + +=item C + +Use identical markup for text as in the main body, but set all commands marking the begin and end of changed blocks to null-commands. You have to choose this float type if your subtype is C as C<\marginpar> does not work properly within floats. + +=item C + +Mark additions the same way as in the main text. Deleted environments are marked by angular brackets \[ and \] and the deleted text is set in scriptscript size. This float type should always be used with the C and C markup types as the \footnote command does not work properly in floating environments. + +=item C + +Make no difference between the main text and floats. + +=back + + +=head2 Configuration Variables + +=over 10 + +=item C + +Minimum number of tokens required to form an independent block. This value is +used in the algorithm to detect changes of complete blocks by merging identical text parts of less than C to the preceding added and discarded parts. + +[ Default: 3 ] + +=item C + +Environments whose name matches the regular expression in C are +considered floats. Within these environments, the I markup commands +are replaced by their FL varieties. + +[ Default: S >] + +=item C + +Within environments whose name matches the regular expression in C +all latexdiff markup is removed (in pathological cases this might lead to + inconsistent markup but this situation should be rare). + +[ Default: S >] + +=item C,C + +If both \begin and \end for a math environment (environment name matching C +or \[ and \]) +are within the same deleted block, they are replaced by \begin and \end commands for C +rather than being commented out.
+ +[ Default: C=S >, C=S >] + +=item C,C + +as C,C but for equation arrays + +[ Default: C=S >, C=S >] + +=item C + +If a match to C is found within an inline math environment within a deleted or added block, then the inlined math +is surrounded by C<\mbox{>...C<}>. This is necessary as underlining does not work within inlined array environments. + +[ Default: C=S > + +=item C + +If a command in a deleted block which is also in the textcmd list matches C then an +additional command C<\addtocounter{>FC<}{-1}>, where F is the matching command, is appended in the diff file such that the numbering in the diff file remains synchronized with the +numbering in the new file. + +[ Default: C=C<(?:footnote|part|section|subsection> ... + +C<|subsubsection|paragraph|subparagraph)> ] + +=back + +=head1 BUGS + +UTF-8 support requires a relatively new version of perl (5.8.0 is sufficient +but 5.6.2 up would work OK, too). + +Option allow-spaces not implemented entirely consistently. It breaks +the rules that number and type of white space does not matter, as +different numbers of inter-argument spaces are treated as significant. + +Please send bug reports +to I. Include the serial number of I +(from comments at the top of the source or use B<--version>). If you come across latex +files that are error-free and conform to the specifications set out +above, and whose differencing still does not result in error-free +latex, please send me those files, ideally edited to only contain the +offending passage as long as that still reproduces the problem. + +=head1 SEE ALSO + +L + +=head1 PORTABILITY + +I does not make use of external commands and thus should run +on any platform supporting Perl 5.6 or higher. If files with encodings +other than ASCII or UTF-8 are processed, Perl 5.8 or higher is required. + +The standard version of I requires installation of the Perl package +C (available from I - +I) but a stand-alone +version, I, which has this package inlined, is available, too. 
+I requires the I command to be present. + +=head1 AUTHOR + +Copyright (C) 2004-2007 Frederik Tilmann + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License Version 2 + +Contributors of fixes and additions: V. Kuhlmann, J. Paisley, N. Becker, T. Doerges, K. Huebner +Thanks to everyone who send in bug reports. + +=cut + +__END__ +%%BEGIN SAFE COMMANDS +% Regex matching commands which can safely be in the +% argument of a \DIFadd or \DIFdel command (leave out the \) +arabic +dashbox +emph +fbox +framebox +hspace +math.* +makebox +mbox +pageref +ref +symbol +raisebox +rule +text.* +shortstack +usebox +dag +ddag +copyright +pounds +S +P +oe +OE +ae +AE +aa +AA +o +O +l +L +frac +ss +sqrt +ldots +cdots +vdots +ddots +alpha +beta +gamma +delta +epsilon +varepsilon +zeta +eta +theta +vartheta +iota +kappa +lambda +mu +nu +xi +pi +varpi +rho +varrho +sigma +varsigma +tau +upsilon +phi +varphi +chi +psi +omega +Gamma +Delta +Theta +Lambda +Xi +Pi +Sigma +Upsilon +Phi +Psi +Omega +ps +mp +times +div +ast +star +circ +bullet +cdot +cap +cup +uplus +sqcap +vee +wedge +setminus +wr +diamond +(?:big)?triangle.* +lhd +rhd +unlhd +unrhd +oplus +ominus +otimes +oslash +odot +bigcirc +d?dagger +amalg +leq +prec +preceq +ll +(?:sq)?su[bp]set(?:eq)? +in +vdash +geq +succ(?:eq)? +gg +ni +dashv +equiv +sim(?:eq)? +asymp +approx +cong +neq +doteq +propto +models +perp +mid +parallel +bowtie +Join +smile +frown +.*arrow +(?:long)?mapsto +.*harpoon.* +leadsto +aleph +hbar +imath +jmath +ell +wp +Re +Im +mho +prime +emptyset +nabla +surd +top +bot +angle +forall +exists +neg +flat +natural +sharp +backslash +partial +infty +Box +Diamond +triangle +clubsuit +diamondsuit +heartsuit +spadesuit +sum +prod +coprod +int +oint +big(?:sq)?c[au]p +bigvee +bigwedge +bigodot +bigotimes +bigoplus +biguplus +(?:arc)?(?:cos|sin|tan|cot)h? 
+csc +arg +deg +det +dim +exp +gcd +hom +inf +ker +lg +lim +liminf +limsup +ln +log +max +min +Pr +sec +sup +(SUPER|SUB)SCRIPTNB +(SUPER|SUB)SCRIPT +%%END SAFE COMMANDS + +%%BEGIN TEXT COMMANDS +% Regex matching commands with a text argument (leave out the \) +addcontents.* +cc +closing +chapter +dashbox +emph +encl +fbox +framebox +footnote +footnotetext +framebox +part +(sub){0,2}section\*? +(sub)?paragraph\*? +makebox +mbox +opening +parbox +raisebox +savebox +sbox +shortstack +signature +text.* +value +underline +sqrt +(SUPER|SUB)SCRIPT +%%END TEXT COMMANDS + +%%BEGIN CONTEXT1 COMMANDS +% Regex matching commands with a text argument (leave out the \), which will fail out of context, but who's arguemtn should be printed as plain text +caption +%%END CONTEXT1 COMMANDS + +%%BEGIN CONTEXT2 COMMANDS +% Regex matching commands with a text argument (leave out the \), which will fail out of context, but who's arguemtn should be printed as plain text +title +author +%%END CONTEXT2 COMMANDS + + +%% TYPES (Commands for highlighting changed blocks) + +%DIF UNDERLINE PREAMBLE +\RequirePackage[normalem]{ulem} +\RequirePackage{color}\definecolor{RED}{rgb}{1,0,0}\definecolor{BLUE}{rgb}{0,0,1} +\providecommand{\DIFadd}[1]{{\protect\color{blue}\uwave{#1}}} +\providecommand{\DIFdel}[1]{{\protect\color{red}\sout{#1}}} +%DIF END UNDERLINE PREAMBLE + +%DIF CTRADITIONAL PREAMBLE +\RequirePackage{color}\definecolor{RED}{rgb}{1,0,0}\definecolor{BLUE}{rgb}{0,0,1} +\RequirePackage[stable]{footmisc} +\providecommand{\DIFadd}[1]{{\protect\color{blue} \sf #1}} +\providecommand{\DIFdel}[1]{{\protect\color{red} [..\footnote{removed: #1} ]}} +%DIF END CTRADITIONAL PREAMBLE + +%DIF TRADITIONAL PREAMBLE +\RequirePackage[stable]{footmisc} +\providecommand{\DIFadd}[1]{{\sf #1}} +\providecommand{\DIFdel}[1]{{[..\footnote{removed: #1} ]}} +%DIF END TRADITIONAL PREAMBLE + +%DIF CFONT PREAMBLE +\RequirePackage{color}\definecolor{RED}{rgb}{1,0,0}\definecolor{BLUE}{rgb}{0,0,1} 
+\providecommand{\DIFadd}[1]{{\protect\color{blue} \sf #1}} +\providecommand{\DIFdel}[1]{{\protect\color{red} \scriptsize #1}} +%DIF END CFONT PREAMBLE + +%DIF FONTSTRIKE PREAMBLE +\RequirePackage[normalem]{ulem} +\providecommand{\DIFadd}[1]{{\sf #1}} +\providecommand{\DIFdel}[1]{{\footnotesize \sout{#1}}} +%DIF END FONTSTRIKE PREAMBLE + +%DIF CCHANGEBAR PREAMBLE +\RequirePackage[dvips]{changebar} +\RequirePackage{color}\definecolor{RED}{rgb}{1,0,0}\definecolor{BLUE}{rgb}{0,0,1} +\providecommand{\DIFadd}[1]{\protect\cbstart{\protect\color{blue}#1}\protect\cbend} +\providecommand{\DIFdel}[1]{\protect\cbdelete{\protect\color{red}#1}\protect\cbdelete} +%DIF END CCHANGEBAR PREAMBLE + +%DIF CFONTCHBAR PREAMBLE +\RequirePackage[dvips]{changebar} +\RequirePackage{color}\definecolor{RED}{rgb}{1,0,0}\definecolor{BLUE}{rgb}{0,0,1} +\providecommand{\DIFadd}[1]{\protect\cbstart{\protect\color{blue}\sf #1}\protect\cbend} +\providecommand{\DIFdel}[1]{\protect\cbdelete{\protect\color{red}\scriptsize #1}\protect\cbdelete} +%DIF END CFONTCHBAR PREAMBLE + +%DIF CULINECHBAR PREAMBLE +\RequirePackage[normalem]{ulem} +\RequirePackage[dvips]{changebar} +\RequirePackage{color} +\providecommand{\DIFadd}[1]{\protect\cbstart{\protect\color{blue}\uwave{#1}}\protect\cbend} +\providecommand{\DIFdel}[1]{\protect\cbdelete{\protect\color{red}\sout{#1}}\protect\cbdelete} +%DIF END CULINECHBAR PREAMBLE + +%DIF CHANGEBAR PREAMBLE +\RequirePackage[dvips]{changebar} +\providecommand{\DIFadd}[1]{\protect\cbstart{#1}\protect\cbend} +\providecommand{\DIFdel}[1]{\protect\cbdelete} +%DIF END CHANGEBAR PREAMBLE + +%DIF INVISIBLE PREAMBLE +\providecommand{\DIFadd}[1]{#1} +\providecommand{\DIFdel}[1]{} +%DIF END INVISIBLE PREAMBLE + + +%% SUBTYPES (Markers for beginning and end of changed blocks) + +%DIF SAFE PREAMBLE +\providecommand{\DIFaddbegin}{} +\providecommand{\DIFaddend}{} +\providecommand{\DIFdelbegin}{} +\providecommand{\DIFdelend}{} +%DIF END SAFE PREAMBLE + +%DIF MARGIN PREAMBLE 
+\providecommand{\DIFaddbegin}{\protect\marginpar{a[}} +\providecommand{\DIFaddend}{\protect\marginpar{]}} +\providecommand{\DIFdelbegin}{\protect\marginpar{d[}} +\providecommand{\DIFdelend}{\protect\marginpar{]}} +%DIF END BRACKET PREAMBLE + +%DIF DVIPSCOL PREAMBLE +%Note: only works with dvips converter +\RequirePackage{color} +\RequirePackage{dvipscol} +\providecommand{\DIFaddbegin}{\protect\nogroupcolor{blue}} +\providecommand{\DIFaddend}{\protect\nogroupcolor{black}} +\providecommand{\DIFdelbegin}{\protect\nogroupcolor{red}} +\providecommand{\DIFdelend}{\protect\nogroupcolor{black}} +%DIF END DVIPSCOL PREAMBLE + + +%% FLOAT TYPES + +%DIF FLOATSAFE PREAMBLE +\providecommand{\DIFaddFL}[1]{\DIFadd{#1}} +\providecommand{\DIFdelFL}[1]{\DIFdel{#1}} +\providecommand{\DIFaddbeginFL}{} +\providecommand{\DIFaddendFL}{} +\providecommand{\DIFdelbeginFL}{} +\providecommand{\DIFdelendFL}{} +%DIF END FLOATSAFE PREAMBLE + +%DIF IDENTICAL PREAMBLE +\providecommand{\DIFaddFL}[1]{\DIFadd{#1}} +\providecommand{\DIFdelFL}[1]{\DIFdel{#1}} +\providecommand{\DIFaddbeginFL}{\DIFaddbegin} +\providecommand{\DIFaddendFL}{\DIFaddend} +\providecommand{\DIFdelbeginFL}{\DIFdelbegin} +\providecommand{\DIFdelendFL}{\DIFdelend} +%DIF END IDENTICAL PREAMBLE + +%DIF TRADITIONALSAFE PREAMBLE +% procidecommand color to make this work for TRADITIONAL and CTRADITIONAL +\providecommand{\color}[1]{} +\providecommand{\DIFaddFL}[1]{\DIFadd{#1}} +\providecommand{\DIFdel}[1]{{\protect\color{red}[..{\scriptsize {removed: #1}} ]}} +\providecommand{\DIFaddbeginFL}{} +\providecommand{\DIFaddendFL}{} +\providecommand{\DIFdelbeginFL}{} +\providecommand{\DIFdelendFL}{} +%DIF END FLOATSAFE PREAMBLE + +%% SPECIAL PACKAGE PREAMBLE COMMANDS + +% Standard \DIFadd and \DIFdel are redefined as \DIFaddtex and \DIFdeltex +% when hyperref package is included. 
+%DIF HYPERREF PREAMBLE +\providecommand{\DIFadd}[1]{\texorpdfstring{\DIFaddtex{#1}}{#1}} +\providecommand{\DIFdel}[1]{\texorpdfstring{\DIFdeltex{#1}}{}} +%DIF END HYPERREF PACKAGE diff --git a/latexize_invoice b/latexize_invoice index 3ab22f0..e40789a 100755 --- a/latexize_invoice +++ b/latexize_invoice @@ -10,14 +10,36 @@ # \end{itemize*} # }{50.00}{0.94} +use warnings; +use strict; + +use Params::Validate qw(validate_with :types); + my $time = undef; my $date = undef; + +my $hourly_rate = 50.00; + +my $total = 0; + my @events; +print<<'EOF'; +\setlength\LTleft{0pt plus 1fill minus 1fill}% +\let\LTright\LTleft +\begin{longtable}{|p{9cm}|r|r|r|r|}% +% \caption*{} +\hline + Description & Item Cost & Quantity & Cost & Total \\ +EOF + while (<>){ if (/^\s*\*\s*([^-]+)\s+-.+\[([^\]]+)\]\s*\[[^\]]+\]/) { if (defined $time) { - print format_events($date,$time,@events); + print format_events(date => $date, + time => $time, + total => \$total, + events => \@events); } @events = (); $date = $1; @@ -30,15 +52,43 @@ while (<>){ } } if (defined $time) { - print format_events($date,$time,@events); + print format_events(date => $date, + time => $time, + total => \$total, + events => \@events); } sub format_events{ - my ($date,$time,@events) = @_; - $date =~ s/\s+\d+\:\d+\:\d+\s+[A-Z]{0,3}\s*//; - my $output = ' \Fee{'.$date."\n". - ' \begin{itemize*}'."\n"; - $output .= join('',map {" \\item $_\n"} @events); - $output .= ' \end{itemize*}}{50.00}{'.$time.'}'."\n"; + my %param = validate_with(params => \@_, + spec => {time => {type => SCALAR, + }, + date => {type => SCALAR, + }, + total => {type => SCALARREF, + }, + events => {type => ARRAYREF, + }, + }, + ); + ${$param{total}} += $param{time} * $hourly_rate; + + $param{date} =~ s/\s+\d+\:\d+\:\d+\s+[A-Z]{0,3}\s*//; + my $output = '\hline'."\n".' \mbox{'.$param{date}."}\n\n". + ' \begin{itemize*}'."\n"; + $output .= join('',map {" \\item $_\n"} @{$param{events}}); + $output .= ' \end{itemize*} & \$'.sprintf('%.2f',$hourly_rate).' 
& '.sprintf('%.2f',$param{time}). + ' & \$'.sprintf('%.2f',$param{time}*$hourly_rate).' & \$'. + sprintf('%.2f',${$param{total}}) . + ' \\'."\n"; return $output; } +print<<'EOF'; +\hline\hline +\multicolumn{4}{|r|}{\textbf{Total}} & \$% +EOF +print sprintf('%.2f',$total)."%\n"; +print<<'EOF'; +\\ +\hline +\end{longtable} +EOF diff --git a/txt2xls b/txt2xls new file mode 100755 index 0000000..97dc8de --- /dev/null +++ b/txt2xls @@ -0,0 +1,146 @@ +#! /usr/bin/perl +# txt2xls turns text files into excel workbooks, and is released +# under the terms of the GPL version 2, or any later version, at your +# option. See the file README and COPYING for more information. +# Copyright 2008 by Don Armstrong . +# $Id: perl_script 1153 2008-04-08 00:04:20Z don $ + + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +txt2xls - Turns a (set of) text file(s) into an excel workbook + +=head1 SYNOPSIS + + [options] + + Options: + --tsv, -t tab separated value mode (Default) + --ssv, -s space separated value mode + --csv, -c comma separated value mode + --r-mode, -r R mode (Default) + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief useage information. + +=item B<--man, -m> + +Display this manual. 
+ +=back + +=head1 EXAMPLES + + +=cut + + +use vars qw($DEBUG); + +use Text::CSV; +use Spreadsheet::WriteExcel; + +my %options = (debug => 0, + help => 0, + man => 0, + rmode => 1, + remove_name => [], + ); + +GetOptions(\%options, + 'tsv|t', + 'ssv|s', + 'csv|c', + 'rmode|r-mode|r!', + 'remove_name|remove-name=s@', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; +if (0 == grep {exists $options{$_}} qw(tsv ssv csv)) { + $options{tsv} = 1 +} +if (1 < grep {exists $options{$_}} qw(tsv ssv csv)) { + push @USAGE_ERRORS,"You can only pass one of --tsv, --ssv, or --csv"; +} + +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + +my @columns = ('A'..'Z','AA'..'ZZ'); + +if (not @ARGV) { + # we'll use this as a special indicator to read stdin + push @ARGV,undef; +} + +my $sep_char = "\t"; +if ($options{csv}) { + $sep_char = ','; +} +elsif ($options{ssv}) { + $sep_char = ' '; +} + +my $csv = Text::CSV->new({sep_char=>$sep_char}); +my $wb = Spreadsheet::WriteExcel->new(\*STDOUT); +for my $file (@ARGV) { + my $fh; + if (not defined $file) { + $fh = \*STDIN; + $file = "STDIN"; + } + else { + $fh = IO::File->new($file,'r') or + die "Unable to open $file for reading: $!"; + } + my $ws_name = $file; + foreach my $remove (@{$options{remove_name}}) { + $ws_name =~ s{\Q$remove\E}{}g; + } + $ws_name =~ s{\.[^\.]+$}{}g; + $ws_name =~ s/_+/ /g; + $ws_name =~ s{[\]:*?\/\] ]+}{ }g; + $ws_name =~ s{(?:^\s+|\s+$)}{}g; + $ws_name =~ s{^(.{0,31}).*$}{$1}; + my $ws = $wb->add_worksheet($ws_name) or + die "Unable to add worksheet to workbook"; + my $row = 1; + while (<$fh>) { + chomp; + # parse the line + my @row; + die "Unable to parse line $. of $file" unless $csv->parse($_); + if ($row==1) { + push @row,''; + } + push @row,$csv->fields(); + for my $i (0..$#row) { + $ws->write($columns[$i].$row,$row[$i]); + } + $row++; + } +} + +__END__ -- 2.39.2