(file) Return to Diff.pm CVS log (file) (dir) Up to [RizwankCVS] / geekymedia_web / twiki / lib / Algorithm

  1 rizwank 1.1 package Algorithm::Diff;
  2             use strict;
  3             use vars qw($VERSION @EXPORT_OK @ISA @EXPORT);
  4             use integer;    # see below in _replaceNextLargerWith() for mod to make
  5                             # if you don't use this
  6             require Exporter;
  7             @ISA       = qw(Exporter);
  8             @EXPORT    = qw();
  9             @EXPORT_OK = qw(LCS diff traverse_sequences traverse_balanced sdiff);
 10             $VERSION = sprintf('%d.%02d', (q$Revision: 1.2 $ =~ /\d+/g));
 11             
 12             # McIlroy-Hunt diff algorithm
 13             # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
 14             # by Ned Konz, perl@bike-nomad.com
 15             
 16             =head1 NAME
 17             
 18             Algorithm::Diff - Compute `intelligent' differences between two files / lists
 19             
 20             =head1 SYNOPSIS
 21             
 22 rizwank 1.1   use Algorithm::Diff qw(diff sdiff LCS traverse_sequences
 23                                      traverse_balanced);
 24             
 25               @lcs    = LCS( \@seq1, \@seq2 );
 26             
 27               @lcs    = LCS( \@seq1, \@seq2, $key_generation_function );
 28             
 29               $lcsref = LCS( \@seq1, \@seq2 );
 30             
 31               $lcsref = LCS( \@seq1, \@seq2, $key_generation_function );
 32             
 33               @diffs = diff( \@seq1, \@seq2 );
 34             
 35               @diffs = diff( \@seq1, \@seq2, $key_generation_function );
 36             
 37               @sdiffs = sdiff( \@seq1, \@seq2 );
 38             
 39               @sdiffs = sdiff( \@seq1, \@seq2, $key_generation_function );
 40               
 41               traverse_sequences( \@seq1, \@seq2,
 42                                  { MATCH => $callback,
 43 rizwank 1.1                        DISCARD_A => $callback,
 44                                    DISCARD_B => $callback,
 45                                  } );
 46             
 47               traverse_sequences( \@seq1, \@seq2,
 48                                  { MATCH => $callback,
 49                                    DISCARD_A => $callback,
 50                                    DISCARD_B => $callback,
 51                                  },
 52                                  $key_generation_function );
 53             
 54               traverse_balanced( \@seq1, \@seq2,
 55                                  { MATCH => $callback,
 56                                    DISCARD_A => $callback,
 57                                    DISCARD_B => $callback,
 58                                    CHANGE    => $callback,
 59                                  } );
 60             
 61             =head1 INTRODUCTION
 62             
 63             (by Mark-Jason Dominus)
 64 rizwank 1.1 
 65             I once read an article written by the authors of C<diff>; they said
 66             that they hard worked very hard on the algorithm until they found the
 67             right one.
 68             
 69             I think what they ended up using (and I hope someone will correct me,
 70             because I am not very confident about this) was the `longest common
 71             subsequence' method.  in the LCS problem, you have two sequences of
 72             items:
 73             
 74                     a b c d f g h j q z
 75             
 76                     a b c d e f g i j k r x y z
 77             
 78             and you want to find the longest sequence of items that is present in
 79             both original sequences in the same order.  That is, you want to find
 80             a new sequence I<S> which can be obtained from the first sequence by
 81             deleting some items, and from the secend sequence by deleting other
 82             items.  You also want I<S> to be as long as possible.  In this case
 83             I<S> is
 84             
 85 rizwank 1.1         a b c d f g j z
 86             
 87             From there it's only a small step to get diff-like output:
 88             
 89                     e   h i   k   q r x y 
 90                     +   - +   +   - + + +
 91             
 92             This module solves the LCS problem.  It also includes a canned
 93             function to generate C<diff>-like output.
 94             
 95             It might seem from the example above that the LCS of two sequences is
 96             always pretty obvious, but that's not always the case, especially when
 97             the two sequences have many repeated elements.  For example, consider
 98             
 99             	a x b y c z p d q
100             	a b c a x b y c z
101             
102             A naive approach might start by matching up the C<a> and C<b> that
103             appear at the beginning of each sequence, like this:
104             
105             	a x b y c         z p d q
106 rizwank 1.1 	a   b   c a b y c z
107             
108             This finds the common subsequence C<a b c z>.  But actually, the LCS
109             is C<a x b y c z>:
110             
111             	      a x b y c z p d q
112             	a b c a x b y c z
113             
114             =head1 USAGE
115             
116             This module provides three exportable functions, which we'll deal with in
117             ascending order of difficulty: C<LCS>, 
118             C<diff>, C<sdiff>, C<traverse_sequences>, and C<traverse_balanced>.
119             
120             =head2 C<LCS>
121             
122             Given references to two lists of items, LCS returns an array containing their
123             longest common subsequence.  In scalar context, it returns a reference to
124             such a list.
125             
126               @lcs    = LCS( \@seq1, \@seq2 );
127 rizwank 1.1   $lcsref = LCS( \@seq1, \@seq2 );
128             
129             C<LCS> may be passed an optional third parameter; this is a CODE
130             reference to a key generation function.  See L</KEY GENERATION
131             FUNCTIONS>.
132             
133               @lcs    = LCS( \@seq1, \@seq2, $keyGen );
134               $lcsref = LCS( \@seq1, \@seq2, $keyGen );
135             
136             Additional parameters, if any, will be passed to the key generation
137             routine.
138             
139             =head2 C<diff>
140             
141               @diffs     = diff( \@seq1, \@seq2 );
142               $diffs_ref = diff( \@seq1, \@seq2 );
143             
144             C<diff> computes the smallest set of additions and deletions necessary
145             to turn the first sequence into the second, and returns a description
146             of these changes.  The description is a list of I<hunks>; each hunk
147             represents a contiguous section of items which should be added,
148 rizwank 1.1 deleted, or replaced.  The return value of C<diff> is a list of
149             hunks, or, in scalar context, a reference to such a list.
150             
151             Here is an example:  The diff of the following two sequences:
152             
153               a b c e h j l m n p
154               b c d e f j k l m r s t
155             
156             Result:
157             
158              [ 
159                [ [ '-', 0, 'a' ] ],       
160             
161                [ [ '+', 2, 'd' ] ],
162             
163                [ [ '-', 4, 'h' ] , 
164                  [ '+', 4, 'f' ] ],
165             
166                [ [ '+', 6, 'k' ] ],
167             
168                [ [ '-', 8, 'n' ], 
169 rizwank 1.1      [ '-', 9, 'p' ], 
170                  [ '+', 9, 'r' ], 
171                  [ '+', 10, 's' ], 
172                  [ '+', 11, 't' ],
173                ]
174              ]
175             
176             There are five hunks here.  The first hunk says that the C<a> at
177             position 0 of the first sequence should be deleted (C<->).  The second
178             hunk says that the C<d> at position 2 of the second sequence should
179             be inserted (C<+>).  The third hunk says that the C<h> at position 4
180             of the first sequence should be removed and replaced with the C<f>
181             from position 4 of the second sequence.  The other two hunks similarly. 
182             
183             C<diff> may be passed an optional third parameter; this is a CODE
184             reference to a key generation function.  See L</KEY GENERATION
185             FUNCTIONS>.
186             
187             Additional parameters, if any, will be passed to the key generation
188             routine.
189             
190 rizwank 1.1 =head2 C<sdiff>
191             
192               @sdiffs     = sdiff( \@seq1, \@seq2 );
193               $sdiffs_ref = sdiff( \@seq1, \@seq2 );
194             
195             C<sdiff> computes all necessary components to show two sequences
196             and their minimized differences side by side, just like the 
197             Unix-utility I<sdiff> does:
198             
199                 same             same
200                 before     |     after
201                 old        <     -
202                 -          >     new
203             
204             It returns a list of array refs, each pointing to an array of 
205             display instructions. In scalar context it returns a reference
206             to such a list.
207             
208             Display instructions consist of three elements: A modifier indicator
209             (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified, 
210             C<c>: Element changed) and the value of the old and new elements, to
211 rizwank 1.1 be displayed side by side.
212             
213             An C<sdiff> of the following two sequences:
214             
215               a b c e h j l m n p
216               b c d e f j k l m r s t
217             
218             results in
219             
220             [ [ '-', 'a', ''  ],
221               [ 'u', 'b', 'b' ],
222               [ 'u', 'c', 'c' ],
223               [ '+', '',  'd' ],
224               [ 'u', 'e', 'e' ],
225               [ 'c', 'h', 'f' ],
226               [ 'u', 'j', 'j' ],
227               [ '+', '',  'k' ],
228               [ 'u', 'l', 'l' ],
229               [ 'u', 'm', 'm' ],
230               [ 'c', 'n', 'r' ],
231               [ 'c', 'p', 's' ],
232 rizwank 1.1   [ '+', '', 't' ] ]
233             
234             C<sdiff> may be passed an optional third parameter; this is a CODE
235             reference to a key generation function.  See L</KEY GENERATION
236             FUNCTIONS>.
237             
238             Additional parameters, if any, will be passed to the key generation
239             routine.
240             
241             =head2 C<traverse_sequences>
242             
243             C<traverse_sequences> is the most general facility provided by this
244             module; C<diff> and C<LCS> are implemented as calls to it.
245             
246             Imagine that there are two arrows.  Arrow A points to an element of sequence A,
247             and arrow B points to an element of the sequence B.  Initially, the arrows
248             point to the first elements of the respective sequences.  C<traverse_sequences>
249             will advance the arrows through the sequences one element at a time, calling an
250             appropriate user-specified callback function before each advance.  It
251             willadvance the arrows in such a way that if there are equal elements C<$A[$i]>
252             and C<$B[$j]> which are equal and which are part of the LCS, there will be
253 rizwank 1.1 some moment during the execution of C<traverse_sequences> when arrow A is
254             pointing to C<$A[$i]> and arrow B is pointing to C<$B[$j]>.  When this happens,
255             C<traverse_sequences> will call the C<MATCH> callback function and then it will
256             advance both arrows. 
257             
258             Otherwise, one of the arrows is pointing to an element of its sequence that is
259             not part of the LCS.  C<traverse_sequences> will advance that arrow and will
260             call the C<DISCARD_A> or the C<DISCARD_B> callback, depending on which arrow it
261             advanced.  If both arrows point to elements that are not part of the LCS, then
262             C<traverse_sequences> will advance one of them and call the appropriate
263             callback, but it is not specified which it will call.
264             
265             The arguments to C<traverse_sequences> are the two sequences to traverse, and a
266             hash which specifies the callback functions, like this:
267             
268               traverse_sequences( \@seq1, \@seq2,
269                                  { MATCH => $callback_1,
270                                    DISCARD_A => $callback_2,
271                                    DISCARD_B => $callback_3,
272                                  } );
273             
274 rizwank 1.1 Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least the
275             indices of the two arrows as their arguments.  They are not expected to return
276             any values.  If a callback is omitted from the table, it is not called.
277             
278             Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
279             corresponding index in A or B.
280             
281             If arrow A reaches the end of its sequence, before arrow B does,
282             C<traverse_sequences> will call the C<A_FINISHED> callback when it advances
283             arrow B, if there is such a function; if not it will call C<DISCARD_B> instead.
284             Similarly if arrow B finishes first.  C<traverse_sequences> returns when both
285             arrows are at the ends of their respective sequences.  It returns true on
286             success and false on failure.  At present there is no way to fail.
287             
288             C<traverse_sequences> may be passed an optional fourth parameter; this is a
289             CODE reference to a key generation function.  See L</KEY GENERATION FUNCTIONS>.
290             
291             Additional parameters, if any, will be passed to the key generation function.
292             
293             =head2 C<traverse_balanced>
294             
295 rizwank 1.1 C<traverse_balanced> is an alternative to C<traverse_sequences>. It
296             uses a different algorithm to iterate through the entries in the
297             computed LCS. Instead of sticking to one side and showing element changes
298             as insertions and deletions only, it will jump back and forth between
299             the two sequences and report I<changes> occurring as deletions on one
300             side followed immediatly by an insertion on the other side.
301             
302             In addition to the 
303             C<DISCARD_A>,
304             C<DISCARD_B>, and
305             C<MATCH>
306             callbacks supported by C<traverse_sequences>, C<traverse_balanced> supports
307             a C<CHANGE> callback indicating that one element got C<replaced> by another:
308             
309               traverse_sequences( \@seq1, \@seq2,
310                                  { MATCH => $callback_1,
311                                    DISCARD_A => $callback_2,
312                                    DISCARD_B => $callback_3,
313                                    CHANGE    => $callback_4,
314                                  } );
315             
316 rizwank 1.1 If no C<CHANGE> callback is specified, C<traverse_balanced>
317             will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
318             therefore resulting in a similar behaviour as C<traverse_sequences>
319             with different order of events.
320             
321             C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
322             noticable only while processing huge amounts of data.
323             
324             The C<sdiff> function of this module 
325             is implemented as call to C<traverse_balanced>.
326             
327             =head1 KEY GENERATION FUNCTIONS
328             
329             C<diff>, C<LCS>, and C<traverse_sequences> accept an optional last parameter.
330             This is a CODE reference to a key generating (hashing) function that should
331             return a string that uniquely identifies a given element.  It should be the
332             case that if two elements are to be considered equal, their keys should be the
333             same (and the other way around).  If no key generation function is provided,
334             the key will be the element as a string.
335             
336             By default, comparisons will use "eq" and elements will be turned into keys
337 rizwank 1.1 using the default stringizing operator '""'.
338             
339             Where this is important is when you're comparing something other than strings.
340             If it is the case that you have multiple different objects that should be
341             considered to be equal, you should supply a key generation function. Otherwise,
342             you have to make sure that your arrays contain unique references.
343             
344             For instance, consider this example:
345             
346               package Person;
347             
348               sub new
349               {
350                 my $package = shift;
351                 return bless { name => '', ssn => '', @_ }, $package;
352               }
353             
354               sub clone
355               {
356                 my $old = shift;
357                 my $new = bless { %$old }, ref($old);
358 rizwank 1.1   }
359             
360               sub hash
361               {
362                 return shift()->{'ssn'};
363               }
364             
365               my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
366               my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
367               my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
368               my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
369               my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
370             
371             If you did this:
372             
373               my $array1 = [ $person1, $person2, $person4 ];
374               my $array2 = [ $person1, $person3, $person4, $person5 ];
375               Algorithm::Diff::diff( $array1, $array2 );
376             
377             everything would work out OK (each of the objects would be converted
378             into a string like "Person=HASH(0x82425b0)" for comparison).
379 rizwank 1.1 
380             But if you did this:
381             
382               my $array1 = [ $person1, $person2, $person4 ];
383               my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
384               Algorithm::Diff::diff( $array1, $array2 );
385             
386             $person4 and $person4->clone() (which have the same name and SSN)
387             would be seen as different objects. If you wanted them to be considered
388             equivalent, you would have to pass in a key generation function:
389             
390               my $array1 = [ $person1, $person2, $person4 ];
391               my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
392               Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
393             
394             This would use the 'ssn' field in each Person as a comparison key, and
395             so would consider $person4 and $person4->clone() as equal.
396             
397             You may also pass additional parameters to the key generation function
398             if you wish.
399             
400 rizwank 1.1 =head1 AUTHOR
401             
402             This version by Ned Konz, perl@bike-nomad.com
403             
404             =head1 LICENSE
405             
406             Copyright (c) 2000-2002 Ned Konz.  All rights reserved.
407             This program is free software;
408             you can redistribute it and/or modify it under the same terms
409             as Perl itself.
410             
411             =head1 CREDITS
412             
413             Versions through 0.59 (and much of this documentation) were written by:
414             
415             Mark-Jason Dominus, mjd-perl-diff@plover.com
416             
417             This version borrows the documentation and names of the routines
418             from Mark-Jason's, but has all new code in Diff.pm.
419             
420             This code was adapted from the Smalltalk code of
421 rizwank 1.1 Mario Wolczko <mario@wolczko.com>, which is available at
422             ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
423             
424             C<sdiff> and C<traverse_balanced> were written by Mike Schilli
425             <m@perlmeister.com>.
426             
427             The algorithm is that described in 
428             I<A Fast Algorithm for Computing Longest Common Subsequences>,
429             CACM, vol.20, no.5, pp.350-353, May 1977, with a few
430             minor improvements to improve the speed.
431             
432             =cut
433             
434             # Create a hash that maps each element of $aCollection to the set of positions
435             # it occupies in $aCollection, restricted to the elements within the range of
436             # indexes specified by $start and $end.
437             # The fourth parameter is a subroutine reference that will be called to
438             # generate a string to use as a key.
439             # Additional parameters, if any, will be passed to this subroutine.
440             #
441             # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
442 rizwank 1.1 
443             sub _withPositionsOfInInterval
444             {
445             	my $aCollection = shift;    # array ref
446             	my $start       = shift;
447             	my $end         = shift;
448             	my $keyGen      = shift;
449             	my %d;
450             	my $index;
451             	for ( $index = $start ; $index <= $end ; $index++ )
452             	{
453             		my $element = $aCollection->[$index];
454             		my $key = &$keyGen( $element, @_ );
455             		if ( exists( $d{$key} ) )
456             		{
457             			unshift ( @{ $d{$key} }, $index );
458             		}
459             		else
460             		{
461             			$d{$key} = [$index];
462             		}
463 rizwank 1.1 	}
464             	return wantarray ? %d : \%d;
465             }
466             
467             # Find the place at which aValue would normally be inserted into the array. If
468             # that place is already occupied by aValue, do nothing, and return undef. If
469             # the place does not exist (i.e., it is off the end of the array), add it to
470             # the end, otherwise replace the element at that point with aValue.
471             # It is assumed that the array's values are numeric.
472             # This is where the bulk (75%) of the time is spent in this module, so try to
473             # make it fast!
474             
475             sub _replaceNextLargerWith
476             {
477             	my ( $array, $aValue, $high ) = @_;
478             	$high ||= $#$array;
479             
480             	# off the end?
481             	if ( $high == -1 || $aValue > $array->[-1] )
482             	{
483             		push ( @$array, $aValue );
484 rizwank 1.1 		return $high + 1;
485             	}
486             
487             	# binary search for insertion point...
488             	my $low = 0;
489             	my $index;
490             	my $found;
491             	while ( $low <= $high )
492             	{
493             		$index = ( $high + $low ) / 2;
494             
495             		#		$index = int(( $high + $low ) / 2);		# without 'use integer'
496             		$found = $array->[$index];
497             
498             		if ( $aValue == $found )
499             		{
500             			return undef;
501             		}
502             		elsif ( $aValue > $found )
503             		{
504             			$low = $index + 1;
505 rizwank 1.1 		}
506             		else
507             		{
508             			$high = $index - 1;
509             		}
510             	}
511             
512             	# now insertion point is in $low.
513             	$array->[$low] = $aValue;    # overwrite next larger
514             	return $low;
515             }
516             
517             # This method computes the longest common subsequence in $a and $b.
518             
519             # Result is array or ref, whose contents is such that
520             # 	$a->[ $i ] == $b->[ $result[ $i ] ]
521             # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
522             
523             # An additional argument may be passed; this is a hash or key generating
524             # function that should return a string that uniquely identifies the given
525             # element.  It should be the case that if the key is the same, the elements
526 rizwank 1.1 # will compare the same. If this parameter is undef or missing, the key
527             # will be the element as a string.
528             
529             # By default, comparisons will use "eq" and elements will be turned into keys
530             # using the default stringizing operator '""'.
531             
532             # Additional parameters, if any, will be passed to the key generation routine.
533             
534             sub _longestCommonSubsequence
535             {
536             	my $a      = shift;    # array ref
537             	my $b      = shift;    # array ref
538             	my $keyGen = shift;    # code ref
539             	my $compare;           # code ref
540             
541             	# set up code refs
542             	# Note that these are optimized.
543             	if ( !defined($keyGen) )    # optimize for strings
544             	{
545             		$keyGen = sub { $_[0] };
546             		$compare = sub { my ( $a, $b ) = @_; $a eq $b };
547 rizwank 1.1 	}
548             	else
549             	{
550             		$compare = sub {
551             			my $a = shift;
552             			my $b = shift;
553             			&$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
554             		};
555             	}
556             
557             	my ( $aStart, $aFinish, $bStart, $bFinish, $matchVector ) =
558             	  ( 0, $#$a, 0, $#$b, [] );
559             
560             	# First we prune off any common elements at the beginning
561             	while ( $aStart <= $aFinish
562             		and $bStart <= $bFinish
563             		and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
564             	{
565             		$matchVector->[ $aStart++ ] = $bStart++;
566             	}
567             
568 rizwank 1.1 	# now the end
569             	while ( $aStart <= $aFinish
570             		and $bStart <= $bFinish
571             		and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
572             	{
573             		$matchVector->[ $aFinish-- ] = $bFinish--;
574             	}
575             
576             	# Now compute the equivalence classes of positions of elements
577             	my $bMatches =
578             	  _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
579             	my $thresh = [];
580             	my $links  = [];
581             
582             	my ( $i, $ai, $j, $k );
583             	for ( $i = $aStart ; $i <= $aFinish ; $i++ )
584             	{
585             		$ai = &$keyGen( $a->[$i], @_ );
586             		if ( exists( $bMatches->{$ai} ) )
587             		{
588             			$k = 0;
589 rizwank 1.1 			for $j ( @{ $bMatches->{$ai} } )
590             			{
591             
592             				# optimization: most of the time this will be true
593             				if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
594             				{
595             					$thresh->[$k] = $j;
596             				}
597             				else
598             				{
599             					$k = _replaceNextLargerWith( $thresh, $j, $k );
600             				}
601             
602             				# oddly, it's faster to always test this (CPU cache?).
603             				if ( defined($k) )
604             				{
605             					$links->[$k] =
606             					  [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
607             				}
608             			}
609             		}
610 rizwank 1.1 	}
611             
612             	if (@$thresh)
613             	{
614             		for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
615             		{
616             			$matchVector->[ $link->[1] ] = $link->[2];
617             		}
618             	}
619             
620             	return wantarray ? @$matchVector : $matchVector;
621             }
622             
623             sub traverse_sequences
624             {
625             	my $a                 = shift;                                  # array ref
626             	my $b                 = shift;                                  # array ref
627             	my $callbacks         = shift || {};
628             	my $keyGen            = shift;
629             	my $matchCallback     = $callbacks->{'MATCH'} || sub { };
630             	my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
631 rizwank 1.1 	my $finishedACallback = $callbacks->{'A_FINISHED'};
632             	my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
633             	my $finishedBCallback = $callbacks->{'B_FINISHED'};
634             	my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );
635             
636             	# Process all the lines in @$matchVector
637             	my $lastA = $#$a;
638             	my $lastB = $#$b;
639             	my $bi    = 0;
640             	my $ai;
641             
642             	for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
643             	{
644             		my $bLine = $matchVector->[$ai];
645             		if ( defined($bLine) )    # matched
646             		{
647             			&$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
648             			&$matchCallback( $ai,    $bi++, @_ );
649             		}
650             		else
651             		{
652 rizwank 1.1 			&$discardACallback( $ai, $bi, @_ );
653             		}
654             	}
655             
656             	# The last entry (if any) processed was a match.
657             	# $ai and $bi point just past the last matching lines in their sequences.
658             
659             	while ( $ai <= $lastA or $bi <= $lastB )
660             	{
661             
662             		# last A?
663             		if ( $ai == $lastA + 1 and $bi <= $lastB )
664             		{
665             			if ( defined($finishedACallback) )
666             			{
667             				&$finishedACallback( $lastA, @_ );
668             				$finishedACallback = undef;
669             			}
670             			else
671             			{
672             				&$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
673 rizwank 1.1 			}
674             		}
675             
676             		# last B?
677             		if ( $bi == $lastB + 1 and $ai <= $lastA )
678             		{
679             			if ( defined($finishedBCallback) )
680             			{
681             				&$finishedBCallback( $lastB, @_ );
682             				$finishedBCallback = undef;
683             			}
684             			else
685             			{
686             				&$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
687             			}
688             		}
689             
690             		&$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
691             		&$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
692             	}
693             
694 rizwank 1.1 	return 1;
695             }
696             
697             sub traverse_balanced
698             {
699             	my $a                 = shift;                                  # array ref
700             	my $b                 = shift;                                  # array ref
701             	my $callbacks         = shift || {};
702             	my $keyGen            = shift;
703             	my $matchCallback     = $callbacks->{'MATCH'} || sub { };
704             	my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
705             	my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
706             	my $changeCallback    = $callbacks->{'CHANGE'};
707             	my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );
708             
709             	# Process all the lines in match vector
710             	my $lastA = $#$a;
711             	my $lastB = $#$b;
712             	my $bi    = 0;
713             	my $ai    = 0;
714             	my $ma    = -1;
715 rizwank 1.1 	my $mb;
716             
717             	while (1)
718             	{
719             
720             		# Find next match indices $ma and $mb
721             		do { $ma++ } while ( $ma <= $#$matchVector && !defined $matchVector->[$ma] );
722             
723             		last if $ma > $#$matchVector;    # end of matchVector?
724             		$mb = $matchVector->[$ma];
725             
726             		# Proceed with discard a/b or change events until
727             		# next match
728             		while ( $ai < $ma || $bi < $mb )
729             		{
730             
731             			if ( $ai < $ma && $bi < $mb )
732             			{
733             
734             				# Change
735             				if ( defined $changeCallback )
736 rizwank 1.1 				{
737             					&$changeCallback( $ai++, $bi++, @_ );
738             				}
739             				else
740             				{
741             					&$discardACallback( $ai++, $bi, @_ );
742             					&$discardBCallback( $ai, $bi++, @_ );
743             				}
744             			}
745             			elsif ( $ai < $ma )
746             			{
747             				&$discardACallback( $ai++, $bi, @_ );
748             			}
749             			else
750             			{
751             
752             				# $bi < $mb
753             				&$discardBCallback( $ai, $bi++, @_ );
754             			}
755             		}
756             
757 rizwank 1.1 		# Match
758             		&$matchCallback( $ai++, $bi++, @_ );
759             	}
760             
761             	while ( $ai <= $lastA || $bi <= $lastB )
762             	{
763             		if ( $ai <= $lastA && $bi <= $lastB )
764             		{
765             
766             			# Change
767             			if ( defined $changeCallback )
768             			{
769             				&$changeCallback( $ai++, $bi++, @_ );
770             			}
771             			else
772             			{
773             				&$discardACallback( $ai++, $bi, @_ );
774             				&$discardBCallback( $ai, $bi++, @_ );
775             			}
776             		}
777             		elsif ( $ai <= $lastA )
778 rizwank 1.1 		{
779             			&$discardACallback( $ai++, $bi, @_ );
780             		}
781             		else
782             		{
783             
784             			# $bi <= $lastB
785             			&$discardBCallback( $ai, $bi++, @_ );
786             		}
787             	}
788             
789             	return 1;
790             }
791             
792             sub LCS
793             {
794             	my $a = shift;                                           # array ref
795             	my $matchVector = _longestCommonSubsequence( $a, @_ );
796             	my @retval;
797             	my $i;
798             	for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
799 rizwank 1.1 	{
800             		if ( defined( $matchVector->[$i] ) )
801             		{
802             			push ( @retval, $a->[$i] );
803             		}
804             	}
805             	return wantarray ? @retval : \@retval;
806             }
807             
808             sub diff
809             {
810             	my $a      = shift;    # array ref
811             	my $b      = shift;    # array ref
812             	my $retval = [];
813             	my $hunk   = [];
814             	my $discard = sub { push ( @$hunk, [ '-', $_[0], $a->[ $_[0] ] ] ) };
815             	my $add = sub { push ( @$hunk, [ '+', $_[1], $b->[ $_[1] ] ] ) };
816             	my $match = sub { push ( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] };
817             	traverse_sequences( $a, $b,
818             		{ MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
819             	&$match();
820 rizwank 1.1 	return wantarray ? @$retval : $retval;
821             }
822             
823             sub sdiff
824             {
825             	my $a      = shift;    # array ref
826             	my $b      = shift;    # array ref
827             	my $retval = [];
828             	my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
829             	my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
830             	my $change = sub {
831             		push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
832             	};
833             	my $match = sub {
834             		push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
835             	};
836             	traverse_balanced(
837             		$a,
838             		$b,
839             		{
840             			MATCH     => $match,
841 rizwank 1.1 			DISCARD_A => $discard,
842             			DISCARD_B => $add,
843             			CHANGE    => $change,
844             		},
845             		@_
846             	);
847             	return wantarray ? @$retval : $retval;
848             }
849             
850             1;

Rizwan Kassim
Powered by
ViewCVS 0.9.2