#!/usr/bin/perl -w

use strict;

use WeakRef;

use LWP::Simple;   # Testing/timing bit swiped from Peter Scott
use Time::HiRes qw(time);

my $data = shift || get("http://www.dfan.org/constitution.txt") ||
get("http://www.ibiblio.org/gutenberg/etext97/shndy10.txt");

if( $data eq '-' ) {
  undef $/;
  $data = <>;
}

my $then = time;
print "Result on ",(length $data)," character string:\n------\n";
print repeated_substring($data);
print "\n------\n";
print time - $then, "\n";


# Find the longest repeated (non-overlapping) substring of $_[0].
# Implemented using suffix trees to achieve O(n) runtime.
sub repeated_substring
{
  &stDriver(@_);
}

#&stDriver(@ARGV);

1;

### var Txt='',    // the input text string
###     root=null, // root of the suffix tree
###     infinity;  // quite a big number
###     nForks=0;  // number of branching nodes in the suffix tree

my $DEBUG=1;
my $txt='';
my $len=0;
my $root;
my $infinity;
my $nForks=0;

sub subscript {
  my ($i) = @_;
  return '' if $i >= $len;
  return substr($txt, $i, 1);
}


### function pair(a, b) { this.fst = a; this.snd = b; } // i.e. <fst, snd>
### // NB. most of Ukkonen's functions return a pair (s,w)
### 
### function isEmptyStrng() { return this.right < this.left; }
### 
### function Strng(left, right) // represents Txt[left..right]
###  { this.left=left; this.right=right;
###    this.isEmpty = isEmptyStrng;
### }//constructor

#sub Strng {
  #my ($left, $right) = @_;
  #return {left => $left, right => $right}; # XXX left out the isEmpty part
#}

### function addTrnstn(left, right, s) // this['a'] >---(left..right)---> s
### // add a transition to `this' state
###  { this[Txt.charAt(left)] = new pair(new Strng(left,right), s);
###    this.isLeaf = false;
###  }
### 
### function State() // i.e. a new leaf node in the suffix tree
###  { this.addTransition = addTrnstn; this.isLeaf = true; }

#sub State {
#  my %h = ();
#  my $isLeaf = 1;
#  return {hash => \%h,
#          addTransition => sub {
#            my ($left, $right, $s) = @_;
#            #$h{substr($txt, $left, 1)} = [&Strng($left, $right), $s];
#            $h{&subscript($left)} = [[$left, $right], $s];
#            $isLeaf = 0;
#          }};
#}

sub State {
  my %h = (hash => {});
  return \%h;
}
sub addTransition {
  my ($h, $left, $right, $s) = @_;
  &erase($h->{hash}->{&subscript($left)});
  $h->{hash}->{&subscript($left)} = [[$left, $right], $s];
}

# stupid garbage collector
sub erase {
  my ($ref) = @_;
  return unless defined $ref;
  for (values %{$ref->[2]}) {
    my (undef, $s) = @{$_};
    delete($s->{sLink});
    &erase($_);
  }
}

### function show(T, str, arc) // print the suffix tree
###  { if(T == null)//should not happen!
###     { document.theForm.opt.value += str+arc+'NULL !!!\n';
###       return;//should not be here
###     }
###  //else
###    if(T.isLeaf)
###     { document.theForm.opt.value += str+arc+'leaf\n';
###       return;//llewop d
###     }
###  //else
###       nForks++;
###       var attr, iter = 0;
###       var spaces = '';  var i;
###       for(i=1; i < arc.length; i++) spaces += ' ';
###       spaces += '|';   // |spaces|==|arc|
###       var str2 = str+spaces;//nosilla l
### 
###       for(attr in T)//each subtree
###       if(attr.length == 1)//a char attribute selects a suffix-tree branch
###        { iter++;//ics pmoc hsanom
###          var wAndT2 = T[attr];
###          var w = wAndT2.fst, T2 = wAndT2.snd;
###          var myStr = '('+(w.left+1)+':'+Txt.substring(w.left, w.right+1)+')|';
###          if(iter > 1)//must get to at least 2 if suffix tree is correct.
###             document.theForm.opt.value += (iter==2 ? str+arc : str2)+'\n';
###          show(T2, str2, myStr)
###        }
###  }//show


### // from E.Ukkonen, On-Line Construction of Suffix Trees ***                   C
### //                 Algorithmica 14(3) pp 249-260, 1995  ***                   o
### // (U. Helsinki, Finland)                                                     m
###                                                                            // p
###                                                                            // .
### function upDate(s, k, i)                                                   // S
### // (s, (k, i-1)) is the canonical reference pair for the active point         c
###  { var oldr = root;                                                        // i
###    var endAndr = test_and_split(s, k, i-1, Txt.charAt(i))                  // M
###    var endPoint = endAndr.fst; var r = endAndr.snd                         // o
###                                                                            // n
###    while (!endPoint)                                                       // n
###     { r.addTransition(i, infinity, new State());                           // a
###       if (oldr != root) oldr.sLink = r;                                    // s
###                                                                            // h
###       oldr = r;
###       var sAndk = canonize(s.sLink, k, i-1)                                // A
###       s = sAndk.fst; k = sAndk.snd;                                        // l
###       endAndr = test_and_split(s, k, i-1, Txt.charAt(i))                   // l
###       endPoint = endAndr.fst; r = endAndr.snd;                             // i
###     }                                                                      // s
###                                                                            // o
###    if(oldr != root) oldr.sLink = s;                                        // n
### 
###    return new pair(s, k);
###  }//upDate

sub upDate {
  my ($s, $k, $i) = @_;
  my $oldr = $root;

  while( 1 ) {
    my ($endPoint, $r) = &test_and_split($s, $k, $i-1, &subscript($i));
    last if $endPoint;

    &addTransition($r, $i, $infinity, &State());
    weaken($oldr->{sLink} = $r) unless $oldr == $root;

    $oldr = $r;
    ($s, $k) = &canonize($s->{sLink}, $k, $i-1);
  }

  weaken($oldr->{sLink} = $s) unless $oldr == $root;

  return ($s, $k);
}


### function test_and_split(s, k, p, t)                                        // P
###  { if(k<=p)                                                                // o
###     { // find the t_k transition g'(s,(k',p'))=s' from s                   // w
###       // k1 is k'  p1 is p'                                                // e
###       var w1ands1 = s[Txt.charAt(k)];          // s --(w1)--> s1              l
###       var s1 = w1ands1.snd;                                                // l
###       var k1 = w1ands1.fst.left;  var p1 = w1ands1.fst.right;
### 
###       if (t == Txt.charAt(k1 + p - k + 1))
###          return new pair(true, s);
###       else
###        { var r = new State()
###          s.addTransition(k1, k1+p-k,   r);     // s ----> r ----> s1
###          r.addTransition(    k1+p-k+1, p1, s1);
###          return new pair(false, r)
###        }
###     }
###    else // k > p;  ? is there a t-transition from s ?
###       return new pair(s[t] != null, s);
###  }//test_and_split

sub test_and_split {
  my ($s, $k, $p, $t) = @_;

  return (exists($s->{hash}->{$t}), $s) if $k > $p;

  my ($w1, $s1) = @{$s->{hash}->{&subscript($k)}};
  my ($k1, $p1) = @{$w1};

  return (1, $s) if $t eq &subscript($k1 + $p - $k + 1);

  my $r = &State();
  &addTransition($s, $k1, $k1 + $p - $k, $r);
  &addTransition($r, $k1 + $p - $k + 1, $p1, $s1);
  return (0, $r);
}

### function canonize(s, k, p)
###  { if(p < k) return new pair (s, k);
### 
###    // find the t_k transition g'(s,(k',p'))=s' from s
###    // k1 is k',  p1 is p'
###    var w1ands1 = s[Txt.charAt(k)];                            // s --(w1)--> s1
###    var s1 = w1ands1.snd;
###    var k1 = w1ands1.fst.left;  var p1 = w1ands1.fst.right;
### 
###    while(p1-k1 <= p-k)                               // s --(w1)--> s1 ---> ...
###     { k += p1 - k1 + 1;                    // remove |w1| chars from front of w
###       s = s1;
###       if(k <= p)
###        { w1ands1 = s[Txt.charAt(k)];                          // s --(w1)--> s1
###          s1 = w1ands1.snd;
###          k1 = w1ands1.fst.left; p1 = w1ands1.fst.right;
###        }
###      }
###     return new pair(s, k);
###   }//canonize

sub canonize {
  my ($s, $k, $p) = @_;
  return ($s, $k) if $p < $k;

  my ($w1, $s1) = @{$s->{hash}->{&subscript($k)}};
  my ($k1, $p1) = @{$w1};

  while( $p1 - $k1 <= $p - $k ) {
    $k += $p1 - $k1 + 1;
    $s = $s1;
    if( $k <= $p ) {
      ($w1, $s1) = @{$s->{hash}->{&subscript($k)}};
      ($k1, $p1) = @{$w1};
    }
  }
  return ($s, $k);
}

### function algorithm2()
###  { var s, k, i;
###    var bt;
### 
###    root = new State();
###    bt = new State();                                      // bt (bottom or _|_)
### 
###    // Want to create transitions for all possible chars
###    // from bt to root
###    for (i=0; i<Txt.length; i++)
###       bt.addTransition(i,i, root);
### 
###    root.sLink = bt;
###    s=root; k=0;  // NB. Start k=0, unlike Ukkonen paper our strings are 0 based
### 
###    for(i=0; i < Txt.length; i++)
###     { var sAndk = upDate(s, k, i);   // (s,k) < - upDate(...)
###       s = sAndk.fst; k = sAndk.snd;
###       sAndk = canonize(s, k, i);     // (s,k) < - canonize(...)
###       s = sAndk.fst; k = sAndk.snd;
###     }
###  }//algorithm2

sub algorithm2 {
  my $bt = &State();
  $root = &State();

  print "Starting\n";
  for (0 .. $len) {
    &addTransition($bt, $_, $_, $root);
  }
  print "Continuing\n";

  weaken($root->{sLink} = $bt);
  my ($s, $k) = ($root, 0);

  for (0 .. $len) {
    ($s, $k) = &upDate($s, $k, $_);
    ($s, $k) = &canonize($s, $k, $_);
  }
}

### // ----------------------------------------------------------------------------
### // The following sorts the suffixes so you can see what the STree should be.
### 
### function insertionSort(Txt) // NB. O(n**2) unacceptable for long input strings!
###  { if(Txt.length > 11) return;//too long for sorting
### 
###    var A = new Array(), len = Txt.length;
###    var i;
###    for(i = 0; i < Txt.length; i++) A[i] = i;
###    for(i = 0; i < Txt.length-1; i++)
###     { var j,  small = i;
###       for(j = i+1; j < Txt.length; j++)
###          if(Txt.substring(A[j],len) < Txt.substring(A[small], len))
###             small = j;
###       var temp = A[i]; A[i] = A[small]; A[small] = temp;
###     }
###    for(i = 0; i < len; i++)
###     { var numbr = '    '+(1+A[i])+': ';
###       numbr = numbr.substring(numbr.length-4, numbr.length);
###       document.theForm.opt.value += numbr+Txt.substring(A[i], len)+'\n';
###     }
###    document.theForm.opt.value += '\n';
###  }//insertionSort
### 
### // ----------------------------------------------------------------------------

### function stDriver()
###  { Txt = document.theForm.inp.value;
###    infinity = Txt.length + 1000; // well it's quite big :-)
###    nForks = 0;
### 
###    document.theForm.opt.value = '';
###    insertionSort(Txt);
### 
###    algorithm2();  // ------------ the business
###    show(root, '', 'tree:|');
###    document.theForm.opt.value += nForks + ' branching nodes';
###  }//stDriver

sub stDriver {
  $txt = shift;
  $len = length $txt;
  $infinity = $len + 1000;
  $nForks = 0;

  $DEBUG=1;

  $len++; # this is adding the "$" to the end of the string

  &algorithm2();
  #print "$nForks branching nodes\n";
  print "Root has ",scalar(keys %{$root->{hash}}), "\n";
  my ($count, @string) = &deepest($root,'');
  print "Got repeated substring \"", join('|', @string), "\" of length ",
    $count, "\n";
  return join('', @string);
}

sub deepest {
  my ($s, $space) = @_;
  my %h = %{$s->{hash}};
  my ($count, @result) = (0, (''));
  print "${space}Node has ",scalar(values %h)," branches\n" if $DEBUG;
  if( values %h > 1 ) {
    for (values %h) {
      my ($boundsref, $s) = @{$_};
      my $diff = $boundsref->[1] - $boundsref->[0] + 1;
      my $string = substr($txt, $boundsref->[0], $diff);
      print "${space}Substring '$string' is from $boundsref->[0] to $boundsref->[1]\n" if $DEBUG;
      my ($c, @x) = &deepest($s, $space.' ');
      if( $boundsref->[1] < $infinity && $count < $c + $diff ) {
        $count = $c + $diff;
        @result = ($string, @x);
      }
    }
  }
  return ($count, @result);
}



__END__

=head1 A

The algorithm ...

    http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Tree/Suffix/


=head1 Specification (from MJD's Expert Quiz 14):

    Manufacture a function, 'repeated_substring'.  The argument to the
    function is a string.  The function should return the longest
    substring of the argument that appears at least twice.  For example,
    given "123 1234", the function should return "123".

    If there is more than one repeated substring of maximum length, the
    function may return any of the substrings.  If there is no repeated
    substring, the function should return either undef or an empty string.

    The repetitions may *not* overlap.  For example, "ana" is *not*
    a repeated substring of "banana".  The longest repeated substrings in
    "banana" are "an" and "na".

    The function should be efficient, even for very large input strings.


=cut

