#!/usr/bin/env perl

# This program was written by Robin Lee Powell, primarily on 16 Apr
# 2010.
# 
# It is hereby placed in the public domain, to the fullest extent
# permitted by low.

use strict;
use warnings;

use Data::Dumper;

$| = 1;

unless (@ARGV >= 1) {
	print STDERR qq(Usage:
$0 string [filename]
				
	If no filename is given, supply XML on STDIN.
);
	exit;
}

my $string = shift(@ARGV);

use XML::LibXML;

my $parser = XML::LibXML->new();

my $dom;

if( @ARGV )
{
  my $file = shift(@ARGV);
  $dom = $parser->load_html( location => $file,
    recover => 2, suppress_warnings => 1, suppress_errors => 1);

  if( ! $dom )
  {
    print "Whoops.  Parsing failed; retrying with errors turned on, and exiting, in case this will be of some help.  You might want to try running your HTML through tidy ( http://www.w3.org/People/Raggett/tidy/ )\n";
    $dom = $parser->load_html( location => $file );
    exit 1;
  }
} else {
  $dom = $parser->load_html( IO => \*STDIN,
    recover => 2, suppress_warnings => 1, suppress_errors => 1);

  if( ! $dom )
  {
    print "Whoops.  Parsing failed; can't retry with STDIN.  Please put the data in a file and run against the file name.\n";
    exit 1;
  }
}

#$Data::Dumper::Maxdepth=5;
#print "dom: ".Dumper(\$dom->documentElement()->toString())."\n";
#exit;

my $matching_nodes = find_matching_nodes( $dom->documentElement() );

## print "nodes: ".Dumper($matching_nodes)."\n";

foreach my $matching_node (@{$matching_nodes}) {
  print "***********************************************************************\n";
  print "Matching Block:\n        ".$matching_node->toString()."\n";
  print "=======================================================================\n";

  my $parent_num=0;
  my $total_xpath="";
  my $current = $matching_node;
  print "Walking the tree upwards, generating xpaths, starting with the lement itself which we'll call 'parent 0' because it makes the code easier.\n";
  while( $current )
  {
    last if( $current->nodeName() eq "html" );
    
    print "    parent $parent_num looks a bit like this: <".$current->nodeName();
    foreach my $attr ($current->attributes) {
      if( $attr->nodeName() =~ m{onclick}i )
      {
        print " ".$attr->nodeName()."='...'";
      } else {
        print " ".$attr->nodeName()."='".(substr $attr->getValue(), 0, 50)."'";
      }
    }
    print ">\n";
    my $unrooted_xpath = find_unrooted_xpath($current);
    print "            Final unrooted XPath for parent $parent_num: $unrooted_xpath\n";
    my $parental_xpath = find_parental_xpath($current, $current->parentNode);
    print "            Final parent-rooted XPath for parent $parent_num: $parental_xpath\n";
    # Use the $total_xpath from the last run
    if( ! $total_xpath )
    {
      print "        Final dual-rooted XPath for parent $parent_num: $unrooted_xpath\n";
    } else {
      print "        Final dual-rooted XPath for parent $parent_num: $unrooted_xpath$total_xpath\n";
    }

    # Construct the next $total_xpath
    if( ! $total_xpath )
    {
      $total_xpath = "/$parental_xpath";
    } else {
      $total_xpath = "/$parental_xpath$total_xpath";
    }

    $parent_num++;
    $current = $current->parentNode;
  }
}

sub find_matching_nodes {
  my $input_node=shift;

  my $found_one=0;

  my @nodes;

  foreach my $node ($input_node->childNodes())
  {
    next if( ! $node->isa( 'XML::LibXML::Element' ) );
    if( $node->toString() =~ m{$string} )
    {
      $found_one = 1;

      push @nodes, @{ find_matching_nodes( $node ) };
    }
  }

  # If none of our children had it, return ourself
  if( ! $found_one || ! $input_node->childNodes() ) {
    return [ $input_node ];
  } else {
    return \@nodes;
  }
}

sub find_unrooted_xpath {
  my $xpc = XML::LibXML::XPathContext->new( $dom->documentElement() );

  my $node = shift;

  print "        Finding an unrooted xpath for this node\n";
  # **************************************************
  # Find an absolute XPath for it that isn't rooted.
  # **************************************************

  # First, find a basic attribute based XPath that'll catch it at
  # all
  my $xpath="//".$node->nodeName();
  my $attr_xpath="";
  if( $node->attributes ) {
    my $attr_num = 1;

    foreach my $attr ($node->attributes) {
      next if( $attr->nodeName() =~ m{onclick}i );

      if( $attr_num > 1 )
      {
        $attr_xpath .= " and ";
      }
      $attr_xpath .= "@".$attr->nodeName()."='".$attr->getValue()."'";

      $attr_num++;
    }
  }

  if( $attr_xpath )
  {
    $xpath .= "[$attr_xpath]";
  }

  # Now tack a [number] on the end if we have to
  print "            An initial xpath: $xpath\n";

  my @results = $xpc->findnodes($xpath);

  print "            Which returns ".@results." results\n";

  ## print "results: ".Dumper(\@results)."\n";

  if( @results == 1 )
  {
    # Oh, it already works; fantastic.
    return $xpath;
  }

  # Still here; must be more than one
  for( my $i = 0; $i < @results; $i++) {
    my $new_xpath="(".$xpath.")[".($i+1)."]";

    my @new_results = $xpc->findnodes($new_xpath);

    if( $new_results[0]->isSameNode( $node ) )
    {
      ## print "        A better xpath; one result only: $new_xpath\n";
      return $new_xpath;
    }
  }
}

sub find_parental_xpath {
  my $node = shift;
  my $parent = shift;
  my $xpc = XML::LibXML::XPathContext->new( $parent );
  ## print "node: ".Dumper($node)."\n";
  ## print "parent: ".Dumper($parent)."\n";

  print "        Finding an xpath for this node based on the parent\n";
  # **************************************************
  # Find an absolute XPath for it rooted at the parent
  # **************************************************

  # First, find a basic attribute based XPath that'll catch it at
  # all
  my $xpath=$node->nodeName();
  my $attr_xpath="";

  # Now tack a [number] on the end if we have to
  print "            An initial xpath: $xpath\n";

  my @results = $xpc->findnodes($xpath);

  print "            Which returns ".@results." results\n";

  ## print "results: ".Dumper(\@results)."\n";

  if( @results == 1 )
  {
    # Oh, it already works; fantastic.
    return $xpath;
  }

  # Still here; must be more than one
  for( my $i = 0; $i < @results; $i++) {
    my $new_xpath=$xpath."[".($i+1)."]";

    ## print "new xpath: $new_xpath\n";

    my @new_results = $xpc->findnodes($new_xpath);

    if( $new_results[0]->isSameNode( $node ) )
    {
      ## print "        A better xpath; one result only: $new_xpath\n";
      return $new_xpath;
    }
  }
}