#!/usr/bin/env perl # This program was written by Robin Lee Powell, primarily on 16 Apr # 2010. # # It is hereby placed in the public domain, to the fullest extent # permitted by low. use strict; use warnings; use Data::Dumper; $| = 1; unless (@ARGV >= 1) { print STDERR qq(Usage: $0 string [filename] If no filename is given, supply XML on STDIN. ); exit; } my $string = shift(@ARGV); use XML::LibXML; my $parser = XML::LibXML->new(); my $dom; if( @ARGV ) { my $file = shift(@ARGV); $dom = $parser->load_html( location => $file, recover => 2, suppress_warnings => 1, suppress_errors => 1); if( ! $dom ) { print "Whoops. Parsing failed; retrying with errors turned on, and exiting, in case this will be of some help. You might want to try running your HTML through tidy ( http://www.w3.org/People/Raggett/tidy/ )\n"; $dom = $parser->load_html( location => $file ); exit 1; } } else { $dom = $parser->load_html( IO => \*STDIN, recover => 2, suppress_warnings => 1, suppress_errors => 1); if( ! $dom ) { print "Whoops. Parsing failed; can't retry with STDIN. Please put the data in a file and run against the file name.\n"; exit 1; } } #$Data::Dumper::Maxdepth=5; #print "dom: ".Dumper(\$dom->documentElement()->toString())."\n"; #exit; my $matching_nodes = find_matching_nodes( $dom->documentElement() ); ## print "nodes: ".Dumper($matching_nodes)."\n"; foreach my $matching_node (@{$matching_nodes}) { print "***********************************************************************\n"; print "Matching Block:\n ".$matching_node->toString()."\n"; print "=======================================================================\n"; my $parent_num=0; my $total_xpath=""; my $current = $matching_node; print "Walking the tree upwards, generating xpaths, starting with the lement itself which we'll call 'parent 0' because it makes the code easier.\n"; while( $current ) { last if( $current->nodeName() eq "html" ); print " parent $parent_num looks a bit like this: <".$current->nodeName(); foreach my $attr ($current->attributes) { if( $attr->nodeName() =~ m{onclick}i ) { print " ".$attr->nodeName()."='...'"; } else { print " ".$attr->nodeName()."='".(substr $attr->getValue(), 0, 50)."'"; } } print ">\n"; my $unrooted_xpath = find_unrooted_xpath($current); print " Final unrooted XPath for parent $parent_num: $unrooted_xpath\n"; my $parental_xpath = find_parental_xpath($current, $current->parentNode); print " Final parent-rooted XPath for parent $parent_num: $parental_xpath\n"; # Use the $total_xpath from the last run if( ! $total_xpath ) { print " Final dual-rooted XPath for parent $parent_num: $unrooted_xpath\n"; } else { print " Final dual-rooted XPath for parent $parent_num: $unrooted_xpath$total_xpath\n"; } # Construct the next $total_xpath if( ! $total_xpath ) { $total_xpath = "/$parental_xpath"; } else { $total_xpath = "/$parental_xpath$total_xpath"; } $parent_num++; $current = $current->parentNode; } } sub find_matching_nodes { my $input_node=shift; my $found_one=0; my @nodes; foreach my $node ($input_node->childNodes()) { next if( ! $node->isa( 'XML::LibXML::Element' ) ); if( $node->toString() =~ m{$string} ) { $found_one = 1; push @nodes, @{ find_matching_nodes( $node ) }; } } # If none of our children had it, return ourself if( ! $found_one || ! $input_node->childNodes() ) { return [ $input_node ]; } else { return \@nodes; } } sub find_unrooted_xpath { my $xpc = XML::LibXML::XPathContext->new( $dom->documentElement() ); my $node = shift; print " Finding an unrooted xpath for this node\n"; # ************************************************** # Find an absolute XPath for it that isn't rooted. # ************************************************** # First, find a basic attribute based XPath that'll catch it at # all my $xpath="//".$node->nodeName(); my $attr_xpath=""; if( $node->attributes ) { my $attr_num = 1; foreach my $attr ($node->attributes) { next if( $attr->nodeName() =~ m{onclick}i ); if( $attr_num > 1 ) { $attr_xpath .= " and "; } $attr_xpath .= "@".$attr->nodeName()."='".$attr->getValue()."'"; $attr_num++; } } if( $attr_xpath ) { $xpath .= "[$attr_xpath]"; } # Now tack a [number] on the end if we have to print " An initial xpath: $xpath\n"; my @results = $xpc->findnodes($xpath); print " Which returns ".@results." results\n"; ## print "results: ".Dumper(\@results)."\n"; if( @results == 1 ) { # Oh, it already works; fantastic. return $xpath; } # Still here; must be more than one for( my $i = 0; $i < @results; $i++) { my $new_xpath="(".$xpath.")[".($i+1)."]"; my @new_results = $xpc->findnodes($new_xpath); if( $new_results[0]->isSameNode( $node ) ) { ## print " A better xpath; one result only: $new_xpath\n"; return $new_xpath; } } } sub find_parental_xpath { my $node = shift; my $parent = shift; my $xpc = XML::LibXML::XPathContext->new( $parent ); ## print "node: ".Dumper($node)."\n"; ## print "parent: ".Dumper($parent)."\n"; print " Finding an xpath for this node based on the parent\n"; # ************************************************** # Find an absolute XPath for it rooted at the parent # ************************************************** # First, find a basic attribute based XPath that'll catch it at # all my $xpath=$node->nodeName(); my $attr_xpath=""; # Now tack a [number] on the end if we have to print " An initial xpath: $xpath\n"; my @results = $xpc->findnodes($xpath); print " Which returns ".@results." results\n"; ## print "results: ".Dumper(\@results)."\n"; if( @results == 1 ) { # Oh, it already works; fantastic. return $xpath; } # Still here; must be more than one for( my $i = 0; $i < @results; $i++) { my $new_xpath=$xpath."[".($i+1)."]"; ## print "new xpath: $new_xpath\n"; my @new_results = $xpc->findnodes($new_xpath); if( $new_results[0]->isSameNode( $node ) ) { ## print " A better xpath; one result only: $new_xpath\n"; return $new_xpath; } } }