NAME

WWW::Scraper::Lite

VERSION

$LastChangedRevision: 8 $

SYNOPSIS

 my $domain  = 'http://devsite.local/';
 my $scraper = WWW::Scraper::Lite->new();
 $scraper->crawl($domain,
		 {
		  '//a' => sub {                                               # handler for all 'a' tags
		    my ($scraper, $nodes) = @_;
		    $scraper->enqueue(grep { $_ =~ m{^$domain} }               # only this domain
				      map  { $scraper->url_remove_anchor($_) } # only index pages without #anchor
				      map  { $scraper->url_make_absolute($_) } # indexer needs absolute URLs
				      map  { $_->{href} }                      # pull href out of the 'a' DOM node
				      @{$nodes});
		  },
		  '/*' => sub {                                                # handler for all content
		    my ($scraper, $nodes) = @_;
		    print $scraper->{current}->{response}->content;            # do something useful with HTTP response
		  },
		 }
	        );

DESCRIPTION

SUBROUTINES/METHODS

new - constructor, initialises fetch-queue and seen-URL hash

my $oScraper = WWW::Scraper::Lite->new();

ua - new/cached LWP::UserAgent object

my $oUA = $oScraper->ua();

crawl - start crawling a given URL with a given set of XPath callbacks

$oScraper->crawl($sStartURL, $hrCallbacks);

enqueue - push one or more URLs onto the fetch queue

$oScraper->enqueue(@aURLs);

dequeue - shift a URL off the fetch queue

my $sURL = $oScraper->dequeue();

current - a hashref containing information on the current page

my $hrCurrentData = $oScraper->current;

url_remove_anchor - strip '#anchor' text from a URL string

my $sURLout = $oScraper->url_remove_anchor($sURLin);

url_make_absolute - add the current domain to a URL to make it absolute

my $sURLout = $oScraper->url_make_absolute($sURLin);

DIAGNOSTICS

CONFIGURATION AND ENVIRONMENT

DEPENDENCIES

strict
warnings
LWP::UserAgent
HTML::TreeBuilder::XPath
Carp

INCOMPATIBILITIES

BUGS AND LIMITATIONS

AUTHOR

$Author: Roger Pettett,,,$

LICENSE AND COPYRIGHT

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.