NAME
WWW::Scraper::Lite
VERSION
$LastChangedRevision: 15 $
SYNOPSIS
my $domain = 'http://devsite.local/';
my $scraper = WWW::Scraper::Lite->new();
$scraper->crawl($domain,
{
'//a' => sub { # handler for all 'a' tags
my ($scraper, $nodes) = @_;
$scraper->enqueue(grep { $_ =~ m{^\Q$domain\E} } # only this domain
map { $scraper->url_remove_anchor($_) } # only index pages without #anchor
map { $scraper->url_make_absolute($_) } # indexer needs absolute URLs
map { $_->{href} } # pull href out of the 'a' DOM node
@{$nodes});
},
'/*' => sub { # handler for all content
my ($scraper, $nodes) = @_;
print $scraper->{current}->{response}->content; # do something useful with HTTP response
},
}
);
DESCRIPTION
SUBROUTINES/METHODS
new - constructor, initialises fetch-queue and seen-URL hash
my $oScraper = WWW::Scraper::Lite->new();
ua - new/cached LWP::UserAgent object
my $oUA = $oScraper->ua();
crawl - start crawling a given URL with a given set of XPath callbacks
$oScraper->crawl($sStartURL, $hrCallbacks);
enqueue - push one or more URLs onto the fetch queue
$oScraper->enqueue(@aURLs);
dequeue - shift a URL off the fetch queue
my $sURL = $oScraper->dequeue();
current - a hashref containing information on the current page
my $hrCurrentData = $oScraper->current;
url_remove_anchor - strip '#anchor' text from a URL string
my $sURLout = $oScraper->url_remove_anchor($sURLin);
url_make_absolute - add the current domain to a URL to make it absolute
my $sURLout = $oScraper->url_make_absolute($sURLin);
DIAGNOSTICS
CONFIGURATION AND ENVIRONMENT
DEPENDENCIES
INCOMPATIBILITIES
BUGS AND LIMITATIONS
AUTHOR
$Author: Roger Pettett,,,$
LICENSE AND COPYRIGHT
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.