NAME
FEAR::API - There's no fear with this elegant site scraper
DESCRIPTION
FEAR::API is a tool that helps reduce the time you spend creating site scraping scripts and helps you do it in a much more elegant way. FEAR::API combines many strong and powerful features from various CPAN modules, such as LWP::UserAgent, WWW::Mechanize, Template::Extract, Encode, HTML::Parser, etc., and digests them into a deeper Zen.
More documentation will come sooooooner or later.
EXAMPLES
use FEAR::API -base;
Fetch a page and store it in a scalar
fetch("google.com") > my $content;
my $content = fetch("google.com")->document->as_string;
Fetch a page and print to STDOUT
getprint("google.com");
print fetch("google.com")->document->as_string;
fetch("google.com");
print $$_;
fetch("google.com") | _print;
Fetch a page and save it to a file
getstore("google.com", 'google.html');
url("google.com")->() | _save_as("google.html");
fetch("google.com") | io('google.html');
Follow links in Google's homepage
url("google.com")->() >> _self;
&$_ while $_;
Save links in Google's homepage
url("google.com")->() >> _self | _save_as_tree("./root");
$_->() | _save_as_tree("./root") while $_;
Recursively get web pages from Google
url("google.com");
&$_ >> _self while $_;
Recursively get web pages from Google and save them
url("google.com");
&$_ >> _self | _save_as_tree("./root") while $_;
Follow the second link of Google
url("google.com")->()->follow_link(n => 2);
Return links from Google's homepage
print Dumper fetch("google.com")->links;
Submit a query to Google
url("google.com")->();
submit_form(
form_number => 1,
fields => { q => "Kill Bush" }
);
Get links of some pattern
url("[% FOREACH i = ['a'..'z'] %]
http://some.site/[% i %]
[% END %]");
&$_ while $_;
Get pages in parallel
url("google.com")->() >> _self;
pfetch(sub{
local $_ = shift;
print join q/ /, title, current_url, document->size, $/;
});
Deal with links in a web page (I)
Minimal
url("google.com")->()
>> [
qr(^http:) => _self,
qr(google) => \my @l,
qr(google) => sub { print ">>>".$_[0]->[0],$/ }
];
$_->() while $_;
print Dumper \@l;
Verbose
fetch("http://google.com")
->report_links(
qr(^http:) => _self,
qr(google) => \my @l,
qr(google) => sub { print ">>>".$_[0]->[0],$/ }
);
fetch while has_more_urls;
print Dumper \@l;
Deal with links in a web page (II)
Minimal
url("google.com")->()
>> {
qr(^http:) => _self,
qr(google) => \my @l,
qr(google) => sub { print ">>>".$_[0]->[0],$/ }
};
$_->() while $_;
print Dumper \@l;
Verbose
fetch("http://google.com")
->fallthrough_report(1)
->report_links(
qr(^http:) => _self,
qr(google) => \my @l,
qr(google) => sub { print ">>>".$_[0]->[0],$/ }
);
fetch while has_more_urls;
print Dumper \@l;
Extraction
Extract data from CPAN
url("http://search.cpan.org/recent")->();
submit_form(
form_name => "f",
fields => {
query => "perl"
});
template("<!--item-->[% p %]<!--end item-->");
extract;
print Dumper extresult;
Extract data from CPAN after some HTML cleanup
url("http://search.cpan.org/recent")->();
submit_form(
form_name => "f",
fields => {
query => "perl"
});
preproc(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s));
print document->as_string; # print content to STDOUT
template("<!--item-->[% p %]<!--end item-->");
extract;
print Dumper extresult;
HTML cleanup, extract data, and refine results
url("http://search.cpan.org/recent")->();
submit_form(
form_name => "f",
fields => {
query => "perl"
});
preproc(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s));
print $$_; # print content to STDOUT
template("<!--item-->[% rec %]<!--end item-->");
extract;
postproc(q($_->{rec} =~ s/<.+?>//g)); # Strip HTML tags
print Dumper extresult;
Use filtering syntax
fetch("http://search.cpan.org/recent");
submit_form(
form_name => "f",
fields => {
query => "perl"
})
| _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
| _template("<!--item-->[% rec %]<!--end item-->")
| _result_filter(q($_->{rec} =~ s/<.+?>//g));
print Dumper \@$_;
Invoke handler for extracted results
fetch("http://search.cpan.org/recent");
submit_form(
form_name => "f",
fields => {
query => "perl"
})
| _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
| "<!--item-->[% rec %]<!--end item-->"
| _result_filter(q($_->{rec} =~ s/<.+?>//g));
invoke_handler('Data::Dumper');
Forward extracted data to relational database
template($template);
extract;
invoke_handler('Some::Module::based::on::Class::DBI');
# or
invoke_handler('Some::Module::based::on::DBIx::Class::CDBICompat');
Preprocess document
url("google.com")->()
| _preproc(use => "html_to_null")
| _preproc(use => "decode_entities")
| _print;
Postprocess extraction results
fetch("http://search.cpan.org/recent");
submit_form(
form_name => "f",
fields => {
query => "perl"
})
| _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
| _template("<!--item-->[% rec %]<!--end item-->")
| _result_filter(use => "html_to_null", qw(rec))
| _result_filter(use => "decode_entities", qw(rec));
print Dumper \@$_;
Scraping a file
file('some_file');
url('file:///the/path/to/your/file');
Convert HTML to XHTML
print fetch("google.com")->document->html_to_xhtml->as_string;
fetch("google.com") | _to_xhtml;
print $$_;
Select content of interest using XPath
print fetch("google.com")->document->html_to_xhtml->xpath('/html/body/*/form')->as_string;
fetch("google.com") | _to_xhtml | _xpath('/html/body/*/form');
print $$_;
Use FEAR::API in one-liners
fearperl -e 'fetch("google.com")'
perl -M'FEAR::API -base' -e 'fetch("google.com")'
AUTHOR & COPYRIGHT
Copyright (C) 2006 by Yung-chung Lin (a.k.a. xern) <xern@cpan.org>
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.