NAME

FEAR::API - There's no fear with this elegant site scraper

DESCRIPTION

FEAR::API is a tool that helps reduce the time you spend creating site scraping scripts and helps you do it in a much more elegant way. FEAR::API combines many strong and powerful features from various CPAN modules, such as LWP::UserAgent, WWW::Mechanize, Template::Extract, Encode, HTML::Parser, etc. and digests them into a deeper Zen.

However, this module violates probably every single rule of any Perl coding standards. Please stop here if you don't want to see the yucky code.

EXAMPLES

use FEAR::API -base;

Fetch a page and store it in a scalar

fetch("google.com") > my $content;

my $content = fetch("google.com")->document->as_string;

Fetch a page and print to STDOUT

 getprint("google.com");

 print fetch("google.com")->document->as_string;

 fetch("google.com");
 print $$_;    

 fetch("google.com") | _print;

Fetch a page and save it to a file

getstore("google.com", 'google.html');

url("google.com")->() | _save_as("google.html");

fetch("google.com") | io('google.html');
url("google.com")->() >> _self;
&$_ while $_;
url("google.com")->() >> _self | _save_as_tree("./root");
$_->() | _save_as_tree("./root") while $_;

Recursively get web pages from Google

url("google.com");
&$_ >> _self while $_;

Recursively get web pages from Google and save them as a directory tree

url("google.com");
&$_ >> _self | _save_as_tree("./root") while $_;
url("google.com")->()->follow_link(n => 2);
print Dumper fetch("google.com")->links;

Submit a query to Google

url("google.com")->();
submit_form(
            form_number => 1,
            fields => { q => "Kill Bush" }
            );
url("[% FOREACH i = ['a'..'z'] %]
     http://some.site/[% i %]
     [% END %]");
&$_ while $_;

Get pages in parallel

url("google.com")->() >> _self;
pfetch(sub{
           local $_ = shift;
           print join q/ /, title, current_url, document->size, $/;
       });

Minimal

url("google.com")->()
  >> [
      qr(^http:) => _self,
      qr(google) => \my @l,
      qr(google) => sub {  print ">>>".$_[0]->[0],$/ }
     ];
$_->() while $_;
print Dumper \@l;

Verbose

fetch("http://google.com")
->report_links(
               qr(^http:) => _self,
               qr(google) => \my @l,
               qr(google) => sub {  print ">>>".$_[0]->[0],$/ }
              );
fetch while has_more_urls;
print Dumper \@l;

Minimal

url("google.com")->()
  >> {
      qr(^http:) => _self,
      qr(google) => \my @l,
      qr(google) => sub {  print ">>>".$_[0]->[0],$/ }
     };
$_->() while $_;
print Dumper \@l;

Verbose

fetch("http://google.com")
->fallthrough_report(1)
->report_links(
               qr(^http:) => _self,
               qr(google) => \my @l,
               qr(google) => sub {  print ">>>".$_[0]->[0],$/ }
              );
fetch while has_more_urls;
print Dumper \@l;

Extraction

Extract data from CPAN

url("http://search.cpan.org/recent")->();
submit_form(
        form_name => "f",
        fields => {
                   query => "perl"
                  });
template("<!--item-->[% p %]<!--end item-->");
extract;
print Dumper extresult;

Extract data from CPAN after some HTML cleanup

url("http://search.cpan.org/recent")->();
submit_form(
        form_name => "f",
        fields => {
                   query => "perl"
                  });
preproc(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s));
print document->as_string;    # print content to STDOUT
template("<!--item-->[% p %]<!--end item-->");
extract;
print Dumper extresult;

HTML cleanup, extract data, and refine results

url("http://search.cpan.org/recent")->();
submit_form(
        form_name => "f",
        fields => {
                   query => "perl"
                  });
preproc(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s));
print $$_;    # print content to STDOUT
template("<!--item-->[% rec %]<!--end item-->");
extract;
postproc(q($_->{rec} =~ s/<.+?>//g));     # Strip HTML tags
print Dumper extresult;

Use filtering syntax

fetch("http://search.cpan.org/recent");
submit_form(
            form_name => "f",
            fields => {
                       query => "perl"
            })
   | _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
   | _template("<!--item-->[% rec %]<!--end item-->")
   | _result_filter(q($_->{rec} =~ s/<.+?>//g));
print Dumper \@$_;

Invoke handler for extracted results

fetch("http://search.cpan.org/recent");
submit_form(
            form_name => "f",
            fields => {
                       query => "perl"
            })
   | _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
   | "<!--item-->[% rec %]<!--end item-->"
   | _result_filter(q($_->{rec} =~ s/<.+?>//g));
invoke_handler('Data::Dumper');

Forward extracted data to relational database

template($template);
extract;
invoke_handler('Some::Module::based::on::Class::DBI');
# or
invoke_handler('Some::Module::based::on::DBIx::Class::CDBICompat');

Preprocess document

url("google.com")->()
| _preproc(use => "html_to_null")
| _preproc(use => "decode_entities")
| _print;

Postprocess extraction results

fetch("http://search.cpan.org/recent");
submit_form(
            form_name => "f",
            fields => {
                       query => "perl"
            })
   | _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
   | _template("<!--item-->[% rec %]<!--end item-->")
   | _result_filter(use => "html_to_null",    qw(rec))
   | _result_filter(use => "decode_entities", qw(rec));
print Dumper \@$_;

Scraping a file

file('some_file');

url('file:///the/path/to/your/file');

Convert HTML to XHTML

print fetch("google.com")->document->html_to_xhtml->as_string;

fetch("google.com") | _to_xhtml;
print $$_;

Select content of interest using XPath

print fetch("google.com")->document->html_to_xhtml->xpath('/html/body/*/form')->as_string;

fetch("google.com") | _to_xhtml | _xpath('/html/body/*/form');
print $$_;

Make your site scraping script a subroutine

# sst is for site scraping template;
load_sst('fetch("google.com") >> _self; $_->() while $_');
run_sst;

load_sst('fetch("[% initial_link %]") >> _self; $_->() while $_');
run_sst({ initial_link => 'google.com'});

Tabbed scraping

fetch("google.com");        # Default tab is 0
tab 1;                             # Create a new tab, and switch to it.
fetch("search.cpan.org");  # Fetch page in tab 1
tab 0;                             # Switch back to tab 0
template($template);       # Continue processing in tab 0
extract();

keep_tab 1;                    # Keep tab 1 only and close others
close_tab 1;                    # Close tab 1

Create RSS file

use FEAR::API -base, -rss;
my $url = "http://google.com";
url($url)->();
rss_new( $url, "Google", "Google Search Engine" );
rss_language( 'en' );
rss_webmaster( 'xxxxx@yourdomain.com' );
rss_twice_daily();
rss_item(@$_) for map{ [ $_->url(), $_->text() ] } links;
die "No items have been added." unless rss_item_count;
rss_save('google.rss');

See also XML::RSS::SimpleGen

Use FEAR::API in one-liners

fearperl -e 'fetch("google.com")'

perl -M'FEAR::API -base' -e 'fetch("google.com")'

DEBATE

This module has been heavily criticized on Perlmonks. Please go to http://perlmonks.org/?node_id=537504 for details.

AUTHOR & COPYRIGHT

Copyright (C) 2006 by Yung-chung Lin (a.k.a. xern) <xern@cpan.org>

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.