The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.

NAME

Jungle - a web spider framework to speed up crawler development

SYNOPSIS

# First, write your spider classes and place them under lib/Sites/...
# Then run the test suite; t/001_load.t is a good place to start.

use Jungle;

# Spiders to run. The sample classes in this document are declared under
# the Sites:: namespace, while only the trailing part of the name is
# passed to work_site().
my @site_names = (
    'NewsSpider::Terra',      # Terra web site, which is ISO-8859-1
    'NewsSpider::UOL',        # UOL web site, which is UTF8
    'NewsSpider::Estadao',    # Estadao web site, which is UTF8
);

my $spider = Jungle->new();

# Each site is crawled with its own, freshly built Jungle::Data::News object.
$spider->work_site( $_, Jungle::Data::News->new ) for @site_names;

SAMPLE1: Spider News for terra.com.br

package Sites::NewsSpider::Terra; # site charset is iso-8859-1
use Moose;
with qw/Jungle::Spider/;

# URL of the news listing page the crawl starts from.
has startpage => (
    is => 'rw',
    isa => 'Str',
    default => 'http://noticias.terra.com.br/ultimasnoticias/0,,EI188,00.html',
);

# Start hook: queue the start page so that search() gets to process it,
# exactly as the UOL and Estadao sample spiders do.
# NOTE(review): assumes Jungle::Spider does not queue startpage on its own.
sub on_start {
    my ( $self ) = @_;
    $self->append( search => $self->startpage );
}

# Handler for listing pages. Queues every article link found in the
# "list articles" block: links pointing at br.invertia.com are handed to
# details_invertia(), all the others to details().
sub search {
    my ( $self ) = @_;
    my $links = $self->tree->findnodes( '//div[@class="list articles"]/ol/li/a' );
    foreach my $link ( $links->get_nodelist ) {
        my $url = $link->attr( 'href' );
        next unless defined $url;    # anchor without href: nothing to queue
        my $handler = $url =~ m{br\.invertia\.com}i ? 'details_invertia' : 'details';
        $self->prepend( $handler => $url );
    }
}

# Called with a link URL. Pagination links of the listing (pages PI1 to
# PI6) are queued for search(); every other link is ignored.
sub on_link {
    my ( $self, $url ) = @_;
    return unless defined $url;
    # Dots are escaped so that they only match a literal '.'.
    if ( $url =~ m{http://noticias\.terra\.com\.br/ultimasnoticias/0,,EI188-PI[1-6],00\.html}i ) {
        $self->prepend( search => $url );
    }
}

# Handler for article pages hosted on br.invertia.com: the content is the
# HTML of the SearchKey_Text1 container itself.
sub details_invertia {
    my ( $self ) = @_;
    my $content_nodes = $self->tree->findnodes( '//div[@id="SearchKey_Text1"]' );
    my $content = '';    # stays an empty string (not undef) when nothing matches
    foreach my $node ( $content_nodes->get_nodelist ) {
        $content .= $node->as_HTML;
    }
    $self->_save_article( $content );
}

# Collects the normalized src URL of every image inside an "img-article"
# block and stores the list (as an array reference) on the data object.
sub grab_images {
    my ( $self ) = @_;
    my $image_nodes = $self->tree->findnodes( '//div[contains(@class,"img-article")]/img' );
    my @images;
    foreach my $img ( $image_nodes->get_nodelist ) {
        my $src = $img->attr( 'src' );
        next unless defined $src;    # <img> without src has no URL to record
        push @images, $self->normalize_url( $src );
    }
    $self->data->images( \@images );
}

# Handler for regular article pages: the content is the text of every
# paragraph below the SearchKey_Text1 container, one paragraph per line.
sub details {
    my ( $self ) = @_;
    my $paragraphs = $self->tree->findnodes( '//div[@id="SearchKey_Text1"]//p' );
    my $content = '';
    foreach my $paragraph ( $paragraphs->get_nodelist ) {
        $content .= $paragraph->as_text . "\n";
    }
    $self->_save_article( $content );
}

# Copies the keywords and description <meta> tags to the data object.
sub grab_meta {
    my ( $self ) = @_;
    $self->data->meta_keywords( $self->tree->findvalue( '//meta[@name="keywords"]/@content' ) );
    $self->data->meta_description( $self->tree->findvalue( '//meta[@name="description"]/@content' ) );
}

# Private helper: returns the text of every //dl/dd node, each followed by
# a newline; an empty string when the page has none.
sub _author_text {
    my ( $self ) = @_;
    my $author_nodes = $self->tree->findnodes( '//dl/dd' );
    my $author = '';
    foreach my $node ( $author_nodes->get_nodelist ) {
        $author .= $node->as_text . "\n";
    }
    return $author;
}

# Private helper shared by details() and details_invertia(): fills the data
# object (author, content, title, page URL, meta tags, images) and saves it.
sub _save_article {
    my ( $self, $content ) = @_;
    $self->data->author( $self->_author_text );
    $self->data->content( $content );
    $self->data->title( $self->tree->findvalue( '//title' ) );
    $self->data->webpage( $self->current_page );
    $self->grab_meta;
    $self->grab_images;

    $self->data->save;
}

1;

SAMPLE2: Spider News for uol.com.br

package Sites::NewsSpider::UOL; # site charset is UTF8
use Moose;
with qw/Jungle::Spider/;

# URL of the news listing page the crawl starts from.
has startpage => (
    is => 'rw',
    isa => 'Str',
    default => 'http://noticias.uol.com.br/ultimas-noticias/',
);

# Start hook: queue the start page for search().
# The third argument is a POST example: pass [ name => 'value', ... ] to
# send parameters along with the request.
sub on_start {
    my ( $self ) = @_;
    $self->append( search => $self->startpage , [
        some => 'params',
        test => 'POST',
    ] );
    # Without parameters this would simply be:
    #$self->append( search => $self->startpage );
}

# Handler for listing pages. Queues every headline link: links to
# www1.folha.uol.com.br are handed to details_folha(), all the others to
# details().
sub search {
    my ( $self ) = @_;
    my $links = $self->tree->findnodes( '//ul[@id="ultnot-list-noticias"]/li/h3/a' );
    foreach my $link ( $links->get_nodelist ) {
        my $url = $link->attr( 'href' );
        next unless defined $url;    # anchor without href: nothing to queue
        # Dots are escaped so that they only match a literal '.'.
        my $handler = $url =~ m{^http://www1\.folha\.uol\.com\.br}i ? 'details_folha' : 'details';
        $self->prepend( $handler => $url );
    }
}

# Called with a link URL. Pagination links of the listing (index1 and
# index2) are queued for search(); every other link is ignored.
sub on_link {
    my ( $self, $url ) = @_;
    return unless defined $url;
    if ( $url =~ m{http://noticias\.uol\.com\.br/ultimas-noticias/index[12]\.jhtm}i ) {
        $self->prepend( search => $url );
    }
}

# Handler for articles hosted on the UOL news site itself.
sub details {
    my ( $self ) = @_;
    $self->_save_article(
        '//div[@id="texto"]/p',
        '//span[@class="autor"]',
        '//div[@id="materia"]//h1',
    );
}

# Handler for articles hosted on www1.folha.uol.com.br.
sub details_folha {
    my ( $self ) = @_;
    $self->_save_article(
        '//div[@id="articleNew"]/p',
        '//div[@id="articleBy"]/p',
        '//div[@id="articleNew"]/h1',
    );
}

# Copies the keywords and description <meta> tags to the data object.
sub grab_meta {
    my ( $self ) = @_;
    $self->data->meta_keywords( $self->tree->findvalue( '//meta[@name="keywords"]/@content' ) );
    $self->data->meta_description( $self->tree->findvalue( '//meta[@name="description"]/@content' ) );
}

# Private helper shared by details() and details_folha(), which differ only
# in the XPath expressions they use.
#   $content_xpath - selects the paragraphs; their text is joined one per line
#   $author_xpath  - selects the author
#   $title_xpath   - selects the title
# Fills the data object (author, page URL, content, title, meta tags) and
# saves it.
sub _save_article {
    my ( $self, $content_xpath, $author_xpath, $title_xpath ) = @_;
    my $paragraphs = $self->tree->findnodes( $content_xpath );
    my $content = '';
    foreach my $paragraph ( $paragraphs->get_nodelist ) {
        $content .= $paragraph->as_text . "\n";
    }
    $self->data->author( $self->tree->findvalue( $author_xpath ) );
    $self->data->webpage( $self->current_page );
    $self->data->content( $content );
    $self->data->title( $self->tree->findvalue( $title_xpath ) );
    $self->grab_meta;

    $self->data->save;
}

1;

SAMPLE3: Spider News for estadao.com.br

package Sites::NewsSpider::Estadao; # site charset is UTF-8
use Moose;
with qw(Jungle::Spider);

# URL of the news listing page the crawl starts from.
has startpage => (
    is => 'rw',
    isa => 'Str',
    default => 'http://www.estadao.com.br/ultimas/',
);

# Start hook: queue the start page for search().
sub on_start {
    my ( $self ) = @_;
    $self->append( search => $self->startpage );
}

# Handler for listing pages: queues every headline link for
# detail_noticia().
sub search {
    my ( $self ) = @_;
    my $links = $self->tree->findnodes( '//ul/li/h2/a' );
    foreach my $link ( $links->get_nodelist ) {
        my $url = $link->attr( 'href' );
        next unless defined $url;    # anchor without href: nothing to queue
        $self->prepend( detail_noticia => $url );
    }
}

# Called with a link URL. Pagination links (URLs ending in
# pagina.php?i=1 or pagina.php?i=2) are queued for search(); every other
# link is ignored.
sub on_link {
    my ( $self, $url ) = @_;
    return unless defined $url;
    if ( $url =~ m{pagina\.php\?i=[12]$}i ) {
        $self->prepend( search => $url );
    }
}

# Handler for article pages: stores author, page URL, content (the text of
# every paragraph of the article body, one per line), title and meta tags
# on the data object, then saves it.
sub detail_noticia {
    my ( $self ) = @_;
    my $paragraphs = $self->tree->findnodes( '//div[@class="texto-noticia"]//div[@class="corpo"]//p' );
    my $content = '';
    foreach my $paragraph ( $paragraphs->get_nodelist ) {
        $content .= $paragraph->as_text . "\n";
    }
    $self->data->author( $self->tree->findvalue( '//div[@class="bb-md-noticia-autor"]' ) );
    $self->data->webpage( $self->current_page );
    $self->data->content( $content );
    $self->data->title( $self->tree->findvalue( '//title' ) );
    $self->grab_meta;
    $self->data->save;
}

# Copies the keywords and description <meta> tags to the data object.
# Kept as a separate method, like in the Terra and UOL sample spiders.
sub grab_meta {
    my ( $self ) = @_;
    $self->data->meta_keywords( $self->tree->findvalue( '//meta[@name="keywords"]/@content' ) );
    $self->data->meta_description( $self->tree->findvalue( '//meta[@name="description"]/@content' ) );
}

1;

RESULTS OF EXTRACTED DATA

The extracted data should be available under ./data/* as .csv files.
Jungle::Data::News handles this process for our news websites.
The Data class could instead write directly to the website database
(using DBIx::Class), or it could generate something other than CSV.
Just modify the class, or write your own, and pass it to Jungle.

As of now, the extracted data is saved under ./data/file-site.csv.
If you wish to change this behaviour, modify lib/Jungle/Data/News.pm or
create your own classes.

DESCRIPTION

Take this as a webspider framework example.

AUTHOR

Hernan Lopes
CPAN ID: HERNAN
Hernan
hernanlopes@gmail.com
http://github.com/hernan

COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.

SEE ALSO

perl(1).