WWW::CurlOO examples

WWW::CurlOO::examples -- sample modules and test code for WWW::CurlOO

Curl::Transport

Extracted from examples/01-curl-transport.pl

This module shows:

buildtime version check

Required features will be missing if libcurl was too old at WWW::CurlOO compilation.

basic inheritance

Use WWW::Curl::* as base for your modules.

exception handling

Most methods die() with a dualvar exception on error. You can compare them numerically, or display as part of a message.

Motivation

recv() and send() methods use non-blocking transfer, this may be very annoying in simple scripts. This wrapper implements blocking send() wrapper, and two recv() wrappers called read() and readline().

MODULE CODE

package Curl::Transport;

use strict;
use warnings;
use WWW::CurlOO::Easy qw(/^CURLE_/);
use base qw(WWW::CurlOO::Easy);

BEGIN {
    if ( WWW::CurlOO::LIBCURL_VERSION_NUM() < 0x071202 ) {
        my $ver = WWW::CurlOO::LIBCURL_VERSION();
        die "curl $ver does not support send() and recv()";
    }
    # alternatively you can write:
    if ( not WWW::CurlOO::Easy->can( "send" )
            or not WWW::CurlOO::Easy->can( "recv" ) ) {
        die "WWW::CurlOO is missing send() and recv()\n"
    }
}

use constant {
    B_URI => 0,
    B_SOCKET => 1,
    B_VEC => 2,
    B_READBUF => 3,
};


# new( URL ) -- get new object
sub new
{
    my $class = shift;
    my $uri = shift;

    # use an array as our object base
    my $base = [ $uri, undef, undef, '' ];

    my $self = $class->SUPER::new( $base );

    $self->setopt( WWW::CurlOO::Easy::CURLOPT_URL, $uri );
    $self->setopt( WWW::CurlOO::Easy::CURLOPT_CONNECT_ONLY, 1 );

    # will die if fails
    $self->perform();

    $self->[ B_SOCKET ] = $self->getinfo(
        WWW::CurlOO::Easy::CURLINFO_LASTSOCKET
    );

    # prepare select vector
    my $vec = '';
    vec( $vec, $self->[ B_SOCKET ], 1 ) = 1;
    $self->[ B_VEC ] = $vec;

    return $self;
}

# send( DATA ) -- send some data, wait for socket availability
# if it cannot be sent all at once
sub send($$)
{
    my $self = shift;
    my $data = shift;

    while ( length $data ) {
        # copy, because select overwrites those values
        my $w = $self->[ B_VEC ];

        # wait for write
        select undef, $w, undef, 0;

        # make sure some write bit is set
        next unless vec( $w, $self->[ B_SOCKET ], 1 );

        # actually send the data
        my $sent = $self->SUPER::send( $data );

        # remove from buffer what we sent
        substr $data, 0, $sent, '';
    };
}

# read( SIZE ) -- read SIZE bytes, wait for more data if there
# wasn't enough
sub read($$)
{
    my $self = shift;
    my $size = shift;

    return '' unless $size > 0;

    while ( length $self->[ B_READBUF ] < $size ) {
        my $r = $self->[ B_VEC ];

        # wait for data
        select $r, undef, undef, 0;

        # make sure some read bit is set
        redo unless vec( $r, $self->[ B_SOCKET ], 1 );

        eval {
            my $l = $self->SUPER::recv( $self->[ B_READBUF ],
                $size - length $self->[ B_READBUF ] );
        };
        if ( $@ ) {
            if ( $@ == CURLE_UNSUPPORTED_PROTOCOL ) {
                my $uri = $self->[ B_URI ];
                warn "Connection to $uri closed: $@\n";
                last;
            } elsif ( $@ == CURLE_AGAIN ) {
                warn "nothing to read, this should not happen";
            } else {
                die $@;
            }
        }
    }

    return substr $self->[ B_READBUF ], 0, $size, '';
}

# readline() -- read until $/
sub readline($)
{
    my $self = shift;

    # we allow changing $/, but we don't support $/ = undef.
    local $/;
    $/ = "\n" unless defined $/;

    my $idx;
    until ( ( $idx = index $self->[ B_READBUF ], $/ ) >= 0 ) {
        my $r = $self->[ B_VEC ];

        # wait for data
        select $r, undef, undef, 0;

        # make sure some read bit is set
        next unless vec( $r, $self->[ B_SOCKET ], 1 );

        # read 256 bytes, should be enough in most cases
        eval {
            $self->SUPER::recv( $self->[ B_READBUF ], 256 );
        };
        if ( $@ ) {
            if ( $@ == CURLE_UNSUPPORTED_PROTOCOL ) {
                my $uri = $self->[ B_URI ];
                warn "Connection to $uri closed: $@\n";
                last;
            } elsif ( $@ == CURLE_AGAIN ) {
                warn "nothing to read, this should not happen";
            } else {
                die $@;
            }
        }
    }

    return substr $self->[ B_READBUF ], 0, ($idx + length $/), '';
}

1;

TEST APPLICATION

Sample application using this module could look like this:

#!perl
use strict;
use warnings;
use Curl::Transport;

my $host = shift @ARGV || "example.com";

my $t = Curl::Transport->new( "http://$host" );
$t->send( "GET / HTTP/1.0\r\n" );
$t->send( "User-Agent: Curl::Transport test\r\n" );
$t->send( "Accept: */*\r\n" );
$t->send( "Host: $host\r\n" );
$t->send( "Connection: Close\r\n" );
$t->send( "\r\n" );

my $length;
{
    local $/ = "\r\n";
    local $_;
    do {
        $_ = $t->readline();
        $length = 0 | $1 if /Content-Length:\s*(\d+)/;
        chomp;
        print "HEADER: $_\n";
    } while ( length $_ );
}

if ( defined $length ) {
    print "Reading $length bytes of data:\n";
    print $t->read( $length );

    print "\nTrying to read one more byte, should fail:\n";
    print $t->read( 1 );
    print "\n";
} else {
    print "Don't know how much to read\n";
    while ( $_ = $t->readline() ) {
        print;
    }
}

printf "Last error: %s\n", $t->error();

Multi::Simple

Extracted from examples/02-multi-simple.pl

This module shows how to use WWW::CurlOO::Multi interface correctly in its simpliest form. Uses perl builtin select(). A more advanced code would use callbacks and some event library instead.

Motivation

Writing a proper multi wrapper code requires a rather good understainding of libcurl multi interface. This code provides a recipie for those who just need something that "simply works".

MODULE CODE

package Multi::Simple;

use strict;
use warnings;
use WWW::CurlOO::Multi;
use base qw(WWW::CurlOO::Multi);

# make new object, preset the data
sub new
{
    my $class = shift;
    my $active = 0;
    return $class->SUPER::new( \$active );
}

# add one handle and count it
sub add_handle($$)
{
    my $self = shift;
    my $easy = shift;

    $$self++;
    $self->SUPER::add_handle( $easy );
}

# perform until some handle finishes, does all the magic needed
# to make it efficient (check as soon as there is some data)
# without overusing the cpu.
sub get_one($)
{
    my $self = shift;

    if ( my @result = $self->info_read() ) {
        $self->remove_handle( $result[ 1 ] );
        return @result;
    }

    while ( $$self ) {
        my $t = $self->timeout;
        if ( $t != 0 ) {
            $t = 10000 if $t < 0;
            my ( $r, $w, $e ) = $self->fdset;

            select $r, $w, $e, $t / 1000;
        }

        my $ret = $self->perform();
        if ( $$self != $ret ) {
            $$self = $ret;
            if ( my @result = $self->info_read() ) {
                $self->remove_handle( $result[ 1 ] );
                return @result;
            }
        }
    };

    return ();
}

1;

TEST APPLICATION

Sample application using this module looks like this:

#!perl
use strict;
use warnings;
use Multi::Simple;
use WWW::CurlOO::Share qw(:constants);


sub easy
{
    my $uri = shift;
    my $share = shift;

    require WWW::CurlOO::Easy;

    my $easy = WWW::CurlOO::Easy->new( { uri => $uri, body => '' } );
    $easy->setopt( WWW::CurlOO::Easy::CURLOPT_VERBOSE(), 1 );
    $easy->setopt( WWW::CurlOO::Easy::CURLOPT_URL(), $uri );
    $easy->setopt( WWW::CurlOO::Easy::CURLOPT_WRITEHEADER(),
        \$easy->{headers} );
    $easy->setopt( WWW::CurlOO::Easy::CURLOPT_FILE(),
        \$easy->{body} );
    $easy->setopt( WWW::CurlOO::Easy::CURLOPT_SHARE(), $share );
    return $easy;
}

my $multi = Multi::Simple->new();

my @uri = (
    "http://www.google.com/search?q=perl",
    "http://www.google.com/search?q=curl",
    "http://www.google.com/search?q=perl+curl",
);

{
    # share cookies between all handles
    my $share = WWW::CurlOO::Share->new();
    $share->setopt( CURLSHOPT_SHARE, CURL_LOCK_DATA_COOKIE );
    $multi->add_handle( easy( shift ( @uri ), $share ) );
}

my $ret = 0;
while ( my ( $msg, $easy, $result ) = $multi->get_one() ) {
    print "\nFinished downloading $easy->{uri}: $result:\n";
    printf "Body is %d bytes long\n", length $easy->{body};
    print "=" x 80 . "\n";

    $ret = 1 if $result;

    $multi->add_handle( easy( shift ( @uri ), $easy->share ) )
        if @uri;
}

exit $ret;

Multi::Event

Extracted from examples/03-multi-event.pl

This module shows how to use WWW::CurlOO::Multi interface with an event library, AnyEvent in this case.

Motivation

This is the most efficient method for using WWW::CurlOO::Multi interface, but it requires a really good understanding of it. This code tries to show the quirks found when using event-based programming.

MODULE CODE

package Multi::Event;

use strict;
use warnings;
use AnyEvent;
use WWW::CurlOO::Multi qw(/^CURL_POLL_/ /^CURL_CSELECT_/);
use base qw(WWW::CurlOO::Multi);

BEGIN {
    if ( not WWW::CurlOO::Multi->can( 'CURLMOPT_TIMERFUNCTION' ) ) {
        die "WWW::CurlOO::Multi is missing timer callback,\n" .
            "rebuild WWW::CurlOO with libcurl 7.16.0 or newer\n";
    }
}

sub new
{
    my $class = shift;

    # no base object this time
    # we'll use the default hash

    my $multi = $class->SUPER::new();

    $multi->setopt( WWW::CurlOO::Multi::CURLMOPT_SOCKETFUNCTION,
        \&_cb_socket );
    $multi->setopt( WWW::CurlOO::Multi::CURLMOPT_TIMERFUNCTION,
        \&_cb_timer );

    $multi->{active} = -1;

    return $multi;
}


# socket callback: will be called by curl any time events on some
# socket must be updated
sub _cb_socket
{
    my ( $multi, $easy, $socket, $poll ) = @_;
    #warn "on_socket( $socket => $poll )\n";

    # Right now $socket belongs to that $easy, but it can be
    # shared with another easy handle if server supports persistent
    # connections.
    # This is why we register socket events inside multi object
    # and not $easy.

    # deregister old io events
    delete $multi->{ "r$socket" };
    delete $multi->{ "w$socket" };

    # AnyEvent does not support registering a socket for both
    # reading and writing. This is rarely used so there is no
    # harm in separating the events.

    # register read event
    if ( $poll == CURL_POLL_IN or $poll == CURL_POLL_INOUT ) {
        $multi->{ "r$socket" } = AE::io $socket, 0, sub {
            $multi->socket_action( $socket, CURL_CSELECT_IN );
        };
    }

    # register write event
    if ( $poll == CURL_POLL_OUT or $poll == CURL_POLL_INOUT ) {
        $multi->{ "w$socket" } = AE::io $socket, 1, sub {
            $multi->socket_action( $socket, CURL_CSELECT_OUT );
        };
    }

    return 1;
}


# timer callback: It triggers timeout update. Timeout value tells
# us how soon socket_action must be called if there were no actions
# on sockets. This will allow curl to trigger timeout events.
sub _cb_timer
{
    my ( $multi, $timeout_ms ) = @_;
    #warn "on_timer( $timeout_ms )\n";

    # deregister old timer
    delete $multi->{timer};

    my $cb = sub {
        $multi->socket_action( WWW::CurlOO::Multi::CURL_SOCKET_TIMEOUT );
    };

    if ( $timeout_ms < 0 ) {
        # Negative timeout means there is no timeout at all.
        # Normally happens if there are no handles anymore.
        #
        # However, curl_multi_timeout(3) says:
        #
        # Note: if libcurl returns a -1 timeout here, it just means
        # that libcurl currently has no stored timeout value. You
        # must not wait too long (more than a few seconds perhaps)
        # before you call curl_multi_perform() again.

        if ( $multi->handles ) {
            $multi->{timer} = AE::timer 10, 10, $cb;
        }
    } else {
        # This will trigger timeouts if there are any.
        $multi->{timer} = AE::timer $timeout_ms / 1000, 0, $cb;
    }

    return 1;
}

# add one handle and kickstart download
sub add_handle($$)
{
    my $multi = shift;
    my $easy = shift;

    die "easy cannot finish()\n"
        unless $easy->can( 'finish' );

    # Calling socket_action with default arguments will trigger
    # socket callback and register IO events.
    #
    # It _must_ be called _after_ add_handle(); AE will take care
    # of that.
    #
    # We are delaying the call because in some cases socket_action
    # may finish inmediatelly (i.e. there was some error or we used
    # persistent connections and server returned data right away)
    # and it could confuse our application -- it would appear to
    # have finished before it started.
    AE::timer 0, 0, sub {
        $multi->socket_action();
    };

    $multi->SUPER::add_handle( $easy );
}

# perform and call any callbacks that have finished
sub socket_action
{
    my $multi = shift;

    my $active = $multi->SUPER::socket_action( @_ );
    return if $multi->{active} == $active;

    $multi->{active} = $active;

    while ( my ( $msg, $easy, $result ) = $multi->info_read() ) {
        if ( $msg == WWW::CurlOO::Multi::CURLMSG_DONE ) {
            $multi->remove_handle( $easy );
            $easy->finish( $result );
        } else {
            die "I don't know what to do with message $msg.\n";
        }
    }
}

1;

TEST Easy package

Multi::Event requires Easy object to provide finish() method.

package Easy::Event;
use strict;
use warnings;
use WWW::CurlOO::Easy qw(/^CURLOPT_/);
use base qw(WWW::CurlOO::Easy);

sub new
{
    my $class = shift;
    my $uri = shift;
    my $cb = shift;

    my $easy = $class->SUPER::new(
        { uri => $uri, body => '', cb => $cb }
    );
    $easy->setopt( CURLOPT_URL, $uri );
    $easy->setopt( CURLOPT_WRITEHEADER, \$easy->{headers} );
    $easy->setopt( CURLOPT_FILE, \$easy->{body} );

    return $easy;
}

sub finish
{
    my ( $easy, $result ) = @_;

    printf "\nFinished downloading %s: %s: %d bytes\n",
        $easy->{uri}, $result, length $easy->{body};

    $easy->{cb}->( $easy->{body} );
}

1;

TEST APPLICATION

#!perl
use strict;
use warnings;
use Easy::Event;
use Multi::Event;
use AnyEvent;

my $multi = Multi::Event->new();
my $cv = AE::cv;


my @uris = (
    "http://www.google.com/search?q=perl",
    "http://www.google.com/search?q=curl",
    "http://www.google.com/search?q=perl+curl",
);


my $i = scalar @uris;
sub done
{
    my $body = shift;

    # process...

    unless ( --$i ) {
        $cv->send;
    }
}

my $timer;
$timer = AE::timer 0, 0.1, sub {
    my $uri = shift @uris;
    $multi->add_handle( Easy::Event->new( $uri, \&done ) );

    unless ( @uris ) {
        undef $timer;
    }
};

$cv->recv;

exit 0;

Share::Threads

Extracted from examples/04-share-threads.pl

This module shows how one can share http cookies and dns cache between multiple threads.

Motivation

Threads are evil, but some people think they are not. I want to make them a favor and show how bad threads really are.

Limitations

  • WWW::CurlOO::Share is the only package that allows sharing between threads. Others (Easy, Multi, Form) are usable only in their creating thread.

  • Share internals are always shared between threads, but you must mark your base object as shared if you want to use the data elsewhere.

  • Shared WWW::CurlOO::Share does not support lock and unlock callbacks. They kill the interpreter. This may be fixed some day.

  • If we want to share the data, we cannot trigger all downloads at the same time, because there would be no data to share at the time. This solution opts to lock other downloads until headers from the server are fully received. It assures cache coherency, but slows down overall application.

  • This method does not reuse persistent connections, it would be much faster to get those 6 requests one after another than to doing all 6 in parallel.

  • WWW::CurlOO::Share currently leaks scalars, this is bad, and can be very bad for long-running applications.

  • If you share dns cache all connections for one domain will go to the same IP, even if domain name resolves to multiple adresses.

MODULE CODE

package Share::Threads;
use threads;
use threads::shared;
use Thread::Semaphore;
use WWW::CurlOO::Share qw(:constants);
use base qw(WWW::CurlOO::Share);


sub new
{
    my $class = shift;

    # we want our private data to be shareable
    my %base :shared;

    # create a shared share object
    my $self :shared = $class->SUPER::new( \%base );

    # share both cookies and dns
    $self->setopt( CURLSHOPT_SHARE, CURL_LOCK_DATA_COOKIE );
    $self->setopt( CURLSHOPT_SHARE, CURL_LOCK_DATA_DNS );

    # XXX: WWW::CurlOO::Share does not support callbacks from other
    # threads yet, so this won't work. In fact it will trigger a
    # deep recursion and the application will be nooked.
    #$self->setopt( CURLSHOPT_LOCKFUNC, "cb_lock" );
    #$self->setopt( CURLSHOPT_UNLOCKFUNC, "cb_unlock" );

    # we use semaphore instead
    $self->{sem} = Thread::Semaphore->new();

    return $self;
}

#sub cb_lock
#{
#    my ( $share, $easy, $data, $locktype, $uservar ) = @_;
#    # Nothing here yet, because it won't work anyways.
#}

#sub cb_unlock
#{
#    my ( $share, $easy, $data, $uservar ) = @_;
#    # Nothing here yet, because it won't work anyways.
#}


# this locks way too much, but works as expected
sub lock
{
    my $share = shift;
    $share->{sem}->down();
    $share->{blocker} = threads->tid();
}

sub unlock
{
    my $share = shift;
    unless ( exists $share->{blocker} ) {
        warn "Tried to unlock share that wasn't locked\n";
        return;
    }
    unless ( $share->{blocker} == threads->tid() ) {
        warn "Tried to unlock share from another thread\n";
        return;
    }
    delete $share->{blocker};
    $share->{sem}->up();
}

1;

TEST Easy package

This Easy::Threads object will block whole share object for duration of dns name resolution and until headers are completely received.

package Easy::Threads;
use strict;
use warnings;
use WWW::CurlOO::Easy qw(/^CURLOPT_.*/);
use base qw(WWW::CurlOO::Easy);

sub new
{
    my $class = shift;
    my $share = shift;

    my $easy = $class->SUPER::new( { body => '', head => '' } );
    $easy->setopt( CURLOPT_VERBOSE, 1 );
    $easy->setopt( CURLOPT_WRITEHEADER, \$easy->{head} );
    $easy->setopt( CURLOPT_FILE, \$easy->{body} );
    $easy->setopt( CURLOPT_HEADERFUNCTION, \&cb_header );
    $easy->setopt( CURLOPT_SHARE, $share );

    return $easy;
}

sub cb_header {
    my ( $easy, $data, $uservar ) = @_;

    if ( $data eq "\r\n" ) {
        # we have all the headers now, allow other threads to run
        $easy->share->unlock()
            unless $easy->{unlocked};

        $easy->{unlocked} = 1;
    }

    $$uservar .= $data;

    return length $data;
}

sub get
{
    my $easy = shift;
    my $uri = shift;

    $easy->setopt( CURLOPT_URL, $uri );
    $easy->{uri} = $uri;
    $easy->{body} = '';
    $easy->{head} = '';
    delete $easy->{unlocked};

    # lock share
    $easy->share->lock();

    # ok, now we can request
    eval {
        $easy->perform();
    };

    # There may have been some problem, make sure we unlock the share.
    # This should issue a warning, check $easy->{unlocked} to see
    # whether we really need to unlock.
    $easy->share->unlock();

    # return something
    return $easy->{body};
}

1;

TEST APPLICATION

Sample application using this module looks like this:

#!perl
use threads;
use threads::shared;
use strict;
use warnings;
use Share::Threads;
use Easy::Threads;

my $share :shared = Share::Threads->new();

my @uri = (
    "http://www.google.com/search?q=perl",
    "http://www.google.com/search?q=curl",
    "http://www.google.com/search?q=perl+curl",
    "http://www.google.com/search?q=perl+threads",
    "http://www.google.com/search?q=curl+threads",
    "http://www.google.com/search?q=perl+curl+threads",
);

sub getone
{
    my $uri = shift;

    my $easy = Easy::Threads->new( $share );
    return $easy->get( $uri );
}

# start all threads
my @threads;
foreach my $uri ( @uri ) {
    push @threads, threads->create( \&getone, $uri );
    threads->yield();
}

# reap all threads
foreach my $t ( @threads ) {
    my $body = $t->join();
    my $len = length $body;
    print "DONE: [[[ $len ]]]\n";
}