NAME
Data::Downloader -- Download and organize files using RSS feeds and templates.
SYNOPSIS
use Data::Downloader -init_logging => "INFO";

# Look up the repository named "ozone"; load() returning false is treated
# as "this repository has not been configured yet".
my $repo = Data::Downloader::Repository->new(name => "ozone");
$repo->load(speculative => 1) or die "ozone repository not configured";

# Refresh every feed of the repository ...
for my $feed (@{ $repo->feeds }) {
    $feed->refresh;
}

# ... then download every file of the repository.
for my $file (@{ $repo->files }) {
    $file->download;
}
DESCRIPTION
Data::Downloader allows one to download and maintain local repositories of files. File metadata may be obtained from RSS or Atom feeds. Files are stored using MD5 sums, and symbolic links can be created based on the metadata for the files.
A command line version of Data::Downloader, dado, is also available.
Data::Downloader uses an SQLite database to store both the configuration and information about the files and the trees of symbolic links stored in a repository.
Parameters may be sent when updating feeds; parameters replace tokens in the URLs for the RSS/Atom feeds, just as in an opensearch URL template.
BACKGROUND
Several efforts are underway to extend Atom and RSS as a mechanism for distribution of scientific data. For example, datacasting provides a versatile response format as well as a client capable of graphically oriented searches for data. ESIP Discovery Services (e.g. version 1.1) are working on specifications for server-side filtering of data, based on the opensearch specification.
In addition, standards such as mrss and georss provide ways to represent structured metadata for resources which are often downloaded and organized on a local disk.
In addition to reading feeds, Data::Downloader may be used as an LRU cache which may be populated directly without querying Atom/RSS feeds.
EXAMPLE
This is an example of subscribing to flickr's mrss feed to download images. For more examples, please see dado.
# The flickr feed: where the feed lives and how to extract, from each
# entry, the file location and the metadata of interest.  <tags> in the
# feed template is a token filled in when the feed is refreshed.
my $flickr_feed = {
    name          => "flickr",
    feed_template => 'http://api.flickr.com/services/feeds/photos_public.gne?tags=<tags>&lang=en-us&format=rss_200',
    file_source   => {
        url_xpath      => 'media:content/@url',
        filename_xpath => 'media:content/@url',
        # Capture whatever follows the last "/" of the extracted URL.
        filename_regex => '/([^/]*)$',
    },
    metadata_sources => [
        { name => 'date_taken', xpath => 'dc:date.Taken' },
        { name => 'tags', xpath => 'media:category' },
    ],
};

# Apply the "split" function to the "tags" metadata, producing "tag".
my $split_tags = {
    input         => "tags",
    output        => "tag",
    function_name => "split",
};

# Two trees of symbolic links, neither restricted by a condition:
# one laid out by tag, one by the date the picture was taken.
my $by_tag = {
    root          => "/usr/local/datastore/by_tag",
    condition     => undef,
    path_template => "<tag>"
};
my $by_date = {
    root          => "/usr/local/datastore/by_date",
    condition     => undef,
    path_template => "<date_taken:%Y/%m/%d>"
};

my $images = Data::Downloader::Repository->new(
    name                     => "images",
    storage_root             => "/usr/local/datastore/data",
    cache_strategy           => "Keep",
    feeds                    => [ $flickr_feed ],
    metadata_transformations => [ $split_tags ],
    linktrees                => [ $by_tag, $by_date ],
);

# Use the stored repository if load() finds one; otherwise save this one.
$images->load(speculative => 1) or $images->save;

# Refresh each feed, substituting "apples" for the <tags> token.
for my $flickr ($images->feeds) {
    $flickr->refresh(tags => "apples");
}

$images->download_all;
SEE ALSO
dado, Data::Downloader::Config, Data::Downloader::Repository, Data::Downloader::Feed, Data::Downloader::DB, Data::Downloader::Cache, Data::Downloader::Linktree, Rose
SCHEMA
__DATA__
-- See also Rose::DB::Object::Loader
/*
* Every table below corresponds to a class named Data::Downloader::<TableName>.
* Every class has accessors and mutators whose names are the same as the column names.
* Foreign keys also beget methods in both the parent and child classes,
* e.g. $repository->files and $file->repository_obj.
*/
/*** static tables : populated once during configuration ***/
--- A repository has a cache strategy.
--- One row per repository; every other static table hangs off this one,
--- directly or through feed.
create table repository (
id integer primary key,
name varchar(255) not null unique, -- no two repositories may share a name
storage_root text not null, -- e.g. "/usr/local/datastore/data"
file_url_template text, -- NOTE(review): presumably a template for building file URLs -- confirm
cache_strategy text not null, -- e.g. "LRU" corresponds to Data::Downloader::Cache::LRU
cache_max_size integer -- in bytes
);
--- A repository has many feeds.
create table feed (
id integer primary key,
repository integer not null references repository(id), -- owning repository
name varchar(255) not null unique, -- unique over the whole table, not just within one repository
feed_template text -- feed URL; may contain tokens such as <tags> that are replaced by parameters
);
--- A repository has many disks.
create table disk (
id integer primary key,
repository integer not null references repository(id), -- owning repository
root varchar(255) not null unique -- no two disk rows may share a root; presumably a directory path -- TODO confirm
);
--- A feed has many default parameters
--- (parameters replace tokens in the feed's URL when the feed is updated).
create table feed_parameter (
id integer primary key,
feed integer not null references feed(id), -- feed the parameter belongs to
name text not null, -- presumably the name of a <token> in feed.feed_template -- TODO confirm
default_value text -- nullable
);
--- A feed has a file source.
--- The feed id doubles as the primary key, so a feed has at most one file source.
create table file_source (
feed integer primary key not null references feed(id),
url_xpath text, -- e.g. 'media:content/@url'
urn_xpath text, -- unique identifier for files
md5_xpath text,
filename_xpath text, -- e.g. 'media:content/@url'
filename_regex text -- apply to whatever is extracted from the xpath, e.g. '/([^/]*)$'
);
--- A feed has many metadata sources.
create table metadata_source (
id integer primary key,
feed integer references feed(id), -- nullable, unlike feed_parameter.feed and file_source.feed
name text not null unique, -- e.g. 'date_taken'; metadatum.name references this column
xpath text -- e.g. 'media:category'
);
--- A repository has many linktrees.
create table linktree (
id integer primary key,
repository integer references repository(id), -- nullable
root text, -- e.g. "/usr/local/datastore/by_tag"
condition text, -- SQL::Abstract clause; may be null
path_template text, -- String::Template string, e.g. "<date_taken:%Y/%m/%d>"
unique(root) -- at most one linktree per root
);
--- A repository has many metadata_transformations.
--- NOTE(review): order_key is unique over the whole table and defaults to 1,
--- so a second row inserted without an explicit order_key violates the constraint.
create table metadata_transformation (
id integer primary key,
input text, -- references metadata_source.name or this table
output text not null, -- e.g. "tag"
repository integer references repository(id), -- nullable
function_name text, -- function to apply, e.g. "split"
function_params text,
order_key integer not null default 1, -- presumably the order in which transformations are applied -- TODO confirm
unique(order_key)
);
/*** dynamic tables : populated when files are downloaded, rss feeds are updated, etc. ***/
--- A repository has at most one stat_info row.
create table stat_info (
id integer primary key,
repository integer not null references repository(id),
last_stat_update datetime, -- nullable
last_fsck datetime, -- nullable
unique(repository) -- enforces one row per repository
);
--- metadata
--- Each row is one name/value pair attached to a file.
create table metadatum (
id integer primary key,
file integer references file(id), -- file the value belongs to
name varchar(255) references metadata_source(name),
value text,
unique (file,name) -- a file has at most one value per name
);
--- A repository has many files.
--- md5, filename and urn are each unique over the whole table,
--- not just within one repository.
create table file (
id integer primary key,
repository integer not null references repository(id), -- owning repository
filename text not null,
md5 char(32), -- 32 characters, the length of a hexadecimal MD5 digest
url text,
urn text,
size integer, -- presumably in bytes, like repository.cache_max_size -- TODO confirm
on_disk integer, -- boolean
disk integer references disk(id), -- nullable
atime datetime, -- stat(file)->atime
unique(md5),
unique(filename),
unique(urn)
);
--- A file has many log entries --- (if $ENV{DATA_DOWNLOADER_GATHER_STATS} is set).
create table log_entry (
id integer primary key,
file integer not null references file(id), -- file the entry is about
requested_at datetime not null,
cache_hit integer, -- boolean
completed_at datetime, -- nullable
prog text, -- $0 (program name)
pid integer, -- $$ (process id)
uid text, -- $< (real user id)
note text -- $ENV{DATA_DOWNLOADER_LOG_NOTE}, e.g. app info
);
--- A file has many expirations
--- (each row of file_removal records one of them).
create table file_removal (
id integer primary key,
file integer not null references file(id), -- file that expired
expired_on datetime,
algorithm text, -- the expiration algorithm
prog text, -- $0 (program name)
pid integer, -- $$ (process id)
uid text, -- $< (real user id)
note text -- $ENV{DATA_DOWNLOADER_FILE_REMOVAL_NOTE}
);
--- A file has many symlinks.
create table symlink (
id integer primary key,
linkname text not null, -- linktree.path_template + file's metadata
file integer not null references file(id), -- file the link refers to
linktree integer not null references linktree(id), -- linktree the link belongs to
unique(linkname) -- a link name is recorded at most once
);
/***\
AUTHOR
Brian Duggan
Phillip Durbin
Stuart Pineo
Arnold Martin
Graham Ollis
Curt Tilmes
Michael Walters