NAME

DiaColloDB::Utils - diachronic collocation database, generic utilities

SYNOPSIS

##========================================================================
## PRELIMINARIES

use DiaColloDB::Utils;

##========================================================================
## Functions: Fcntl

$flags = PACKAGE::fcflags($flags);
$flags = PACKAGE::fcgetfl($fh);
$bool = fcread($flags);
$bool = fcwrite($flags);
$bool = fctrunc($flags);
$bool = fccreat($flags);
$fh_or_undef = fcopen($file,$flags);

##========================================================================
## JSON: load

$data = PACKAGE::loadJsonString( $string,%opts);
$data = PACKAGE::loadJsonFile($filename_or_handle,%opts);

##========================================================================
## JSON: save

$str = PACKAGE::saveJsonString($data);
$bool = PACKAGE::saveJsonFile($data,$filename_or_handle,%opts);

##========================================================================
## Functions: env

\%setenv = PACKAGE::env_set(%setenv);
\%restored = PACKAGE::env_pop(%setenv);

##========================================================================
## Functions: run

$fh_or_undef = PACKAGE::opencmd($cmd);
$bool = crun(@IPC_Run_args);
$bool = csort_to(\@sortargs, \&catcher);
$bool = csortuc_to(\@sortargs, \&catcher);
$cmd = sortCmd();

##========================================================================
## Functions: pack filters

$len = PACKAGE::packsize($packfmt);
\&filter_sub = PACKAGE::packFilterStore($pack_template);
\&filter_sub = PACKAGE::packFilterFetch($pack_template);

##========================================================================
## Math stuff

$log2 = log2($x);
$max2 = max2($x,$y);
$min2 = min2($x,$y);

##========================================================================
## Functions: lists

\@l_uniq        = luniq(\@l);
\@l_sorted_uniq = sluniq(\@l_sorted);
\@l_uniq        = xluniq(\@l,\&keyfunc);

##========================================================================
## Functions: regexes

$re = regex($re_str);

##========================================================================
## Functions: html

$escaped = htmlesc($str);

##========================================================================
## Functions: time

$hms     = PACKAGE::s2hms($seconds,$sfmt="%06.3f");
$timestr = PACKAGE::s2timestr($seconds,$sfmt="%f");
$rfc_timestamp = PACKAGE->timestamp();

##========================================================================
## Functions: file

$mtime = PACKAGE->file_mtime($file_or_fh);
$timestamp = PACKAGE->file_timestamp($file_or_fh);

$nbytes = du_file(@filenames_or_fh);
$nbytes = du_glob(@globs);

$bool = PACKAGE->copyto  ($src_filename_or_array, $dstdir, %opts);
$bool = PACKAGE->copyto_a($src_filename_or_array, $dstdir, %opts);
$bool = PACKAGE->moveto  ($src_filename_or_array, $dstdir, %opts);
$bool = PACKAGE->cp_a    ($src_filename_or_array, $dstdir);

##========================================================================
## Utils: SI

$str = si_str($float);

##========================================================================
## Functions: pdl: setops

$pi = CLASS::_intersect_p($p1,$p2);
$pu = CLASS::_union_p($p1,$p2);
$pneg = CLASS::_complement_p($p,$N);
$pdiff = CLASS::_setdiff_p($a,$b,$N);

##========================================================================
## Functions: pdl

$pdl_or_undef = CLASS->readPdlFile($basename, %opts);
$bool = CLASS->writePdlFile($pdl_or_undef, $basename, %opts);
$bool = CLASS->writePdlHeader($filename, $type, $ndims, @dims);
$bool = CLASS->writeCcsHeader($filename, $itype, $vtype, $pdims, %opts);

$pdl  = mmzeroes($file?, $type?, @dims, \%opts?);
$bool = mmunlink(@mmfiles);

$type   = CLASS->mintype($pdl,    @types);
$maxval = $type->maxval();
($vals,$counts) = $pdl->valcounts();

##========================================================================
## Functions: temporaries

$tmpdir = CLASS->tmpdir();
$fh = CLASS->tmpfh();
$filename = CLASS->tmpfile();
\@tmparray = CLASS->tmparray($template, %opts);
\@tmparrayp = CLASS->tmparrayp($template, $packas, %opts);
\%tmphash = CLASS->tmphash($class, $template, %opts);

##========================================================================
## Functions: parallelization

$ncores   = CLASS->nCores();
$njobs    = CLASS->nJobs();
$sortjobs = CLASS->sortJobs();

DESCRIPTION

Globals

Variable: @ISA

DiaColloDB::Utils inherits from Exporter and DiaColloDB::Logger.

Variable: %EXPORT_TAGS

Exportable tags:

 fcntl => [qw(fcflags fcgetfl fcread fcwrite fctrunc fccreat fcperl fcopen)],
 json  => [qw(loadJsonString loadJsonFile saveJsonString saveJsonFile)],
 sort  => [qw(csort_to csortuc_to sortCmd)],
 run   => [qw(crun opencmd)],
 env   => [qw(env_set env_push env_pop)],
 pack  => [qw(packsize packsingle packFilterFetch packFilterStore)],
 math  => [qw($LOG2 log2 min2 max2)],
 list  => [qw(luniq sluniq xluniq)],
 regex => [qw(regex)],
 html  => [qw(htmlesc)],
 time  => [qw(s2hms s2timestr timestamp)],
 file  => [qw(file_mtime file_timestamp du_file du_glob copyto copyto_a moveto cp_a fh_flush fh_reopen)],
 si    => [qw(si_str)],
 pdl   => [qw(_intersect_p _union_p _complement_p _setdiff_p),
	   qw(readPdlFile writePdlFile writePdlHeader writeCcsHeader mmzeroes mmtemp),
	   qw(maxval mintype),
          ],
 temp  => [qw($TMPDIR tmpdir tmpfh tmpfile tmparray tmparrayp tmphash)],
 jobs  => [qw(nCores nJobs sortJobs)],
Variable: @EXPORT_OK

All symbols in %EXPORT_TAGS are exportable

Variable: @EXPORT

All symbols in %EXPORT_TAGS are exported by default.

Functions: Fcntl

fcflags
$flags = PACKAGE::fcflags($flags);

returns Fcntl flags for symbolic string $flags

fcgetfl
$flags = PACKAGE::fcgetfl($fh);

returns Fcntl flags for filehandle $fh

fcread
$bool = fcread($flags);

returns true if any read-bits are set for $flags

fcwrite
$bool = fcwrite($flags);

returns true if any write-bits are set for $flags

fctrunc
$bool = fctrunc($flags);

returns true if truncate-bits are set for $flags

fccreat
$bool = fccreat($flags);

returns true iff creation flag is set for $flags.

fcperl
$str = fcperl($flags);

returns perl mode-string corresponding to $flags.

fcopen
$fh_or_undef = fcopen($file,$flags);
$fh_or_undef = fcopen($file,$flags,$mode,$perms)

opens $file with Fcntl-style flags $flags.

JSON: load

loadJsonString
$data = PACKAGE::loadJsonString( $string,%opts);
$data = PACKAGE::loadJsonString(\$string,%opts)

decodes JSON string. %opts are passed to JSON::from_json().

loadJsonFile
$data = PACKAGE::loadJsonFile($filename_or_handle,%opts);

loads JSON data from a file or filehandle. %opts are passed to loadJsonString().

JSON: save

saveJsonString
$str = PACKAGE::saveJsonString($data);
$str = PACKAGE::saveJsonString($data,%opts);

Encode data as a JSON string. %opts are passed to JSON::to_json(), e.g. (pretty=>0, canonical=>0).

saveJsonFile
$bool = PACKAGE::saveJsonFile($data,$filename_or_handle,%opts);

Save JSON data to a file. %opts are passed to saveJsonString().

Functions: env

Variable: @env_stack

Stack of temporary environment variables.

env_set
\%setenv = PACKAGE::env_set(%setenv);

Set or clear environment variables.

env_push
\%oldvals = PACKAGE::env_push(%setenv);

Pushes old values for keys(%setenv) onto @env_stack and calls env_set(%setenv).

env_pop
\%restored = PACKAGE::env_pop(%setenv);

Pops the most recent variable bindings from @env_stack and restores them to the environment.

Functions: run

opencmd
$fh_or_undef = PACKAGE::opencmd($cmd);
$fh_or_undef = PACKAGE::opencmd($mode,@argv);

does log trace at level $TRACE_RUNCMD

crun
$bool = crun(@IPC_Run_args);

wrapper for IPC::Run::run(@IPC_Run_args) with $ENV{LC_ALL}='C'

csort_to
$bool = csort_to(\@sortargs, \&catcher);

runs system sort and feeds resulting lines to \&catcher

csortuc_to
$bool = csortuc_to(\@sortargs, \&catcher);

runs system sort | uniq -c and feeds resulting lines to \&catcher

sortCmd
$cmd = sortCmd();
$cmd = sortCmd($nJobs);

Returns command-line prefix (command and initial options) for GNU-like sort command. This method just returns the value of the DIACOLLO_SORT environment variable if it is set, otherwise the value of the SORT environment variable if that is set. If neither DIACOLLO_SORT nor SORT is set, it returns the string sort with the parallelization options returned by "sortJobs" appended. You can use the environment variable hooks e.g. to reduce the amount of RAM and/or CPU cores used by subordinate system sort calls by setting them appropriately, e.g.

env SORT="/bin/sort --parallel=4 --buffer-size=1G"

to request that GNU sort use at most 4 CPU cores and a maximum RAM buffer size of 1GB.

Functions: pack filters

packsize
$len = PACKAGE::packsize($packfmt);
$len = PACKAGE::packsize($packfmt,@args);

get pack-size for $packfmt with args @args

packFilterStore
\&filter_sub = PACKAGE::packFilterStore($pack_template);
\&filter_sub = PACKAGE::packFilterStore([$pack_template_store, $pack_template_fetch]);
\&filter_sub = PACKAGE::packFilterStore([\&pack_code_store,   \&pack_code_fetch]);

returns a DB_File-style STORE-filter sub for transparent packing of data to $pack_template

packFilterFetch
\&filter_sub = PACKAGE::packFilterFetch($pack_template);
\&filter_sub = PACKAGE::packFilterFetch([$pack_template_store, $pack_template_fetch]);
\&filter_sub = PACKAGE::packFilterFetch([\&pack_code_store,   \&pack_code_fetch]);

returns a DB_File-style FETCH-filter sub for transparent unpacking of data from $pack_template.

Math stuff

Variable: $LOG2

constant: log(2) for binary logarithms.

log2
$log2 = log2($x);

binary logarithm function.

max2
$max2 = max2($x,$y);

maximum

min2
$min2 = min2($x,$y);

minimum

Functions: lists

luniq
\@l_uniq = luniq(\@l);

returns sorted list of unique defined elements of @l; @l need not be sorted.

sluniq
\@l_sorted_uniq = sluniq(\@l_sorted);

returns unique defined elements of pre-sorted list @l_sorted.

xluniq
\@l_uniq = xluniq(\@l,\&keyfunc);

returns elements of @l with unique defined keys according to \&keyfunc (default=\&overload::StrVal); returned list is sorted by \&keyfunc.

Functions: regexes

regex
$re = regex($re_str);

parses regex $re_str, which can optionally be "/"-quoted. parses modifiers /[gimsadlu]. /g modifier is parsed a la ddc (match whole word).

Functions: html

htmlesc
$escaped = htmlesc($str);

escape an HTML string.

Functions: time

s2hms
$hms       = PACKAGE::s2hms($seconds,$sfmt="%06.3f");
($h,$m,$s) = PACKAGE::s2hms($seconds,$sfmt="%06.3f");

convert a time value in seconds to HH:MM:SS.SSSS format

s2timestr
$timestr = PACKAGE::s2timestr($seconds,$sfmt="%f");

convert a time value in seconds to H?M?S.SSSS format

timestamp
$rfc_timestamp = PACKAGE->timestamp();
$rfc_timestamp = PACKAGE->timestamp($time);

Returns a UTC ISO-8601 timestamp in the format "%Y-%m-%dT%H:%M:%SZ" for the UNIX time $time.

Functions: file

file_mtime
$mtime = PACKAGE->file_mtime($file_or_fh);

get mtime (last modification time) for $file_or_fh.

file_timestamp
$timestamp = PACKAGE->file_timestamp($file_or_fh);

get an ISO-8601 timestamp for mtime of $file_or_fh.

du_file
$nbytes = du_file(@filenames_or_fhs);

return number of bytes used by @filenames_or_fhs

du_glob
$nbytes = du_glob(@globs);

return number of bytes used by files matching any $glob in @globs

copyto
$bool = PACKAGE->copyto($src_filename_or_array, $dstdir, %opts);

Copies source file(s) to $dstdir, creating $dstdir if it doesn't already exist. Argument $src_filename_or_array may be either an ARRAY-ref of filenames to be copied or a single scalar filename. By default, files are copied using File::Copy::copy(). Options %opts:

from   => $from,      ##-- replace prefix $from in file(s) with $dstdir; default=undef: flat copy to $dstdir
method => \&method,   ##-- use CODE-ref \&method as underlying copy routine; default=\&File::Copy::copy
label  => $label,     ##-- report errors as '$label'; (default='copyto()')
copyto_a
$bool = PACKAGE->copyto_a($src_filename_or_array, $dstdir, %opts);

Wrapper for PACKAGE->copyto($src_filename_or_array, $dstdir, %opts, method=>\&cp_a, label=>'copyto_a()').

moveto
$bool = PACKAGE->moveto($src_filename_or_array, $dstdir, %opts);

Wrapper for PACKAGE->copyto($src_filename_or_array, $dstdir, %opts, method=>\&File::Copy::move, label=>'moveto()').

cp_a
$bool = PACKAGE->cp_a($src,$dst);

Copies a single file $src to $dst, attempting to preserve ownership, permissions, and timestamps; used by copyto_a(). Uses File::Copy::syscopy() if available and distinct from File::Copy::copy(), otherwise first copies the file using File::Copy::copy() and subsequently propagates file attributes using the core perl functions chown(), chmod(), and utime().

fh_flush
$fh_or_undef = PACKAGE->fh_flush($fh);

flushes filehandle $fh using its flush() method if available

fh_reopen
$fh_or_undef = PACKAGE->fh_reopen($fh,$file);

closes and re-opens filehandle $fh on $file; this should act as an (expensive) flush even if the system doesn't support the IO::Handle::flush method.

Utils: SI

si_str
$str = si_str($float);

returns an SI string for $float.

Functions: pdl

_intersect_p
$pi = CLASS::_intersect_p($p1,$p2);
$pi = CLASS->_intersect_p($p1,$p2);

computes intersection of 2 piddles; undef is treated as the universal set; argument piddles MUST be sorted in ascending order.

_union_p
$pu = CLASS::_union_p($p1,$p2);
$pu = CLASS->_union_p($p1,$p2);

computes union of 2 piddles; undef is treated as the universal set; argument piddles MUST be sorted in ascending order.

_complement_p
$pneg = CLASS::_complement_p($p,$N);
$pneg = CLASS->_complement_p($p,$N);

computes complement of an index-piddle $p; undef is treated as the universal set; $N is the total number of elements in the index-universe.

_setdiff_p
$pdiff = CLASS::_setdiff_p($a,$b,$N);
$pdiff = CLASS->_setdiff_p($a,$b,$N);

index-piddle difference; undef is treated as the universal set. $N is the total number of elements in the index-universe.

readPdlFile
$pdl_or_undef = CLASS->readPdlFile($basename, %opts);

Load or mmap a PDL file from disk using PDL::IO::FastRaw; %opts:

class=>$class,    # one of qw(PDL PDL::CCS::Nd)
mmap =>$bool,     # use mapfraw() (default=1)
log  =>$level,    # log-level (default=undef: off)
#...              # other keys passed to CLASS->mapfraw() rsp. CLASS->readfraw()
writePdlFile
$bool = CLASS->writePdlFile($pdl_or_undef, $basename, %opts);

Write a PDL file to disk using PDL::IO::FastRaw. Unlinks target file(s) if $pdl_or_undef is not defined. %opts:

log => $level,      # log-level (default=undef: off)
#...                # other keys passed to $pdl->writefraw()
writePdlHeader
$bool = CLASS->writePdlHeader($filename, $type, $ndims, @dims);

writes a PDL::IO::FastRaw-style header $filename (e.g. "pdl.hdr"); adapted from PDL::IO::FastRaw::_writefrawhdr(). Arguments:

$type   ##-- PDL::Type or integer
$ndims  ##-- number of piddle dimensions
@dims   ##-- dimension size list, piddle, or ARRAY-ref
writeCcsHeader
$bool = CLASS->writeCcsHeader($filename, $itype, $vtype, $pdims, %opts);

writes a PDL::CCS::IO::FastRaw-style header $filename (e.g. "pdl.hdr"). Arguments:

$itype,          ##-- PDL::Type for index (default: PDL::CCS::Utils::ccs_indx())
$vtype,          ##-- PDL::Type for values (default: $PDL::IO::Misc::deftype)
$pdims,          ##-- dimension piddle or ARRAY-ref
%opts            ##-- passed to PDL::CCS::Nd->newFromWhich
mmzeroes
$pdl = mmzeroes      ($file?, $type?, @dims, \%opts?);
$pdl = $pdl->mmzeroes($file?, $type?,        \%opts?);

create a (temporary) mmap()ed pdl using DiaColloDB::PDL::MM; wraps DiaColloDB::PDL::MM->new(). %opts:

file => $template,   ##-- file basename or File::Temp template; default='pdlXXXX'
suffix => $suffix,   ##-- File::Temp::tempfile() suffix (default='.pdl')
log  => $level,      ##-- logging verbosity (default=undef: off)
temp => $bool,       ##-- delete on END (default: $file =~ /X{4}/)
mmtemp
$pdl = mmtemp      ($file?, $type?, @dims, \%opts?);
$pdl = $pdl->mmtemp($file?, $type?,        \%opts?);

Like mmzeroes(), but wraps DiaColloDB::PDL::MM->mmtemp(), implicitly setting $opts->{temp}=1.

mmunlink
$bool = mmunlink(@mmfiles);
$bool = mmunlink($mmpdl,@mmfiles);

unlinks file(s) generated by mmzeroes($basename) or mmtemp($basename); wraps DiaColloDB::PDL::MM::unlink().

mintype
$type = CLASS->mintype    ($pdl,    @types);
$type = CLASS->mintype($maxval, @types);

returns the minimum PDL::Types type from @types required for representing $maxval; if a PDL $pdl is passed as the first argument instead, $maxval defaults to $pdl->max. @types defaults to all known PDL types.

maxval
$maxval = $type->maxval();
$maxval = CLASS::maxval($type_or_name)

returns maximum value representable by PDL::Type $type (first form) or $type_or_name (second form); really only meaningful for integer types.

valcounts
($vals,$counts) = $pdl->valcounts();

wrapper for $pdl->flat->qsort->rle() with masking of zero-counts lifted from MUDL::PDL::Smooth.

Functions: temporaries

Variable: $TMPDIR

Global temp directory to use. If undefined (the default), File::Spec::tmpdir() will be used.

Variable: @TMPFILES

list of temporary files created by this process to be unlinked in an END block.

tmpdir
$tmpdir = CLASS->tmpdir();
$tmpdir = CLASS->tmpdir($template, %opts);

in first form, get name of global tempdir ($TMPDIR || File::Spec::tmpdir()). in second form, create and return a new temporary directory via File::Temp::tempdir().

tmpfh
$fh = CLASS->tmpfh();
$fh = CLASS->tmpfh($template_or_filename, %opts);

get a new temporary filehandle or undef on error; in list context, returns ($fh,$filename) or empty list on error. $template_or_filename defaults to "tmpXXXXX". uses File::Temp::tempfile() if $template_or_filename contains at least 4 "X" characters, otherwise uses literal $template_or_filename, honoring the DIR, TMPDIR, SUFFIX, and UNLINK options in %opts, which are interpreted as for File::Temp::tempfile().

tmpfile
$filename = CLASS->tmpfile();
$filename = CLASS->tmpfile($template, %opts);

Wrapper for tmpfh which returns only the filename.

tmparray
\@tmparray = CLASS->tmparray($template, %opts);

ties a new temporary array via DiaColloDB::Temp::Array and returns a reference to the newly tied array. wraps tmpfile($template,%opts) and tie(my @tmparray, 'DiaColloDB::Temp::Array', $tmpfilename, %opts).

tmparrayp
\@tmparrayp = CLASS->tmparrayp($template, $packas, %opts);

ties a new temporary integer-array via DiaColloDB::PackedFile and returns a reference to the newly tied array. wraps tmpfile($template,%opts) and tie(my @tmparray, 'DiaColloDB::PackedFile', $tmpfilename, %opts).

tmphash
\%tmphash = CLASS->tmphash($template, %opts);

ties a new temporary hash via DiaColloDB::Temp::Hash and returns a reference to the newly tied hash. wraps tmpfile($template,%opts) and tie(my %tmphash, 'DiaColloDB::Temp::Hash', $tmpfilename, %opts).

Functions: parallelization

Variable: %NCORES

Cache for nCores() utility: ($cpuinfo_file=>$n, ...)

nCores
$ncores = PACKAGE::nCores();
$ncores = PACKAGE::nCores($proc_cpuinfo_filename);

Returns the number of CPU cores on the system according to the file $proc_cpuinfo_filename (by default /proc/cpuinfo) if available, otherwise according to the external program nproc if that is available, and otherwise zero. Caches result as $NCORES{$proc_cpuinfo_filename}.

nJobs
$njobs = PACKAGE::nJobs();
$njobs = PACKAGE::nJobs($njobsRequest);

Gets non-negative number of parallel jobs (threads) for user request $njobsRequest, which defaults to the current value of the package variable $DiaColloDB::NJOBS, or -1 if it is undefined.

If $njobsRequest is negative, returns the number of CPU cores on the system via nCores(). Otherwise, if (0 < $njobsRequest < 1), $njobsRequest is interpreted as the desired fraction of the number of available CPU cores to be used, and returns ($njobsRequest*nCores()). In all other cases, $njobsRequest is interpreted as an exact number of jobs to use, and is returned as int($njobsRequest+0).

sortJobs
$sort_parallel_option = sortJobs();
$sort_parallel_option = sortJobs($njobsRequest);

Returns an appropriate --parallel option for external sort system command calls to use $njobsRequest parallel jobs (assuming sort calling conventions as for GNU coreutils).

AUTHOR

Bryan Jurish <moocow@cpan.org>

COPYRIGHT AND LICENSE

Copyright (C) 2015-2020 by Bryan Jurish

This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.14.2 or, at your option, any later version of Perl 5 you may have available.

SEE ALSO

DiaColloDB(3pm), perl(1), ...