DiaColloDB::Utils - diachronic collocation database, generic utilities
use DiaColloDB::Utils;
## Functions: Fcntl
$flags = PACKAGE::fcflags($flags);
$flags = PACKAGE::fcgetfl($fh);
$bool = fcread($flags);
$bool = fcwrite($flags);
$bool = fctrunc($flags);
$bool = fccreat($flags);
$fh_or_undef = fcopen($file,$flags);
## JSON: load
$data = PACKAGE::loadJsonString( $string,%opts);
$data = PACKAGE::loadJsonFile($filename_or_handle,%opts);
## JSON: save
$str = PACKAGE::saveJsonString($data);
$bool = PACKAGE::saveJsonFile($data,$filename_or_handle,%opts);
## Functions: env
\%setenv = PACKAGE::env_set(%setenv);
\%restored = PACKAGE::env_pop(%setenv);
## Functions: run
$fh_or_undef = PACKAGE::opencmd($cmd);
$bool = crun(@IPC_Run_args);
$bool = csort_to(\@sortargs, \&catcher);
$bool = csortuc_to(\@sortargs, \&catcher);
$cmd = sortCmd();
## Functions: pack filters
$len = PACKAGE::packsize($packfmt);
\&filter_sub = PACKAGE::packFilterStore($pack_template);
\&filter_sub = PACKAGE::packFilterFetch($pack_template);
## Math stuff
$log2 = log2($x);
$max2 = max2($x,$y);
$min2 = min2($x,$y);
## Functions: lists
\@l_uniq = luniq(\@l);
\@l_sorted_uniq = sluniq(\@l_sorted);
\@l_uniq = xluniq(\@l,\&keyfunc);
## Functions: regexes
$re = regex($re_str);
## Functions: html
$escaped = htmlesc($str);
## Functions: time
$hms = PACKAGE::s2hms($seconds,$sfmt="%06.3f");
$timestr = PACKAGE::s2timestr($seconds,$sfmt="%f");
$rfc_timestamp = PACKAGE->timestamp();
## Functions: file
$mtime = PACKAGE->file_mtime($file_or_fh);
$timestamp = PACKAGE->file_timestamp($file_or_fh);
$nbytes = du_file(@filenames_or_fh);
$nbytes = du_glob(@globs);
$bool = PACKAGE->copyto ($src_filename_or_array, $dstdir, %opts);
$bool = PACKAGE->copyto_a($src_filename_or_array, $dstdir, %opts);
$bool = PACKAGE->moveto ($src_filename_or_array, $dstdir, %opts);
$bool = PACKAGE->cp_a ($src, $dst);
## Utils: SI
$str = si_str($float);
## Functions: pdl: setops
$pi = CLASS::_intersect_p($p1,$p2);
$pu = CLASS::_union_p($p1,$p2);
$pneg = CLASS::_complement_p($p,$N);
$pdiff = CLASS::_setdiff_p($a,$b,$N);
## Functions: pdl
$pdl_or_undef = CLASS->readPdlFile($basename, %opts);
$bool = CLASS->writePdlFile($pdl_or_undef, $basename, %opts);
$bool = CLASS->writePdlHeader($filename, $type, $ndims, @dims);
$bool = CLASS->writeCcsHeader($filename, $itype, $vtype, $pdims, %opts);
$pdl = mmzeroes($file?, $type?, @dims, \%opts?);
$bool = mmunlink(@mmfiles);
$type = CLASS->mintype($pdl, @types);
$maxval = $type->maxval();
($vals,$counts) = $pdl->valcounts();
## Functions: temporaries
$tmpdir = CLASS->tmpdir();
$fh = CLASS->tmpfh();
$filename = CLASS->tmpfile();
\@tmparray = CLASS->tmparray($template, %opts);
\@tmparrayp = CLASS->tmparrayp($template, $packas, %opts);
\%tmphash = CLASS->tmphash($class, $template, %opts);
## Functions: parallelization
$ncores = CLASS->nCores();
$njobs = CLASS->nJobs();
$sortjobs = CLASS->sortJobs();
- Variable: @ISA
DiaColloDB::Utils inherits from Exporter and DiaColloDB::Logger.
- Variable: %EXPORT_TAGS
Exportable tags:
fcntl => [qw(fcflags fcgetfl fcread fcwrite fctrunc fccreat fcperl fcopen)],
json  => [qw(loadJsonString loadJsonFile saveJsonString saveJsonFile)],
sort  => [qw(csort_to csortuc_to sortCmd)],
run   => [qw(crun opencmd)],
env   => [qw(env_set env_push env_pop)],
pack  => [qw(packsize packsingle packFilterFetch packFilterStore)],
math  => [qw($LOG2 log2 min2 max2)],
list  => [qw(luniq sluniq xluniq)],
regex => [qw(regex)],
html  => [qw(htmlesc)],
time  => [qw(s2hms s2timestr timestamp)],
file  => [qw(file_mtime file_timestamp du_file du_glob copyto copyto_a moveto cp_a fh_flush fh_reopen)],
si    => [qw(si_str)],
pdl   => [qw(_intersect_p _union_p _complement_p _setdiff_p),
          qw(readPdlFile writePdlFile writePdlHeader writeCcsHeader mmzeroes mmtemp),
          qw(maxval mintype),
         ],
temp  => [qw($TMPDIR tmpdir tmpfh tmpfile tmparray tmparrayp tmphash)],
jobs  => [qw(nCores nJobs sortJobs)],
- Variable: @EXPORT_OK
All symbols in %EXPORT_TAGS are exportable
- Variable: @EXPORT
All symbols in %EXPORT_TAGS are exported by default.
Functions: Fcntl
- fcflags
$flags = PACKAGE::fcflags($flags);
returns Fcntl flags for symbolic string $flags
- fcgetfl
$flags = PACKAGE::fcgetfl($fh);
returns Fcntl flags for filehandle $fh
- fcread
$bool = fcread($flags);
returns true if any read-bits are set for $flags
- fcwrite
$bool = fcwrite($flags);
returns true if any write-bits are set for $flags
- fctrunc
$bool = fctrunc($flags);
returns true if truncate-bits are set for $flags
- fccreat
$bool = fccreat($flags);
returns true iff creation flag is set for $flags.
- fcperl
$str = fcperl($flags);
returns perl mode-string corresponding to $flags.
- fcopen
$fh_or_undef = fcopen($file,$flags); $fh_or_undef = fcopen($file,$flags,$mode,$perms)
opens $file with Fcntl-style flags $flags.
JSON: load
- loadJsonString
$data = PACKAGE::loadJsonString( $string,%opts); $data = PACKAGE::loadJsonString(\$string,%opts)
decodes JSON string. %opts are passed to JSON::from_json().
- loadJsonFile
$data = PACKAGE::loadJsonFile($filename_or_handle,%opts);
loads JSON data from a file or filehandle. %opts are passed to loadJsonString().
JSON: save
- saveJsonString
$str = PACKAGE::saveJsonString($data); $str = PACKAGE::saveJsonString($data,%opts);
Encode data as a JSON string. %opts are passed to JSON::to_json(), e.g. (pretty=>0, canonical=>0).
- saveJsonFile
$bool = PACKAGE::saveJsonFile($data,$filename_or_handle,%opts);
Save JSON data to a file. %opts are passed to saveJsonString().
Functions: env
- Variable: @env_stack
Stack of temporary environment variables.
- env_set
\%setenv = PACKAGE::env_set(%setenv);
Set or clear environment variables.
- env_push
\%oldvals = PACKAGE::env_push(%setenv);
Push old values for keys(%setenv) to @env_stack and calls env_set(%setenv).
- env_pop
\%restored = PACKAGE::env_pop(%setenv);
Pops the most recent variable bindings from @env_stack and restores them to the environment.
Functions: run
- opencmd
$fh_or_undef = PACKAGE::opencmd($cmd); $fh_or_undef = PACKAGE::opencmd($mode,@argv);
does log trace at level $TRACE_RUNCMD
- crun
$bool = crun(@IPC_Run_args);
wrapper for IPC::Run::run(@IPC_Run_args) with $ENV{LC_ALL}='C'
- csort_to
$bool = csort_to(\@sortargs, \&catcher);
runs system sort and feeds resulting lines to \&catcher
- csortuc_to
$bool = csortuc_to(\@sortargs, \&catcher);
runs system sort | uniq -c and feeds resulting lines to \&catcher
- sortCmd
$cmd = sortCmd(); $cmd = sortCmd($nJobs);
Returns command-line prefix (command and initial options) for GNU-like sort command. This method just returns the value of the DIACOLLO_SORT environment variable if it is set, otherwise the value of the SORT environment variable if that is set. If neither DIACOLLO_SORT nor SORT is set, it returns the string sort with the parallelization options returned by "sortJobs" appended. You can use the environment variable hooks e.g. to reduce the amount of RAM and/or CPU cores used by subordinate system sort calls by setting them appropriately, e.g.
env SORT="/bin/sort --parallel=4 --buffer-size=1G"
to request that GNU sort use at most 4 CPU cores and a maximum RAM buffer size of 1GB.
Functions: pack filters
- packsize
$len = PACKAGE::packsize($packfmt); $len = PACKAGE::packsize($packfmt,@args);
get pack-size for $packfmt with args @args
- packFilterStore
\&filter_sub = PACKAGE::packFilterStore($pack_template); \&filter_sub = PACKAGE::packFilterStore([$pack_template_store, $pack_template_fetch]); \&filter_sub = PACKAGE::packFilterStore([\&pack_code_store, \&pack_code_fetch]);
returns a DB_File-style STORE-filter sub for transparent packing of data to $pack_template
- packFilterFetch
\&filter_sub = PACKAGE::packFilterFetch($pack_template); \&filter_sub = PACKAGE::packFilterFetch([$pack_template_store, $pack_template_fetch]); \&filter_sub = PACKAGE::packFilterFetch([\&pack_code_store, \&pack_code_fetch]);
returns a DB_File-style FETCH-filter sub for transparent unpacking of data from $pack_template.
Math stuff
- Variable: $LOG2
Constant for binary logarithms.
- log2
$log2 = log2($x);
binary logarithm function.
- max2
$max2 = max2($x,$y);
- min2
$min2 = min2($x,$y);
Functions: lists
- luniq
\@l_uniq = luniq(\@l);
returns sorted list of unique defined elements of @l; @l need not be sorted.
- sluniq
\@l_sorted_uniq = sluniq(\@l_sorted);
returns unique defined elements of pre-sorted list @l_sorted.
- xluniq
\@l_uniq = xluniq(\@l,\&keyfunc);
returns elements of @l with unique defined keys according to \&keyfunc; the returned list is sorted by \&keyfunc.
Functions: regexes
- regex
$re = regex($re_str);
parses regex $re_str, which can optionally be "/"-quoted. parses modifiers /[gimsadlu]. /g modifier is parsed a la ddc (match whole word).
Functions: html
- htmlesc
$escaped = htmlesc($str);
escape an HTML string.
Functions: time
- s2hms
$hms = PACKAGE::s2hms($seconds,$sfmt="%06.3f"); ($h,$m,$s) = PACKAGE::s2hms($seconds,$sfmt="%06.3f");
convert a time value in seconds to HH:MM:SS.SSSS format
- s2timestr
$timestr = PACKAGE::s2timestr($seconds,$sfmt="%f");
convert a time value in seconds to H?M?S.SSSS format
- timestamp
$rfc_timestamp = PACKAGE->timestamp(); $rfc_timestamp = PACKAGE->timestamp($time);
Returns a UTC ISO-8601 timestamp in the format "%Y-%m-%dT%H:%M:%SZ" for the UNIX time $time.
Functions: file
- file_mtime
$mtime = PACKAGE->file_mtime($file_or_fh);
get mtime (last modification time) for $file_or_fh.
- file_timestamp
$timestamp = PACKAGE->file_timestamp($file_or_fh);
get an ISO-8601 timestamp for mtime of $file_or_fh.
- du_file
$nbytes = du_file(@filenames_or_fhs);
return number of bytes used by @filenames_or_fhs
- du_glob
$nbytes = du_glob(@globs);
return number of bytes used by files matching any $glob in @globs
- copyto
$bool = PACKAGE->copyto($src_filename_or_array, $dstdir, %opts);
Copies source file(s) to $dstdir, creating $dstdir if it doesn't already exist. Argument $src_filename_or_array may be either an ARRAY-ref of filenames to be copied or a single scalar filename. By default, files are copied using File::Copy::copy(). Options %opts:
from   => $from,     ##-- replace prefix $from in file(s) with $todir; default=undef: flat copy to $todir
method => \&method,  ##-- use CODE-ref \&method as underlying copy routine; default=\&File::Copy::copy
label  => $label,    ##-- report errors as '$label'; (default='copyto()')
- copyto_a
$bool = PACKAGE->copyto_a($src_filename_or_array, $dstdir, %opts);
Wrapper for PACKAGE->copyto($src_filename_or_array, $dstdir, %opts, method=>\&cp_a, label=>'copyto_a()').
- moveto
$bool = PACKAGE->moveto($src_filename_or_array, $dstdir, %opts);
Wrapper for PACKAGE->copyto($src_filename_or_array, $dstdir, %opts, method=>\&File::Copy::move, label=>'moveto()').
- cp_a
$bool = PACKAGE->cp_a($src,$dst);
Copies a single file $src to $dst, attempting to preserve ownership, permissions, and timestamps; used by copyto_a(). Uses File::Copy::syscopy() if available and distinct from File::Copy::copy(), otherwise first copies the file using File::Copy::copy() and subsequently propagates file attributes using the core perl functions chown(), chmod(), and utime().
- fh_flush
$fh_or_undef = PACKAGE->fh_flush($fh);
flushes filehandle $fh using its flush() method if available
- fh_reopen
$fh_or_undef = PACKAGE->fh_reopen($fh,$file);
closes and re-opens filehandle $fh; this should act as an (expensive) flush even if the system doesn't support the IO::Handle::flush method.
Utils: SI
- si_str
$str = si_str($float);
returns an SI string for $float.
Functions: pdl
- _intersect_p
$pi = CLASS::_intersect_p($p1,$p2); $pi = CLASS->_intersect_p($p1,$p2);
computes intersection of 2 piddles; undef is treated as the universal set; argument piddles MUST be sorted in ascending order.
- _union_p
$pu = CLASS::_union_p($p1,$p2); $pu = CLASS->_union_p($p1,$p2);
computes union of 2 piddles; undef is treated as the universal set; argument piddles MUST be sorted in ascending order.
- _complement_p
$pneg = CLASS::_complement_p($p,$N); $pneg = CLASS->_complement_p($p,$N);
computes complement of an index-piddle $p; undef is treated as the universal set; $N is the total number of elements in the index-universe.
- _setdiff_p
$pdiff = CLASS::_setdiff_p($a,$b,$N); $pdiff = CLASS->_setdiff_p($a,$b,$N);
index-piddle difference; undef is treated as the universal set. $N is the total number of elements in the index-universe.
- readPdlFile
$pdl_or_undef = CLASS->readPdlFile($basename, %opts);
Load or mmap a PDL file from disk using PDL::IO::FastRaw; %opts:
class => $class,  # one of qw(PDL PDL::CCS::Nd)
mmap  => $bool,   # use mapfraw() (default=1)
log   => $level,  # log-level (default=undef: off)
#...              # other keys passed to CLASS->mapfraw() or CLASS->readfraw(), respectively
- writePdlFile
$bool = CLASS->writePdlFile($pdl_or_undef, $basename, %opts);
Write a PDL file to disk using PDL::IO::FastRaw. Unlinks target file(s) if $pdl_or_undef is not defined. %opts:
log => $level,  # log-level (default=undef: off)
#...            # other keys passed to $pdl->writefraw()
- writePdlHeader
$bool = CLASS->writePdlHeader($filename, $type, $ndims, @dims);
writes a PDL::IO::FastRaw-style header to $filename; adapted from PDL::IO::FastRaw::_writefrawhdr(). Arguments:
$type   ##-- PDL::Type or integer
$ndims  ##-- number of piddle dimensions
@dims   ##-- dimension size list, piddle, or ARRAY-ref
- writeCcsHeader
$bool = CLASS->writeCcsHeader($filename, $itype, $vtype, $pdims, %opts);
writes a PDL::CCS::IO::FastRaw-style header to $filename. Arguments:
$itype,  ##-- PDL::Type for index (default: PDL::CCS::Utils::ccs_indx())
$vtype,  ##-- PDL::Type for values (default: $PDL::IO::Misc::deftype)
$pdims,  ##-- dimension piddle or ARRAY-ref
%opts    ##-- passed to PDL::CCS::Nd->newFromWhich
- mmzeroes
$pdl = mmzeroes ($file?, $type?, @dims, \%opts?); $pdl = $pdl->mmzeroes($file?, $type?, \%opts?);
create a (temporary) mmap()ed pdl using DiaColloDB::PDL::MM; wraps DiaColloDB::PDL::MM->new(). %opts:
file   => $template, ##-- file basename or File::Temp template; default='pdlXXXX'
suffix => $suffix,   ##-- File::Temp::tempfile() suffix (default='.pdl')
log    => $level,    ##-- logging verbosity (default=undef: off)
temp   => $bool,     ##-- delete on END (default: $file =~ /X{4}/)
- mmtemp
$pdl = mmtemp ($file?, $type?, @dims, \%opts?); $pdl = $pdl->mmtemp($file?, $type?, \%opts?);
Like mmzeroes(), but wraps DiaColloDB::PDL::MM->mmtemp(), implicitly setting the temp option.
- mmunlink
$bool = mmunlink(@mmfiles); $bool = mmunlink($mmpdl,@mmfiles);
unlinks file(s) generated by mmzeroes($basename) or mmtemp($basename); wraps DiaColloDB::PDL::MM::unlink().
- mintype
$type = CLASS->mintype ($pdl, @types); $type = CLASS->mintype($maxval, @types);
returns minimum PDL::Types type from @types required for representing $maxval, which in turn defaults to $maxval->max if $maxval is passed as a PDL; @types defaults to all known PDL types.
- maxval
$maxval = $type->maxval(); $maxval = CLASS::maxval($type_or_name)
returns maximum value representable by PDL::Type $type (first form) or $type_or_name (second form); really only meaningful for integer types.
- valcounts
($vals,$counts) = $pdl->valcounts();
wrapper for $pdl->flat->qsort->rle() with masking of zero-counts lifted from MUDL::PDL::Smooth.
Functions: temporaries
- Variable: $TMPDIR
Global temp directory to use. If undefined (the default), File::Spec::tmpdir() will be used.
- Variable: @TMPFILES
list of temporary files created by this process to be unlinked in an END block.
- tmpdir
$tmpdir = CLASS->tmpdir(); $tmpdir = CLASS->tmpdir($template, %opts);
in first form, get name of global tempdir ($TMPDIR || File::Spec::tmpdir()). in second form, create and return a new temporary directory via File::Temp::tempdir().
- tmpfh
$fh = CLASS->tmpfh(); $fh = CLASS->tmpfh($template_or_filename, %opts);
get a new temporary filehandle or undef on error; in list context, returns ($fh,$filename) or empty list on error. $template_or_filename defaults to "tmpXXXXX". uses File::Temp::tempfile() if $template_or_filename contains at least 4 "X" characters, otherwise uses literal $template_or_filename, honoring the DIR options in %opts, which are interpreted as for File::Temp::tempfile().
- tmpfile
$filename = CLASS->tmpfile(); $filename = CLASS->tmpfile($template, %opts);
Wrapper for tmpfh which returns only the filename.
- tmparray
\@tmparray = CLASS->tmparray($template, %opts);
ties a new temporary array via DiaColloDB::Temp::Array and returns a reference to the newly tied array. wraps tie(my @tmparray, 'DiaColloDB::Temp::Array', $tmpfilename, %opts).
- tmparrayp
\@tmparrayp = CLASS->tmparrayp($template, $packas, %opts);
ties a new temporary integer-array via DiaColloDB::PackedFile and returns a reference to the newly tied array. wraps tmpfile() and tie(my @tmparray, 'DiaColloDB::PackedFile', $tmpfilename, %opts).
- tmphash
\%tmphash = CLASS->tmphash($template, %opts);
ties a new temporary hash via DiaColloDB::Temp::Hash and returns a reference to the newly tied hash. wraps tmpfile() and tie(my %tmphash, 'DiaColloDB::Temp::Hash', $tmpfilename, %opts).
Functions: parallelization
- Variable: %NCORES
Cache for nCores() utility:
($cpuinfo_file=>$n, ...)
- nCores
$ncores = PACKAGE::nCores(); $ncores = PACKAGE::nCores($proc_cpuinfo_filename);
Returns the number of CPU cores on the system according to the file $proc_cpuinfo_filename (by default /proc/cpuinfo) if available, otherwise according to an external program if that is available, and otherwise zero. Caches result as $NCORES{$proc_cpuinfo_filename}.
- nJobs
$njobs = PACKAGE::nJobs(); $njobs = PACKAGE::nJobs($njobsRequest);
Gets non-negative number of parallel jobs (threads) for user request $njobsRequest, which defaults to the current value of the package variable $DiaColloDB::NJOBS, or -1 if it is undefined. If $njobsRequest is negative, returns the number of CPU cores on the system via nCores(). Otherwise, if (0 < $njobsRequest < 1), $njobsRequest is interpreted as the desired fraction of the number of available CPU cores to be used, and returns ($njobsRequest*nCores()). In all other cases, $njobsRequest is interpreted as an exact number of jobs to use, and is returned as int($njobs+0).
- sortJobs
$sort_parallel_option = sortJobs(); $sort_parallel_option = sortJobs($njobsRequest);
Returns an appropriate --parallel option for external sort system command calls to use $njobsRequest parallel jobs (assuming sort calling conventions as for GNU coreutils).
Bryan Jurish <>
Copyright (C) 2015-2020 by Bryan Jurish
This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.14.2 or, at your option, any later version of Perl 5 you may have available.
DiaColloDB(3pm), perl(1), ...