NAME
Reshape data like R : read.table, write.table, merge, reshape2, dplyr
数据处理转换,接口山寨自R语言
TOOL
split_file.pl
split_file.pl -f xxx.csv -l 500 -t yyy
split_file.pl -f xxx.csv -i 0 -s ',' -t yyy
FUNCTION
example : xt/ 实例参考 xt 子文件夹
read_table
data : csv / arrayref / hashref, a 2-dimensional table, 二维数据表
input data -> filter (skip_sub) -> select / transform / mutate (conv_sub) -> output
输入 数据 -> 过滤 行(skip_sub) -> 抽取/转换/新增 列 (conv_sub) -> 输出
csv / arrayref data: row is $arrayref, hashref data : row is ($k, $v)
my $df = 'reshape_src.csv';
my $r = read_table($df,
#sep=>',',
charset=>'utf8',
#skip_head=>0,
skip_sub => sub {
my ($r) = @_; # csv or arrayref
# my ($k, $v) = @_; # hashref
$r->[3]<200
},
conv_sub => sub {
my ($r) = @_; # csv or arrayref
# my ($k, $v) = @_; # hashref
[ "$r->[0] $r->[1]", $r->[2], $r->[3] ]
},
#write_head => [ "head_a", "key" , "value" ],
#return_arrayref => 1,
write_file => '01.read_table.csv',
);
write_table
write data into csv 将指定数据写入文本文件
my $d = [ [qw/a b 1/], [qw/c d 2/] ];
write_table($d,
file=> 'write_table.csv',
#sep => ',',
head => [ 'ka', 'kb', 'cnt'],
charset => 'utf8',
);
melt
melt data like R reshape2
原始数据按id聚合,然后把measure的多个列映射成key-value对
#id / measure => [ 1, 2, 'somekey', sub { ... }, ], 4, 'somekey', sub { ... }
my $r = melt('reshape_src.csv',
#sep=>',',
charset => 'utf8',
skip_head => 1,
#skip_sub => sub { $_[0][3]<1000 },
names => [ qw/day hour state cnt rank/ ],
id => [ 0, 1, 2 ],
measure => [3, 4],
#measure_names => [qw/.../],
write_head => [ qw/day hour state key value/ ],
return_arrayref => 1,
melt_file => '02.melt.1.csv',
);
melt('reshape_src.csv',
skip_head => 1,
#names => [ qw/day hour state cnt rank/ ],
id => [ sub { "$_[0][0]d $_[0][1]h" } , 2 , 'test' ],
measure => [ 3, 4, sub { $_[0][3] * $_[0][4] } ],
measure_names => [qw/cnt rank cxr/],
write_head => [ qw/dayhour state somehead key value/ ],
melt_file => '02.melt.2.csv',
);
cast
cast data like R reshape2,原始数据按id聚合,根据指定的 measure(key) 分组,统计value
reduce_sub : process the value while each row is being read,在读取每一行数据的过程中,顺便处理value
stat_sub : process the value list after all rows have been read,在数据全部读取完毕后,对value列表进行最终统计
id : same as melt, 与melt相同
measure/value : return 1 value,返回单个标量
my $r = cast('02.melt.csv',
#sep => ',',
#key 有 cnt / rank 两种
names => [ qw/day hour state key value/ ],
id => [ 0, 1, 2 ],
measure => 3,
value => 4,
reduce_sub => sub { my ($last, $now) = @_; return $last+$now; },
#reduce_start_value => 0,
write_head => 1,
default_cell_value => 0,
#default_cast_value => 0,
cast_file => '03.cast.1.csv',
return_arrayref => 1,
);
cast('02.melt.csv',
sep => ',',
#names => [ qw/day hour state key value/ ], #key 有 cnt / rank 两种
id => [ sub { "$_[0][0] $_[0][1]" }, 2 ],
id_names => [ qw/dayhour state/ ],
measure => 3,
measure_names => [ qw/rank cnt/ ],
value => 4,
stat_sub => sub { my ($r) = @_; (sort { $b<=> $a } @$r)[0] },
default_cell_value => 0,
write_head => 1,
cast_file => '03.cast.2.csv',
return_arrayref => 0,
);
merge
merge 2 dataframes, 合并两个dataframe,在perl中是二层数组
my $r = merge(
[ [qw/a b 1/], [qw/c d 2/] ],
[ [qw/a b 3/], [qw/c d 4/] ],
by => [ 0, 1],
value => [2],
);
# $r = [["a", "b", 1, 3], ["c", "d", 2, 4]]
merge_file
merge 2 text files, 合并两个文本文件
$big left join $small, keeping the data of some columns
my $merge_fname = merge_file(
$small,
$big,
merge_file => "$big.merge.csv",
by_x => [ 1 ],
value_x => [0, 2],
by_y => [ 0 ],
value_y => [ 0, 1, 2, 3 ],
);
split_file
split a large file by some columns or by line count
把一个大文件按指定id或行数拆分成多个小文件
my $src_file = '06.split_file.log';
split_file($src_file, id => [ 0 ] ,
# sep => ',',
# split_file => '06.test.log',
);
split_file($src_file, line_cnt => 400);
arrange
sort rows by some method
按指定方法,将所有数据按行重新排序
my $r = arrange('reshape_src.csv',
skip_head => 1,
sep=> ',',
charset => 'utf8',
arrange_sub => sub {
$a->[4] <=> $b->[4] or
$a->[3] <=> $b->[3]
},
arrange_file => '07.arrange.csv',
return_arrayref => 1,
write_head => [ qw/day hour state cnt rank/ ],
);