next_data_hash

About   : Reads next VCF line and splits it into a hash. This is the slowest way to obtain the data.
Usage   : my $vcf = VCF->new();
          $vcf->parse_header();
          my $x = $vcf->next_data_hash();

          # Or having a VCF data line $line
          my $x = $vcf->next_data_hash($line);

Args    : Optional line to parse.

parse_header

About   : Reads (and stores) the VCF header.
Usage   : my $vcf = VCF->new(); $vcf->parse_header();
Args    : silent .. do not warn about duplicate header lines

_next_header_line

About   : Stores the header lines and meta information, such as fields types, etc.
Args    : silent .. do not warn about duplicate column names

get_header_line

Usage   : $vcf->get_header_line(key=>'INFO', ID=>'AC')
          $vcf->get_header_line(key=>'FILTER', ID=>'q10')
          $vcf->get_header_line(key=>'reference')
          $vcf->get_header_line(key=>'contig',ID=>'20')
Args    : Header line filter as in the example above
Returns : List ref of header line hashes matching the filter

add_header_line

Usage   : $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
          $vcf->add_header_line({key=>'reference',value=>'1000GenomesPilot-NCBI36'})
Args    : Header line hash as in the example above
          Hash with additional parameters [optional]
            silent .. do not warn about existing header keys
            append .. append timestamp to the name of the new one
Returns :

remove_header_line

Usage   : $vcf->remove_header_line(key=>'INFO', ID=>'AC')
Args    :
Returns :

parse_header_line

Usage   : $vcf->parse_header_line(q[##reference=1000GenomesPilot-NCBI36])
          $vcf->parse_header_line(q[##INFO=NS,1,Integer,"Number of Samples With Data"])
Args    :
Returns :

_read_column_names

About   : Stores the column names as array $$self{columns} and hash $$self{has_column}{COL_NAME}=index.
          The indexes go from 1.
Usage   : $vcf->_read_column_names();
Args    : none

_fake_column_names

About   : When no header is present, fake column names as the default mandatory ones + numbers
Args    : The number of genotype columns; 0 if no genotypes but FORMAT present; <0 if FORMAT and genotypes not present

format_header

About   : Returns the header.
Usage   : print $vcf->format_header();
Args    : The columns to include on output [optional]

format_line

About   : Returns the VCF data line formatted as a string.
Usage   : $x = $vcf->next_data_hash(); print $vcf->format_line($x);
          $x = $vcf->next_data_array(); print $vcf->format_line($x);
Args 1  : The columns or hash in the format returned by next_data_hash or next_data_array.
     2  : The columns to include [optional]

recalc_ac_an

About   : Control if the AC and AN values should be updated.
Usage   : $vcf->recalc_ac_an(1); $x = $vcf->next_data_hash(); print $vcf->format_line($x);
Args 1  : 0 .. never recalculate
          1 .. recalculate if present
          2 .. recalculate if present and add if missing

get_tag_index

Usage   : my $idx = $vcf->get_tag_index('GT:PL:DP:SP:GQ','PL',':');
Arg 1   : Field
    2   : The tag to find
    3   : Tag separator
Returns : Index of the tag or -1 when not found

remove_field

Usage   : my $field = $vcf->remove_field('GT:PL:DP:SP:GQ',1,':');    # returns 'GT:DP:SP:GQ'
Arg 1   : Field
    2   : The index of the field to remove
    3   : Field separator
Returns : Modified string

replace_field

Usage   : my $col = $vcf->replace_field('GT:PL:DP:SP:GQ','XX',1,':');    # returns 'GT:XX:DP:SP:GQ'
Arg 1   : Field
    2   : Replacement
    3   : 0-based index of the field to replace
    4   : Field separator
Returns : Modified string

get_info_field

Usage   : my $line  = $vcf->next_line;
          my @items = split(/\t/,$line);
          $af = $vcf->get_info_field('DP=14;AF=0.5;DB','AF');    # returns 0.5
          $af = $vcf->get_info_field('DP=14;AF=0.5;DB','DB');    # returns 1
          $af = $vcf->get_info_field('DP=14;AF=0.5;DB','XY');    # returns undef
Arg 1   : The INFO field string
    2   : The tag to retrieve
Returns : undef when tag is not present, the tag value if present, or 1 if flag is present

get_field

Usage   : my $line  = $vcf->next_line;
          my @items = split(/\t/,$line);
          my $idx = $vcf->get_tag_index($items[8],'PL',':');
          my $pl  = $vcf->get_field($items[9],$idx) unless $idx==-1;
Arg 1   : The column string (e.g. a single sample column) to extract the field from
    2   : The index of the field to retrieve
    3   : The delimiter [Default is ':']
Returns : The tag value

get_sample_field

Usage   : my $line  = $vcf->next_line;
          my @items = split(/\t/,$line);
          my $idx = $vcf->get_tag_index($items[8],'PL',':');
          my $pls = $vcf->get_sample_field(\@items,$idx) unless $idx==-1;
Arg 1   : The VCF line broken into an array
    2   : The index of the field to retrieve
Returns : Array of values

split_mandatory

About   : Faster alternative to regexes; extracts the mandatory columns
Usage   : my $line=$vcf->next_line; my @cols = $vcf->split_mandatory($line);
Arg     :
Returns : Pointer to the array of values

split_gt

About   : Faster alternative to regexes
Usage   : my ($a1,$a2,$a3) = $vcf->split_gt('0/0/1'); # returns (0,0,1)
Arg     : Diploid genotype to split into alleles
Returns : Array of values

split_by

About   : Generalization of split_gt
Usage   : my ($a1,$a2,$a3) = $vcf->split_by('0/0|1',qw(| /)); # returns (0,0,1)
Arg     : The string to split, followed by the list of separator characters
Returns : Array of values

decode_genotype

About   : Faster alternative to regexes
Usage   : my $gt = $vcf->decode_genotype('G',['A','C'],'0/0'); # returns 'G/G'
Arg   1 : Ref allele
      2 : Alt alleles
      3 : The genotype to decode
Returns : Decoded GT string

validate_alt_field

Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_alt_field($$x{ALT});
Args    : The ALT arrayref
Returns : Error message in case of an error.

event_type

Usage   :   my $x = $vcf->next_data_hash();
            my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
            for my $allele (@$alleles)
            {
                my ($type,$len,$ht) = $vcf->event_type($x,$allele);
            }
          or
            my ($type,$len,$ht) = $vcf->event_type($ref,$al);
Args    : VCF data line parsed by next_data_hash or the reference allele
        : Allele
Returns :   's' for SNP and number of SNPs in the record
            'i' for indel and a positive (resp. negative) number for the length of insertion (resp. deletion)
            'r' identical to the reference, length 0
            'o' for other (complex events) and the number of affected bases
            'b' breakend
            'u' unknown

has_AGtags

About   : Checks the header for the presence of tags with variable number of fields (Number=A or Number=G, such as GL)
Usage   : $vcf->parse_header(); my $agtags = $vcf->has_AGtags();
Args    : None
Returns : Hash {fmtA=>[tags],fmtG=>[tags],infoA=>[tags],infoG=>[tags]} or undef if none is present

parse_AGtags

About   : Breaks tags with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) into hashes
Usage   : my $x = $vcf->next_data_hash(); my $values = $vcf->parse_AGtags($x);
Args    : VCF data line parsed by next_data_hash
        : Mapping between ALT representations based on different REFs [optional]
        : New REF [optional]
Returns : Hash {Allele=>Value}

format_AGtag

About   : Format tag with variable number of fields (that is where Number is set to 'A' or 'G', such as GL)
Usage   :
Args    :
        :
        :
Returns :

parse_alleles

About   : Deprecated, use parse_haplotype instead.
Usage   : my $x = $vcf->next_data_hash(); my ($al1,$sep,$al2) = $vcf->parse_alleles($x,'NA00001');
Args    : VCF data line parsed by next_data_hash
        : The genotype column name
Returns : Alleles and the separator. If only one allele is present, $sep and $al2 will be an empty string.

parse_haplotype

About   : Similar to parse_alleles, but also supports multiploid VCFs.
Usage   : my $x = $vcf->next_data_hash(); my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
Args    : VCF data line parsed by next_data_hash
        : The genotype column name
Returns : Two array refs and two boolean flags: List of alleles, list of separators, and is_phased/empty flags. The values
            can be cached and must therefore be considered read-only!

format_haplotype

Usage   : my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); print $vcf->format_haplotype($alleles,$seps);

format_genotype_strings

Usage   : my $x = { REF=>'A', gtypes=>{'NA00001'=>{'GT'=>'A/C'}}, FORMAT=>['GT'], CHROM=>1, POS=>1, FILTER=>['.'], QUAL=>-1 };
          $vcf->format_genotype_strings($x);
          print $vcf->format_line($x);
Args 1  : VCF data line in the format as if parsed by next_data_hash with alleles written as letters.
     2  : Optionally, a subset of columns can be supplied. See also format_line.
Returns : Modifies the ALT array and the genotypes so that ref alleles become 0 and non-ref alleles
            numbers starting from 1. If the key $$vcf{trim_redundant_ALTs} is set, ALT alleles not appearing
            in any of the sample columns will be removed.

format_header_line

Usage   : $vcf->format_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
Args    :
Returns :

remove_columns

Usage   : my $rec=$vcf->next_data_hash(); $vcf->remove_columns($rec,remove=>['NA001','NA0002']);
Args    : VCF hash pointer
        : list of columns to remove or a lookup hash with column names to keep (remove=>[] or keep=>{})
Returns :

add_columns

Usage   : $vcf->add_columns('NA001','NA0002');
Args    :
Returns :

add_format_field

Usage   : $x=$vcf->next_data_hash(); $vcf->add_format_field($x,'FOO'); $$x{gtypes}{NA0001}{FOO}='Bar'; print $vcf->format_line($x);
Args    : The record obtained by next_data_hash
        : The field name
Returns :

remove_format_field

Usage   : $x=$vcf->next_data_hash(); $vcf->remove_format_field($x,'FOO'); print $vcf->format_line($x);
Args    : The record obtained by next_data_hash
        : The field name
Returns :

add_info_field

Usage   : $x=$vcf->next_data_array(); $$x[7]=$vcf->add_info_field($$x[7],'FOO'=>'value','BAR'=>undef,'BAZ'=>''); print join("\t",@$x)."\n";
Args    : The record obtained by next_data_array
        : The INFO field name and value pairs. If value is undef and the key is present in $$x[7],
            it will be removed. To add fields without a value, use empty string ''.
Returns : The formatted INFO.

add_filter

Usage   : $x=$vcf->next_data_array(); $$x[6]=$vcf->add_filter($$x[6],'SnpCluster'=>1,'q10'=>0); print join("\t",@$x)."\n";
Args    : The record obtained by next_data_array or next_data_hash
        : The key-value pairs for filter to be added. If value is 1, the filter will be added. If 0, the filter will be removed.
Returns : The formatted filter field.

validate_filter_field

Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_filter_field($$x{FILTER});
Args    : The FILTER arrayref
Returns : Error message in case of an error.

validate_header

About   : Version specific header validation code.
Usage   : my $vcf = VCF->new(); $vcf->parse_header(); $vcf->validate_header();
Args    :

validate_line

About   : Version specific line validation code.
Usage   : my $vcf = VCF->new(); $vcf->parse_header(); $x = $vcf->next_data_hash; $vcf->validate_line($x);
Args    :

validate_info_field

Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_info_field($$x{INFO},$$x{ALT});
Args    : The INFO hashref
Returns : Error message in case of an error.

validate_gtype_field

Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_gtype_field($$x{gtypes}{NA00001},$$x{ALT},$$x{FORMAT});
Args    : The genotype data hashref
          The ALT arrayref
Returns : Error message in case of an error.

run_validation

About   : Validates the VCF file.
Usage   : my $vcf = VCF->new(file=>'file.vcf'); $vcf->run_validation('example.vcf.gz');
Args    : File name or file handle.

get_chromosomes

About   : Get list of chromosomes from the VCF file. Must be bgzipped and tabix indexed.
Usage   : my $vcf = VCF->new(); $vcf->get_chromosomes();
Args    : none

get_samples

About   : Get list of samples.
Usage   : my $vcf = VCF->new(); $vcf->parse_header(); my (@samples) = $vcf->get_samples();
Args    : none

get_column

About   : Convenient way to get data for a sample
Usage   : my $rec = $vcf->next_data_array(); my $sample_col = $vcf->get_column($rec, 'NA0001');
Args 1  : Array pointer returned by next_data_array
     2  : Column/Sample name

get_column_name

About   : Mapping between zero-based VCF column and its name
Usage   : my $vcf = VCF->new(); $vcf->parse_header(); my $name = $vcf->get_column_name(1); # returns POS
Args    : Index of the column (0-based)

get_column_index

About   : Mapping between VCF column name and its zero-based index
Usage   : my $vcf = VCF->new(); $vcf->parse_header(); my $idx = $vcf->get_column_index('POS'); # returns 1
Args    : Name of the column