• Headers are required.

  • The first column of each clinical data file must contain sample IDs which match those in both the --bam-list and the MAF variant list (in the MAF, this is the Tumor_Sample_Barcode column, specifically).

  • In at least one of the clinical data files input, columns with headers "vital_status" and "days_to_last_followup" (case insensitive) must exist. "vital_status" must be delineated by 1's and 0's, where 0 denotes 'living', and 1 denotes 'deceased'.

Note that all input files must be tab-separated.

HELP }

sub _additional_help_sections { return ( "ARGUMENTS", <<EOS

--bam-list
Provide a file containing sample names and normal/tumor BAM locations for each. Use the tab- delimited format [sample_name normal_bam tumor_bam] per line. This tool only needs sample_name, so all other columns can be skipped. The sample_name must be the same as the tumor sample names used in the MAF file (16th column, with the header Tumor_Sample_Barcode).

EOS ); }

sub _doc_authors { return <<EOS Nathan D. Dees, Ph.D. Qunyuan Zhang, Ph.D. EOS }

sub execute {

# parse input arguments
my $self = shift;
my $bam_list = $self->bam_list;
my $genetic_data_type = $self->genetic_data_type;
my $legend_placement = $self->legend_placement;

# handle phenotype inclusions
my @phenotypes_to_include;
my @clinical_phenotypes_to_include;
my @mutated_genes_to_include;
if ($self->phenotypes_to_include) { @phenotypes_to_include = split /,/,$self->phenotypes_to_include; }

# check genetic data type
unless ($genetic_data_type =~ /^gene|variant$/i) {
    $self->error_message("Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter.");
    return;
}

# load clinical data and analysis types
my %clinical_data;
if ($self->numeric_clinical_data_file) {
    $clinical_data{'numeric'} = $self->numeric_clinical_data_file;
}
if ($self->categorical_clinical_data_file) {
    $clinical_data{'categ'} = $self->categorical_clinical_data_file;
}
if ($self->glm_clinical_data_file) {
    $clinical_data{'glm'} = $self->glm_clinical_data_file;
}

# create array of all sample names possibly included from clinical data and MAF
my @all_sample_names; # names of all the samples, no matter if they are mutated or not
my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
while( my $line = $sampleFh->getline )
{
    next if ( $line =~ m/^#/ );
    chomp( $line );
    my ( $sample ) = split( /\t/, $line );
    push( @all_sample_names, $sample );
}
$sampleFh->close;

# loop through clinical data files and assemble survival data hash (vital_status and days_to_last_followup required);
my %survival_data;
my $vital_status_flag = 0;
my $days_to_last_follow_flag = 0;

for my $clin_file (keys %clinical_data) {

    #check filehandle
    my $clin_fh = new IO::File $clinical_data{$clin_file},"r";
    unless ($clin_fh) {
        $self->error_message("Failed to open $clinical_data{$clin_file} for reading: $!");
        return;
    }

    #initiate variables to hold column info
    my %phenotypes_to_print;
    my $vital_status_col = 0;
    my $days_to_last_follow_col = 0;

    #parse header and record column locations for needed data
    my $header = $clin_fh->getline;
    my @header_fields = split /\t/,$header;
    for (my $i = 1; $i <= $#header_fields; $i++) { #sample ID should be in first column of file
        my $field = $header_fields[$i];
        if ($field =~ /vital_status|vitalstatus/i) { $vital_status_col = $i; $vital_status_flag++; }
        if ($field =~ /days_to_last_(follow_up|followup)|daystolastfollowup/i) { $days_to_last_follow_col = $i; $days_to_last_follow_flag++; }
        if (scalar grep { /^$field$/i } @phenotypes_to_include) { $phenotypes_to_print{$field} = $i; }
    }

    #read through clinical data file and store needed data in a hash
    while (my $line = $clin_fh->getline) {
        chomp $line;
        my @fields = split /\t/,$line;
        my $sample = $fields[0];
        unless (scalar grep { m/^$sample$/ } @all_sample_names) {
            $self->status_message("Skipping sample $sample. (Sample is not in --bam-list).");
            next;
        }
        if ($vital_status_col) {
            my $vital_status;
            if ($fields[$vital_status_col] =~ /^(0|living)$/i) { $vital_status = 0; }
            elsif ($fields[$vital_status_col] =~ /^(1|deceased)$/i) { $vital_status = 1; }
            else { $vital_status = "NA"; }
            $survival_data{$sample}{'vital_status'} = $vital_status;
        }
        if ($days_to_last_follow_col) { $survival_data{$sample}{'days'} = $fields[$days_to_last_follow_col]; }
        for my $pheno (keys %phenotypes_to_print) { $survival_data{$sample}{$pheno} = $fields[$phenotypes_to_print{$pheno}]; }
    }
    $clin_fh->close;

    # record phenotypes included from clinical data
    push @clinical_phenotypes_to_include, keys %phenotypes_to_print;

}

# check for necessary header fields
unless ($vital_status_flag) {
    $self->error_message('Clinical data does not seem to contain a column labeled "vital_status".');
    return;
}
unless ($days_to_last_follow_flag) {
    $self->error_message('Clnical data does not seem to contain a column labeled "days_to_last_followup".');
    return;
}

# create temporary files for R command
my $survival_data_file = Genome::Sys->create_temp_file_path();
my $mutation_matrix = Genome::Sys->create_temp_file_path();

# print survival data (temp file)
my $surv_fh = new IO::File $survival_data_file,"w" or die "Couldn't open survival data filehandle.";
print $surv_fh join("\t","Sample","Days_To_Last_Followup","Vital_Status");
if (@clinical_phenotypes_to_include) { print $surv_fh "\t" . join("\t",@clinical_phenotypes_to_include); }
print $surv_fh "\n";
for my $sample (keys %survival_data) {
    unless (exists $survival_data{$sample}{'days'}) { $survival_data{$sample}{'days'} = "NA"; }
    unless (exists $survival_data{$sample}{'vital_status'}) { $survival_data{$sample}{'vital_status'} = "NA"; }
    print $surv_fh join("\t",$sample,$survival_data{$sample}{'days'},$survival_data{$sample}{'vital_status'});
    for my $pheno (@clinical_phenotypes_to_include) {
        unless (exists $survival_data{$sample}{$pheno}) { $survival_data{$sample}{$pheno} = "NA"; }
        print $surv_fh "\t" . $survival_data{$sample}{$pheno};
    }
    print $surv_fh "\n";
}
$surv_fh->close;

# find if any of the "phenotypes_to_include" are genes, and if so, limit the MAF mutation matrix to those genes
my %clinical_pheno_to_include;
@clinical_pheno_to_include{@clinical_phenotypes_to_include} = ();
for my $item (@phenotypes_to_include) {
    push @mutated_genes_to_include,$item unless exists $clinical_pheno_to_include{$item};
}
my $mutated_genes_to_include = \@mutated_genes_to_include;

# create mutation matrix file
if( $genetic_data_type =~ /^gene$/i ) {
    $self->create_sample_gene_matrix_gene( $mutation_matrix, $mutated_genes_to_include, @all_sample_names );
}
elsif( $genetic_data_type =~ /^variant$/i ) {
    $self->create_sample_gene_matrix_variant( $mutation_matrix, $mutated_genes_to_include, @all_sample_names );
}
else {
    $self->error_message( "Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter." );
    return;
}

# check and prepare output directory
my $output_dir = $self->output_dir . "/";
unless (-e $output_dir) {
    $self->status_message("Creating output directory: $output_dir...");
    unless(mkdir $output_dir) {
        $self->error_message("Failed to create output directory: $!");
        return;
    }
}

# set up R command
my $R_cmd = "R --slave --args < " . __FILE__ . ".R " . join( " ", $survival_data_file, $mutation_matrix, $legend_placement, $output_dir );
print "R_cmd:\n$R_cmd\n";

#run R command
WIFEXITED( system $R_cmd ) or croak "Couldn't run: $R_cmd ($?)";

return(1);
}

sub create_sample_gene_matrix_gene {

my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_;

#create hash of mutations from the MAF file
my ( %mutations, %all_genes );

#parse the MAF file and fill up the mutation status hashes
my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
while( my $line = $maf_fh->getline ) {
    next if( $line =~ m/^(#|Hugo_Symbol)/ );
    chomp $line;
    my @cols = split( /\t/, $line );
    my ( $gene, $mutation_class, $sample ) = @cols[0,8,15];

    #check that the mutation class is valid
    if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
        $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
        return;
    }

    #check to see if this gene is on the list (if there is a list at all)
    if( defined @{$mutated_genes_to_include} ) {
        next unless( scalar grep { m/^$gene$/ } @{$mutated_genes_to_include} );
    }

    # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
    if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
       ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
        print "Skipping $mutation_class mutation in gene $gene.\n";
        next;
    }

    $all_genes{$gene}++;
    $mutations{$sample}{$gene}++;
}
$maf_fh->close;

#sort @all_genes for consistency in header and loops
my @all_genes = sort keys %all_genes;

#write the input matrix for R code to a temp file
my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!";

#print input matrix file header
my $header = join( "\t", "Sample", @all_genes );
$matrix_fh->print( "$header\n" );

#print mutation relation input matrix
for my $sample ( sort @all_sample_names ) {
    $matrix_fh->print( $sample );
    for my $gene ( @all_genes ) {
        if( exists $mutations{$sample}{$gene} ) {
            $matrix_fh->print( "\t$mutations{$sample}{$gene}" );
        }
        else {
            $matrix_fh->print( "\t0" );
        }
    }
    $matrix_fh->print( "\n" );
}
}

sub create_sample_gene_matrix_variant {

my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_;

#create hash of mutations from the MAF file
my ( %variants_hash, %all_variants );

#parse the MAF file and fill up the mutation status hashes
my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
while( my $line = $maf_fh->getline ) {
    next if( $line =~ m/^(#|Hugo_Symbol)/ );
    chomp $line;
    my @cols = split( /\t/, $line );
    my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15];

    #check that the mutation class is valid
    if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
        $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
        return;
    }

    #check to see if this gene is on the list (if there is a list at all)
    if( defined @{$mutated_genes_to_include} ) {
        next unless (scalar grep { m/^$gene$/ } @{$mutated_genes_to_include});
    }

    # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
    if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
       ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
        print "Skipping $mutation_class mutation in gene $gene.\n";
        next;
    }

    my $var;
    my $variant_name;
    if( $ref eq $var1 ) {
        $var = $var2;
        $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
        $variants_hash{$sample}{$variant_name}++;
        $all_variants{$variant_name}++;
    }
    elsif( $ref eq $var2 ) {
        $var = $var1;
        $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
        $variants_hash{$sample}{$variant_name}++;
        $all_variants{$variant_name}++;
    }
    elsif( $ref ne $var1 && $ref ne $var2 ) {
        $var = $var1;
        $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
        $variants_hash{$sample}{$variant_name}++;
        $all_variants{$variant_name}++;
        $var = $var2;
        $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
        $variants_hash{$sample}{$variant_name}++;
        $all_variants{$variant_name}++;
    }
}
$maf_fh->close;

#sort variants for consistency
my @variant_names = sort keys %all_variants;

#write the input matrix for R code to a file
my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!";

#print input matrix file header
my $header = join("\t","Sample",@variant_names);
$matrix_fh->print("$header\n");

#print mutation relation input matrix
for my $sample (sort @all_sample_names) {
    $matrix_fh->print($sample);
    for my $variant (@variant_names) {
        if (exists $variants_hash{$sample}{$variant}) {
            $matrix_fh->print("\t$variants_hash{$sample}{$variant}");
        }
        else {
            $matrix_fh->print("\t0");
        }
    }
    $matrix_fh->print("\n");
}
}

1;