# Row tables of the v2 and v3 chrom20 matrix tables, used below as lookup
# tables for each variant's `info` field.
v2 = hl.read_matrix_table(f'{root_dir}/gnomad_v2_b38_chrom20.qc_regions.v2_qc_samples.mt').rows()
v3 = hl.read_matrix_table(f'{root_dir}/gnomad_v3_chrom20.hardcalls.split.qc_regions.v2_qc_samples.mt').rows()

# Attach the v2 and v3 `info` of every variant in the SNV concordance table,
# joining on that table's key.
snv_variants = hl.read_table(f'{root_dir}/gnomad_v2_v3_qc_snv_variant_concordance.annotated.ht')
snv_variants = snv_variants.annotate(
    v2_info=v2[snv_variants.key].info,
    v3_info=v3[snv_variants.key].info,
    # adj_concordance=adj_variants[snv_variants.key].concordance
)

# persist() returns the persisted table rather than modifying the receiver,
# so the result must be re-assigned for count() (and anything after it) to
# use the persisted data.
snv_variants = snv_variants.persist()
snv_variants.count()
(check ....ht/rows/metadata.json.gz
- does it say OrderedRVDSpec or UnpartitionedRVDSpec?)
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
'locus': locus<GRCh38>
'alleles': array<str>
'n_discordant': int64
'concordance': array<array<int64>>
'dataset': str
'v2_callstats': struct {
AC: array<int32>,
AF: array<float64>,
AN: int32,
homozygote_count: array<int32>
}
'v2_was_split': bool
'v3_callstats': struct {
AC: array<int32>,
AF: array<float64>,
AN: int32,
homozygote_count: array<int32>
}
'v3_was_split': bool
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------
# Rank variants by CNN score, separately for SNVs and indels.
ht = ht.persist()

# Keep only what the ranking needs: the indel flag and the score.
rank_ht = ht.select(
    is_indel=hl.is_indel(ht.alleles[0], ht.alleles[1]),
    score=ht.CNN_1D_Score,
)

# Sort on the indel flag first and the score second, then number the rows in
# that order; persist so the aggregation and the annotation below share it.
rank_ht = rank_ht.order_by(rank_ht.is_indel, rank_ht.score)
rank_ht = rank_ht.add_index()
rank_ht = rank_ht.persist()

# Number of non-indel rows, used as the offset for the indel indices.
n_snvs = rank_ht.aggregate(
    hl.agg.count_where(~rank_ht.is_indel)
)

# Subtract the SNV count from indel rows' idx; SNV rows keep their idx.
rank_ht = rank_ht.annotate(
    idx=hl.cond(
        rank_ht.is_indel,
        rank_ht.idx - n_snvs,
        rank_ht.idx,
    )
)
gsutil du -s -h gs://path/to
# Import the CNN scores TSV (treated as block-gzipped, with imputed column
# types and at least 1000 partitions requested) and save it as a Hail table,
# replacing any existing output.
ht = hl.import_table(
    'gs://gnomad/variant_qc/temp/friedman_cnn_scores.tsv.gz',
    impute=True,
    min_partitions=1000,
    force_bgz=True,
)
ht.write(
    'gs://gnomad/variant_qc/temp/friedman_cnn_scores.no_chr17.ht',
    overwrite=True,
)
Laurent:hail2 laurent$ gsutil ls gs://gnomad/variant_qc/temp/friedman_cnn_scores.no_chr17.ht/rows/parts/
gs://gnomad/variant_qc/temp/friedman_cnn_scores.no_chr17.ht/rows/parts/
gs://gnomad/variant_qc/temp/friedman_cnn_scores.no_chr17.ht/rows/parts/part-0-2-0-0-4c1e0ba4-7fc3-be60-93ab-7160eaea2afa
def main():
    """Build the ranked CNN-score Hail table from the raw scores TSV.

    Steps:
      1. Import the TSV and checkpoint it as a Hail table.
      2. Split the comma-separated ``Alt`` field and explode to one row per
         alternate allele.
      3. Build a locus from ``Contig``/``Pos`` and min-rep each variant.
      4. Annotate variant type and rank via ``add_variant_type`` and
         ``add_rank`` (both defined elsewhere).
      5. Write the result keyed by ``locus``/``alleles``.
    """
    hl.init(log='/variantqc.log')

    # Import once and write to disk, then read back so later steps work from
    # the native table rather than re-parsing the TSV.
    ht = hl.import_table('gs://gnomad/variant_qc/temp/friedman_cnn_scores.tsv.gz', force_bgz=True, min_partitions=1000, impute=True)
    ht.write('gs://gnomad/variant_qc/temp/friedman_cnn_scores.no_chr17.ht', overwrite=True)
    ht = hl.read_table('gs://gnomad/variant_qc/temp/friedman_cnn_scores.no_chr17.ht')

    # One row per alternate allele: split the Alt string into an array, then explode.
    ht = ht.annotate(alt_alleles=ht.Alt.split(','))
    ht = ht.explode('alt_alleles')
    ht = ht.annotate(locus=hl.locus(hl.str(ht.Contig), ht.Pos))

    # Apply minrep. min_rep returns both a locus and the alleles, and the
    # locus can move when shared leading bases are trimmed, so take both from
    # the result instead of keeping the pre-min-rep locus.
    min_rep = hl.min_rep(ht.locus, [ht.Ref, ht.alt_alleles])
    ht = ht.annotate(locus=min_rep[0], alleles=min_rep[1])

    # Add variant_type: flatten the struct returned by add_variant_type into
    # two top-level fields and drop the temporary one.
    ht = ht.annotate(vartype=add_variant_type(ht.alleles))
    ht = ht.transmute(variant_type=ht.vartype.variant_type, n_alt_alleles=ht.vartype.n_alt_alleles)

    # Add rank
    print('Adding rank...')
    ht = add_rank(ht)

    ht.key_by('locus', 'alleles').write('gs://gnomad/variant_qc/temp/friedman_cnn_scores.no_chr17.ranked.ht', overwrite=True)