Source code for dnarecords.macros

"""
DNARecords available macros.
"""


# pylint: disable=too-few-public-methods
# It is reasonable in this case.
class DosageSparsityMacro:
    """Ready-made transformations that make the dosage data of a dataset sparser."""
    from typing import TYPE_CHECKING
    if TYPE_CHECKING:
        from hail import MatrixTable

    @staticmethod
    def supercharge_dosage_sparsity(mt: 'MatrixTable', info_score_threshold: float = 0.8,
                                    p_value_hwe_threshold: float = 1e-10, af_threshold: float = 0.001,
                                    sparse_threshold: float = 0.1) -> 'MatrixTable':
        """
        Filter variants on quality metrics and derive a sparser dosage entry field.

        The input MatrixTable is expected to carry ``info_score`` and ``variant_qc`` row fields
        and a ``dosage`` entry field. The steps, in order, are:

        1. keep rows whose ``info_score.score`` is strictly above ``info_score_threshold``;
        2. keep rows whose ``variant_qc.p_value_hwe`` is strictly above ``p_value_hwe_threshold``;
        3. keep rows whose ``variant_qc.AF[1]`` lies strictly between ``af_threshold`` and
           ``1 - af_threshold``;
        4. add the ``sparse_dosage`` entry field: ``2 - dosage`` when ``variant_qc.AF[1]`` is
           above 0.5, otherwise ``dosage``;
        5. add the ``dosage_flip`` row field, true exactly when ``variant_qc.AF[1]`` is above 0.5;
        6. replace every ``sparse_dosage`` below ``sparse_threshold`` with ``0.0``.

        Note
        ----
        The dosage is flipped (``2 - dosage``) for the variants where the alt allele is the more
        frequent one, and ``dosage_flip`` records whether that flip was applied.

        Example
        --------
        .. code-block:: python

            import dnarecords as dr


            hl = dr.helper.DNARecordsUtils.init_hail()
            hl.utils.get_1kg('/tmp/1kg')
            mt = hl.read_matrix_table('/tmp/1kg/1kg.mt')
            mt = mt.annotate_rows(info_score=hl.agg.info_score(hl.pl_to_gp(mt.PL)))
            mt = hl.variant_qc(mt)
            mt = mt.annotate_entries(dosage=hl.pl_dosage(mt.PL))

            mt = dr.macros.DosageSparsityMacro.supercharge_dosage_sparsity(mt)
            mt.select_cols().select_rows('dosage_flip') \\
                .select_entries('dosage','sparse_dosage').entries().show()

        .. code-block:: text

            +---------------+------------+-------------+-----------+----------+---------------+
            | locus         | alleles    | dosage_flip | s         | dosage   | sparse_dosage |
            +---------------+------------+-------------+-----------+----------+---------------+
            | locus<GRCh37> | array<str> | bool        | str       | float64  | float64       |
            +---------------+------------+-------------+-----------+----------+---------------+
            | 1:904165      | ["G","A"]  | False       | "HG00096" | 5,94e-02 | 0,00e+00      |
            | 1:904165      | ["G","A"]  | False       | "HG00099" | 3,97e-03 | 0,00e+00      |
            | 1:904165      | ["G","A"]  | False       | "HG00105" | 4,99e-03 | 0,00e+00      |
            | 1:904165      | ["G","A"]  | False       | "HG00118" | 7,88e-03 | 0,00e+00      |
            | 1:904165      | ["G","A"]  | False       | "HG00129" | 3,07e-02 | 0,00e+00      |
            | 1:904165      | ["G","A"]  | False       | "HG00148" | 7,36e-02 | 0,00e+00      |
            | 1:904165      | ["G","A"]  | False       | "HG00177" | 2,01e-01 | 2,01e-01      |
            | 1:904165      | ["G","A"]  | False       | "HG00182" | 3,83e-02 | 0,00e+00      |
            | 1:904165      | ["G","A"]  | False       | "HG00242" | 3,07e-02 | 0,00e+00      |
            | 1:904165      | ["G","A"]  | False       | "HG00254" | 1,26e-04 | 0,00e+00      |
            +---------------+------------+-------------+-----------+----------+---------------+

        :param mt: a MatrixTable with **info_score** and **variant_qc** row fields, and a
            **dosage** entry field.
        :param info_score_threshold: rows are kept only when info_score.score is strictly above it.
        :param p_value_hwe_threshold: rows are kept only when variant_qc.p_value_hwe is strictly
            above it.
        :param af_threshold: rows are kept only when variant_qc.AF[1] is strictly between
            af_threshold and (1 - af_threshold).
        :param sparse_threshold: sparse_dosage values below it are set to 0.0.
        :return: the filtered MatrixTable with the **dosage_flip** row field and the
            **sparse_dosage** entry field added.
        :rtype: MatrixTable
        """
        from dnarecords.helper import DNARecordsUtils

        def alt_is_major(table):
            # Built from the table it is given, so each step reads the fields of the
            # table it is about to annotate.
            return table.variant_qc.AF[1] > 0.5

        hl = DNARecordsUtils.init_hail()

        # Variant-level quality filters, applied one after another.
        by_info = mt.filter_rows(mt.info_score.score > info_score_threshold)
        by_hwe = by_info.filter_rows(by_info.variant_qc.p_value_hwe > p_value_hwe_threshold)
        alt_af = by_hwe.variant_qc.AF[1]
        by_af = by_hwe.filter_rows((alt_af > af_threshold) & (alt_af < 1 - af_threshold))

        # Express the dosage relative to the less frequent allele and record the flip.
        flipped = by_af.annotate_entries(
            sparse_dosage=hl.if_else(alt_is_major(by_af), 2 - by_af.dosage, by_af.dosage))
        flagged = flipped.annotate_rows(dosage_flip=alt_is_major(flipped))

        # Zero out the small dosages; this is what actually produces the sparsity.
        below_cutoff = flagged.sparse_dosage < sparse_threshold
        return flagged.annotate_entries(
            sparse_dosage=hl.if_else(below_cutoff, 0.0, flagged.sparse_dosage))