Source code for dnarecords.helper

"""
DNARecords helper utilities.
"""


class DNARecordsUtils:
    """Utility class to provide common functionalities used in other modules.

    All members are static; the class is used purely as a namespace.
    """
    # These imports live in the class body (as in the original), so the
    # imported names are resolvable by the annotations of the methods below.
    from typing import Dict
    from types import ModuleType
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only needed by static type checkers; avoids importing pyspark
        # when this module is loaded.
        from pyspark.sql import SparkSession

    # Maven coordinate of the spark-tfrecord package. Shared by
    # ``spark_session`` and ``init_hail`` so both always request the same
    # artifact/version.
    _SPARK_TFRECORD_PACKAGE = 'com.linkedin.sparktfrecord:spark-tfrecord_2.12:0.3.4'

    @staticmethod
    def spark_session() -> 'SparkSession':
        """Gets the current spark session or builds a new one if none.

        The session builder is configured with ``spark.jars.packages`` set to
        the sparktfrecord package before calling ``getOrCreate()``.

        :return: a spark session requested with sparktfrecord libraries.
        :rtype: SparkSession
        """
        # Imported lazily so pyspark is only required when a session is
        # actually requested.
        from pyspark.sql import SparkSession

        # NOTE(review): if a session already exists, confirm that
        # getOrCreate() still makes the package available in it.
        builder = SparkSession.builder.config(
            'spark.jars.packages', DNARecordsUtils._SPARK_TFRECORD_PACKAGE)
        return builder.getOrCreate()

    @staticmethod
    def init_hail() -> ModuleType:
        """Initializes Hail requesting sparktfrecord libraries in the session.

        Calls ``hl.init`` with ``idempotent=True``, a log file at
        ``/tmp/hail.log`` and a spark configuration that sets
        ``spark.jars.packages`` to the sparktfrecord package.

        :return: the hail module (with Hail initialized)
        :rtype: ModuleType
        """
        # Imported lazily so hail is only required when it is actually used.
        import hail as hl

        # TODO: do not make it idempotent to avoid potential collisions
        conf = {'spark.jars.packages': DNARecordsUtils._SPARK_TFRECORD_PACKAGE}
        hl.init(idempotent=True, log='/tmp/hail.log', spark_conf=conf)
        return hl

    @staticmethod
    def dnarecords_tree(dnarecords_path) -> Dict[str, str]:
        """DNARecords directory structure.

        Gets a dictionary with the full structure of a DNARecords dataset
        given a root path.

        .. code-block:: text

            swrec -> <dnarecords_path>/data/swrec (sample wise dna tfrecords)
            vwrec -> <dnarecords_path>/data/vwrec (variant wise dna tfrecords)
            swpar -> <dnarecords_path>/data/swpar (sample wise dna parquet files)
            vwpar -> <dnarecords_path>/data/vwpar (variant wise dna parquet files)
            skeys -> <dnarecords_path>/meta/skeys (sample wise key mapping)
            vkeys -> <dnarecords_path>/meta/vkeys (variant wise key mapping)
            swpfs -> <dnarecords_path>/meta/swpfs (sample wise parquet files index)
            vwpfs -> <dnarecords_path>/meta/vwpfs (variant wise parquet files index)
            swrfs -> <dnarecords_path>/meta/swrfs (sample wise tfrecords index)
            vwrfs -> <dnarecords_path>/meta/vwrfs (variant wise tfrecords index)
            swpsc -> <dnarecords_path>/meta/swpsc (sample wise parquet schema)
            vwpsc -> <dnarecords_path>/meta/vwpsc (variant wise parquet schema)
            swrsc -> <dnarecords_path>/meta/swrsc (sample wise tfrecord schema)
            vwrsc -> <dnarecords_path>/meta/vwrsc (variant wise tfrecord schema)

        :param dnarecords_path: root path of the DNARecords dataset. It is
            interpolated as-is (no normalization of trailing separators).
        :return: a dictionary with the structure of the DNARecords dataset.
        :rtype: Dict[str,str]
        """
        # Leaf names double as dictionary keys; tuple order fixes the
        # insertion order of the resulting dictionary (data first, then meta).
        data_leaves = ('swrec', 'vwrec', 'swpar', 'vwpar')
        meta_leaves = ('skeys', 'vkeys',
                       'swpfs', 'vwpfs',
                       'swrfs', 'vwrfs',
                       'swpsc', 'vwpsc',
                       'swrsc', 'vwrsc')

        tree = {leaf: f'{dnarecords_path}/data/{leaf}' for leaf in data_leaves}
        tree.update({leaf: f'{dnarecords_path}/meta/{leaf}' for leaf in meta_leaves})
        return tree