Example usage

To use dnarecords in a project:

import dnarecords

print(dnarecords.__version__)
0.1.4

To transform your genomics data into DNARecords:

import dnarecords as dr


hl = dr.helper.DNARecordsUtils.init_hail()
hl.utils.get_1kg('/tmp/1kg')
mt = hl.read_matrix_table('/tmp/1kg/1kg.mt')
mt = mt.annotate_entries(dosage=hl.pl_dosage(mt.PL))

dnarecords_path = '/tmp/dnarecords'
writer = dr.writer.DNARecordsWriter(mt.dosage)
writer.write(dnarecords_path, sparse=True, sample_wise=True, variant_wise=True,
             tfrecord_format=True, parquet_format=True,
             write_mode='overwrite', gzip=True)

print(f'DNARecords created at {dnarecords_path}')
DNARecords created at /tmp/dnarecords
2022-05-11 11:12:54 Hail: INFO: 1KG files found

To read your DNARecords dataset as TensorFlow Datasets:

import dnarecords as dr
import tensorflow as tf

dnarecords_path = '/tmp/dnarecords'
reader = dr.reader.DNARecordsReader(dnarecords_path)

samplewise_ds = reader.sample_wise_dataset()
tf.print(next(iter(samplewise_ds)))

variantwise_ds = reader.variant_wise_dataset()
tf.print(next(iter(variantwise_ds)))
{'chr1': 'SparseTensor(indices=[[0]
 [2]
 [4]
 ...
 [906]
 [907]
 [908]], values=[0.336072147 0.00498687895 0.0593509413 ... 0.99874264 0.0306534301 1.88818419], shape=[909])',
 'chr10': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [529]
 [530]
 [531]], values=[0.0593509413 0.00315230922 0.99941957 ... 0.0593509413 1.79924 0.0306534301], shape=[532])',
 'chr11': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [567]
 [568]
 [569]], values=[0.200760186 0.015601662 0.136806905 ... 0.015601662 1.92641246 1.94064903], shape=[570])',
 'chr12': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [563]
 [564]
 [565]], values=[1.00395739 1.96934652 0.0593509413 ... 0.200760037 1.00158238 0.999999821], shape=[566])',
 'chr13': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [333]
 [334]
 [335]], values=[0.015601662 1.00000501 1.92641246 ... 0.0306534301 0.0593509413 0.200760469], shape=[336])',
 'chr14': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [334]
 [335]
 [336]], values=[0.136806905 0.00788068399 0.00788068399 ... 0.111815773 0.24447079 0.111815773], shape=[337])',
 'chr15': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [339]
 [340]
 [341]], values=[0.99999994 1.00000393 0.111815773 ... 0.0593509413 0.200760156 0.111815773], shape=[342])',
 'chr16': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [379]
 [380]
 [381]], values=[0.111815773 0.015601662 0.999999821 ... 0.00125734252 1.66599965 0.0306534301], shape=[382])',
 'chr17': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [344]
 [345]
 [346]], values=[0.00396528561 1.99211931 0.0593509413 ... 0.0593509413 0.999999642 0.0593509413], shape=[347])',
 'chr18': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [295]
 [296]
 [297]], values=[0.111815773 1.92641246 1.86319304 ... 0.999996841 0.111815773 1.66578853], shape=[298])',
 'chr19': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [339]
 [340]
 [341]], values=[0.111815773 0.111815773 1.66543949 ... 0.000999001 0.0306534301 0.0019912892], shape=[342])',
 'chr2': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [841]
 [842]
 [843]], values=[1.79924 1.00125718 1.0000025 ... 0.111815773 0.00788068399 1.96934652], shape=[844])',
 'chr20': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [309]
 [310]
 [311]], values=[1.96934652 0.0593509451 1.79924 ... 0.200760156 1.23837578 0.0593509413], shape=[312])',
 'chr21': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [147]
 [148]
 [150]], values=[1.98439837 0.20076 0.0593509413 ... 0.00396528561 0.111815773 0.00396528561], shape=[151])',
 'chr22': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [167]
 [168]
 [169]], values=[0.200760469 1.98439837 0.0593509413 ... 0.015601662 0.00788068399 0.200760096], shape=[170])',
 'chr3': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [734]
 [736]
 [737]], values=[1.79924 0.33397156 0.0593509413 ... 0.00396528561 0.00396528561 1.88818419], shape=[738])',
 'chr4': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [637]
 [638]
 [639]], values=[0.200769082 0.0909091 0.0477267206 ... 0.0593509413 0.111815773 0.200760037], shape=[640])',
 'chr5': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [644]
 [645]
 [646]], values=[0.111815773 0.00788068399 0.333948731 ... 0.0306534301 0.200760052 0.336643815], shape=[647])',
 'chr6': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [671]
 [672]
 [673]], values=[0.136806905 1.79923987 0.0019912892 ... 0.334036469 0.0306534301 0.0593509413], shape=[674])',
 'chr7': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [540]
 [541]
 [542]], values=[1.00593591 0.111815773 0.00788068399 ... 1.00788069 0.99999994 0.998993754], shape=[543])',
 'chr8': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [507]
 [508]
 [509]], values=[0.015601662 1.94064903 0.0735875592 ... 1.66599965 0.284747332 0.015601662], shape=[510])',
 'chr9': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [413]
 [414]
 [415]], values=[0.200761154 1.79924 0.999602139 ... 0.333916187 0.0593509413 0.00788068399], shape=[416])',
 'chrX': 'SparseTensor(indices=[[0]
 [2]
 [3]
 ...
 [268]
 [271]
 [272]], values=[0.0306534301 1.66543949 0.20076 ... 0.334082 0.111815788 1.94064903], shape=[273])',
 'key': 26}
{'key': 3506, 'tensor': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [281]
 [282]
 [283]], values=[0.998417675 0.00788068399 0.015601662 ... 0.00396528561 0.00788068399 0.0019912892], shape=[10880])'}

To read your DNARecords dataset as PySpark DataFrames:

import dnarecords as dr

dnarecords_path = '/tmp/dnarecords'
reader = dr.reader.DNASparkReader(dnarecords_path)

samplewise_df = reader.sample_wise_dnarecords()
samplewise_df.show(2)

variantwise_df = reader.variant_wise_dnarecords()
variantwise_df.show(2)
+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+----------------+----------------+--------------------+--------------------+-----------------+----------------+--------------------+---+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------+----------------+--------------------+----------------+-----------------+--------------------+--------------------+-----------------+----------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+
|        chr4_indices|chrX_dense_shape|        chrX_indices|        chr11_values|       chr10_indices|       chr22_indices|         chr1_values|        chr1_indices|         chr5_values|        chr15_values|chr3_dense_shape|chr22_dense_shape|       chr15_indices|        chr19_values|chr2_dense_shape|chr9_dense_shape|       chr18_indices|         chr9_values|chr14_dense_shape|chr1_dense_shape|       chr12_indices|key|chr13_dense_shape|        chr9_indices|        chr10_values|        chr3_indices|        chr14_values|         chr4_values|chr12_dense_shape|       chr21_indices|        chr22_values|chr21_dense_shape|        chr6_indices|         chr8_values|        chr18_values|chr20_dense_shape|chr8_dense_shape|       chr17_indices|chr7_dense_shape|chr19_dense_shape|        chr13_values|         chr3_values|chr18_dense_shape|chr6_dense_shape|        chr8_indices|         chr7_values|chr11_dense_shape|       chr11_indices|        chr17_values|       chr14_indices|chr10_dense_shape|chr17_dense_shape|       chr19_indices|        chr5_indices|        chr21_values|       chr16_indices|        chr2_indices|chr5_dense_shape|chr16_dense_shape|chr4_dense_shape|         chr2_values|       chr20_indices|        chr12_values|         chrX_values|        chr7_indices|         chr6_values|chr15_dense_shape|        chr20_values|       chr13_indices|        chr16_values|
+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+----------------+----------------+--------------------+--------------------+-----------------+----------------+--------------------+---+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------+----------------+--------------------+----------------+-----------------+--------------------+--------------------+-----------------+----------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+
|[0, 1, 2, 3, 4, 5...|             273|[0, 2, 3, 4, 5, 7...|[0.20076019, 0.01...|[0, 1, 2, 3, 4, 5...|[0, 1, 2, 3, 5, 6...|[0.33607215, 0.00...|[0, 2, 4, 5, 6, 7...|[0.11181577, 0.00...|[0.99999994, 1.00...|             738|              170|[0, 1, 2, 3, 4, 5...|[0.11181577, 0.11...|             844|             416|[0, 1, 2, 3, 4, 5...|[0.20076115, 1.79...|              337|             909|[0, 1, 2, 4, 5, 6...| 26|              336|[0, 1, 2, 3, 4, 5...|[0.05935094, 0.00...|[0, 1, 2, 3, 4, 5...|[0.1368069, 0.007...|[0.20076908, 0.09...|              566|[0, 1, 2, 3, 4, 5...|[0.20076047, 1.98...|              151|[0, 1, 2, 3, 4, 5...|[0.015601662, 1.9...|[0.11181577, 1.92...|              312|             510|[0, 1, 2, 3, 4, 5...|             543|              342|[0.015601662, 1.0...|[1.79924, 0.33397...|              298|             674|[0, 1, 2, 3, 4, 5...|[1.0059359, 0.111...|              570|[0, 1, 2, 4, 5, 6...|[0.0039652856, 1....|[0, 1, 2, 3, 4, 5...|              532|              347|[0, 1, 2, 4, 5, 6...|[0, 1, 2, 3, 4, 5...|[1.9843984, 0.200...|[0, 1, 2, 3, 4, 5...|[0, 1, 2, 3, 4, 5...|             647|              382|             640|[1.79924, 1.00125...|[0, 1, 2, 3, 4, 5...|[1.0039574, 1.969...|[0.03065343, 1.66...|[0, 1, 2, 3, 4, 5...|[0.1368069, 1.799...|              342|[1.9693465, 0.059...|[0, 1, 2, 3, 4, 5...|[0.11181577, 0.01...|
|[0, 1, 2, 3, 4, 5...|             273|[0, 1, 2, 3, 4, 5...|[0.20076002, 0.24...|[0, 1, 2, 3, 4, 5...|[0, 1, 2, 3, 5, 7...|[0.20076008, 0.20...|[0, 1, 2, 3, 4, 5...|[0.11181577, 0.05...|[0.9984176, 0.333...|             738|              170|[0, 1, 2, 3, 4, 5...|[0.015601662, 0.2...|             844|             416|[0, 1, 2, 3, 4, 5...|[0.11181577, 1.98...|              337|             909|[0, 1, 2, 3, 4, 5...| 29|              336|[0, 1, 2, 3, 4, 5...|[0.05935094, 0.03...|[0, 1, 2, 3, 4, 5...|[1.9693465, 0.200...|[0.20076023, 0.11...|              566|[0, 1, 2, 3, 4, 5...|[0.03065343, 1.94...|              151|[0, 1, 2, 3, 4, 5...|[1.9843984, 1.000...|[1.9960347, 1.799...|              312|             510|[0, 1, 2, 4, 5, 6...|             543|              342|[0.038286503, 1.8...|[1.940649, 0.9997...|              298|             674|[0, 1, 2, 3, 4, 5...|[1.6660694, 0.059...|              570|[0, 1, 2, 3, 5, 7...|[0.015601662, 1.9...|[0, 1, 2, 3, 4, 5...|              532|              347|[0, 1, 2, 4, 5, 6...|[0, 1, 2, 3, 4, 5...|[0.05935094, 0.00...|[0, 1, 2, 3, 4, 5...|[0, 1, 2, 3, 4, 5...|             647|              382|             640|[1.0, 0.11181577,...|[0, 1, 3, 4, 5, 6...|[1.8881842, 1.940...|[0.11181577, 0.20...|[0, 1, 2, 3, 5, 6...|[0.9999984, 0.030...|              342|[0.999001, 0.3347...|[0, 1, 2, 4, 5, 6...|[1.7992399, 0.337...|
+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+----------------+----------------+--------------------+--------------------+-----------------+----------------+--------------------+---+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------+----------------+--------------------+----------------+-----------------+--------------------+--------------------+-----------------+----------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+
only showing top 2 rows

[Stage 74:>                                                         (0 + 1) / 1]
+--------------------+--------------------+----+-----------+
|             indices|              values| key|dense_shape|
+--------------------+--------------------+----+-----------+
|[0, 1, 2, 3, 4, 5...|[0.9984177, 0.007...|3506|      10880|
|[0, 1, 2, 3, 4, 5...|[0.11181577, 0.01...|3764|      10880|
+--------------------+--------------------+----+-----------+
only showing top 2 rows