def variant_pairs_ht(mt, row_groups):
    """Build a table of row-index pairs for rows that fall in the same group.

    Rows and columns of ``mt`` are numbered with ``add_row_index`` and
    ``add_col_index``. Rows are then grouped by ``row_groups`` and, for each
    group, every pair of row indices collected in that group is enumerated.

    Args:
        mt: Hail MatrixTable to index and group.
        row_groups: Iterable of grouping arguments, unpacked positionally
            into ``mt.group_rows_by``.

    Returns:
        A Hail Table keyed by ``v1_idx`` and ``v2_idx`` (the two row indices
        of a pair), produced by ``ht.select()`` with no additional fields
        selected.
    """
    mt = mt.add_row_index(name='_row_idx')
    mt = mt.add_col_index(name='_col_idx')
    mt = mt.group_rows_by(*row_groups).aggregate(
        variant_pairs_entry=hl.set(
            hl.agg.collect(
                hl.tuple([mt._col_idx, mt._row_idx])
            )
            # {col_idx: [(col_idx, row_idx), ...]}
            .group_by(lambda x: x[0])
            # [[(col_idx, row_idx), (col_idx, row_idx), ...], ...]
            .values()
            # [[row_idx, row_idx, ...], ...]
            # Element 1 of each tuple is the row index; element 0 is the
            # column index, which is constant within a group.
            .map(lambda x: x.map(lambda y: y[1]))
            # All pairs (x[i1], x[i2]) with i1 < i2 within each list.
            .flatmap(lambda x: hl.range(0, hl.len(x))
                     .flatmap(lambda i1: hl.range(i1 + 1, hl.len(x))
                              .map(lambda i2: hl.tuple([x[i1], x[i2]]))))
        )
    )
    # Collapse the per-entry sets to one set per grouped row by taking the
    # value from a single column.
    # NOTE(review): this presumes every column yields the same set of pairs
    # for a given group -- confirm for data with missing/filtered entries.
    ht = mt.annotate_rows(
        variant_pairs=hl.agg.take(mt.variant_pairs_entry, 1)[0]
    ).rows()
    # One output row per pair.
    ht = ht.explode('variant_pairs')
    ht = ht.key_by(v1_idx=ht.variant_pairs[0], v2_idx=ht.variant_pairs[1])
    return ht.select()
I just got a warning on a pipeline:
UserWarning: The mt[<row keys>, :] syntax is deprecated, and will be removed before 0.2 release.
Use one of the following instead:
mt.rows()[<row keys>]
mt.index_rows(<row keys>)
ht = ht.annotate(info=hl.struct(AC=mt[ht.key, :].info.AC, a_index=mt[ht.key, :].a_index))
Does this mean the new syntax should read:
ht = ht.annotate(info=hl.struct(AC=mt.rows()[ht.key].info.AC, a_index=mt.rows()[ht.key].a_index))
?
union_cols
and union_rows
WARNING: Failed to fetch GCS output:
HttpError accessing <https://www.googleapis.com/storage/v1/b/dataproc-35495967-f2a1-4c0d-8ce0-a135afa728d4-us/o/google-cloud-dataproc-metainfo%2F41281f99-e03a-442a-a365-b8a50e8b50c7%2Fjobs%2F5dc3117ca5a24e2a937161a318173653%2Fdriveroutput.000000001?alt=json>: response: <{'status': '503', 'content-length': '0', 'expires': 'Thu, 14 Jun 2018 18:29:08 GMT', 'server': 'UploadServer', 'cache-control': 'private, max-age=0', 'date': 'Thu, 14 Jun 2018 18:29:08 GMT', 'alt-svc': 'quic=":443"; ma=2592000; v="43,42,41,39,35"', 'content-type': 'text/html; charset=UTF-8', 'x-guploader-uploadid': 'AEnB2UpYfilLrAR9zFvO8ZXEDsQUg5bgDHvlMdzA1wPpyFtUW2COEUasz_1KHKzEGZ1OJWSHazUTQUELyHCPMjh7O9kQqy04kQ'}>, content <>
I tried running the ld_prune method on b38 1000 Genomes data but got an assertion error.
Below are the script and the error.
# Load the dataset as a Hail MatrixTable ("/path" is presumably a redacted placeholder).
th_gn_mt = hl.read_matrix_table("/path")
# Keep only rows whose `alleles` array has exactly two entries (biallelic sites).
th_gn_biallelic_mt = th_gn_mt.filter_rows(th_gn_mt.alleles.length() == 2 )
def unphase_mt(mt: hl.MatrixTable) -> hl.MatrixTable:
    """Return ``mt`` with its ``GT`` entry field rebuilt as an unphased call.

    Diploid calls are rebuilt from their two alleles and haploid calls from
    their single allele, both with ``phased=False``. Any other call is
    replaced by a missing value of type ``tcall``.
    """
    gt = mt.GT
    diploid_unphased = hl.call(gt[0], gt[1], phased=False)
    haploid_unphased = hl.call(gt[0], phased=False)
    unphased_gt = (
        hl.case()
        .when(gt.is_diploid(), diploid_unphased)
        .when(gt.is_haploid(), haploid_unphased)
        .default(hl.null(hl.tcall))
    )
    return mt.annotate_entries(GT=unphased_gt)
# Replace GT with the unphased calls produced by unphase_mt.
th_gn_biallelic_mt = unphase_mt(th_gn_biallelic_mt)
# LD-prune on the unphased GT field with an r2 threshold of 0.2 and a 500,000 bp window.
# NOTE(review): hl.ld_prune is documented to return a Table of retained variants rather
# than a MatrixTable, despite the `_mt` suffix here -- confirm for this Hail version.
th_gn_ld_pruned_mt = hl.ld_prune(th_gn_biallelic_mt.GT,r2=0.2,bp_window_size=500000)
2018-06-25 09:27:47 Hail: INFO: ld_prune: running local pruning stage with max queue size of 99274 variants
2018-06-25 09:43:59 Hail: INFO: wrote 13911578 items in 1509 partitions
2018-06-25 09:45:04 Hail: INFO: wrote 13911766 items in 1509 partitions to hdfs://prod-scc/tmp/hail.kIhhSGEG1wBW/sgOZm5mSHU
2018-06-25 09:45:04 Hail: INFO: ld_prune: local pruning stage retained 13911766 variants
2018-06-25 09:45:42 Hail: INFO: Wrote all 3397 blocks of 13911766 x 2504 matrix with block size 4096.
2018-06-25 09:45:46 Hail: INFO: Coerced almost-sorted dataset
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/GWD/RDIP/apps/hail_pipeline_api/hail_0.2/hail-python_6942d09.zip/hail/typecheck/check.py", line 547, in wrapper
File "/GWD/RDIP/apps/hail_pipeline_api/hail_0.2/hail-python_6942d09.zip/hail/methods/statgen.py", line 3034, in ld_prune
File "/GWD/RDIP/apps/hail_pipeline_api/hail_0.2/hail-python_6942d09.zip/hail/typecheck/check.py", line 547, in wrapper
File "/GWD/RDIP/apps/hail_pipeline_api/hail_0.2/hail-python_6942d09.zip/hail/linalg/blockmatrix.py", line 708, in _filtered_entries_table
File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/GWD/RDIP/apps/hail_pipeline_api/hail_0.2/hail-python_6942d09.zip/hail/utils/java.py", line 196, in deco
hail.utils.java.FatalError: AssertionError: assertion failed
Java stack trace:
java.lang.AssertionError: assertion failed
at scala.Predef$.assert(Predef.scala:156)
at is.hail.methods.UpperIndexBounds$.computeCoverByUpperTriangularBlocks(UpperIndexBounds.scala:63)
at is.hail.linalg.BlockMatrix.filteredEntriesTable(BlockMatrix.scala:1198)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
Hail version: devel-6942d090d618
Error summary: AssertionError: assertion failed
This is on chr1 to chr22 data.