Commit d076358

hack at hdfs
bpblanken committed Nov 26, 2024
1 parent c98f18b commit d076358
Showing 7 changed files with 35 additions and 6 deletions.
4 changes: 3 additions & 1 deletion v03_pipeline/lib/reference_datasets/clinvar.py
@@ -12,7 +12,7 @@
     CLINVAR_PATHOGENICITIES_LOOKUP,
 )
 from v03_pipeline.lib.model.definitions import ReferenceGenome
-from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht
+from v03_pipeline.lib.reference_datasets.misc import safely_add_to_hdfs, vcf_to_ht
 
 CLINVAR_GOLD_STARS_LOOKUP = hl.dict(
     {
@@ -109,6 +109,7 @@ def get_submission_summary_ht() -> hl.Table:
         timeout=10,
     ) as r:
         shutil.copyfileobj(r.raw, tmp_file)
+    safely_add_to_hdfs(tmp_file.name)
     ht = hl.import_table(
         tmp_file.name,
         force=True,
@@ -163,6 +164,7 @@ def get_ht(
         delete=False,
     ) as tmp_file, requests.get(clinvar_url, stream=True, timeout=10) as r:
         shutil.copyfileobj(r.raw, tmp_file)
+    safely_add_to_hdfs(tmp_file.name)
     ht = vcf_to_ht(tmp_file.name, reference_genome)
     submitters_ht = get_submission_summary_ht()
     ht = ht.annotate(
2 changes: 1 addition & 1 deletion v03_pipeline/lib/reference_datasets/dbnsfp.py
@@ -59,7 +59,7 @@ def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table:
     types = TYPES[reference_genome]
     rename = RENAME[reference_genome]
 
-    with download_zip_file(path) as unzipped_dir:
+    with download_zip_file(path, 'dbnsfp') as unzipped_dir:
         ht = hl.import_table(
             f'{unzipped_dir}/dbNSFP*_variant.chr*.gz',
             types=types,
2 changes: 2 additions & 0 deletions v03_pipeline/lib/reference_datasets/helix_mito.py
@@ -5,6 +5,7 @@
 import requests
 
 from v03_pipeline.lib.model.definitions import ReferenceGenome
+from v03_pipeline.lib.reference_datasets.misc import safely_add_to_hdfs
 
 RENAME = {
     'counts_hom': 'AC_hom',
@@ -22,6 +23,7 @@ def get_ht(
         delete=False,
     ) as tmp_file, requests.get(url, stream=True, timeout=10) as r:
         shutil.copyfileobj(r.raw, tmp_file)
+    safely_add_to_hdfs(tmp_file.name)
     ht = hl.import_table(
         tmp_file.name,
         types={
4 changes: 3 additions & 1 deletion v03_pipeline/lib/reference_datasets/local_constraint_mito.py
@@ -9,7 +9,9 @@
 
 
 def get_ht(url: str, reference_genome: ReferenceGenome) -> hl.Table:
-    with download_zip_file(url, decode_content=True) as unzipped_dir:
+    with download_zip_file(
+        url, 'local_constraint_mito', decode_content=True

[Check failure — GitHub Actions / unit_tests (3.10), Ruff (COM812): v03_pipeline/lib/reference_datasets/local_constraint_mito.py:13:58: Trailing comma missing]
+    ) as unzipped_dir:
         ht = hl.import_table(
             os.path.join(
                 unzipped_dir,
25 changes: 23 additions & 2 deletions v03_pipeline/lib/reference_datasets/misc.py
@@ -1,5 +1,6 @@
 import contextlib
 import os
+import subprocess
 import tempfile
 import zipfile

@@ -118,14 +119,16 @@ def copyfileobj(fsrc, fdst, decode_content, length=16 * 1024):
 
 
 @contextlib.contextmanager
-def download_zip_file(url, suffix='.zip', decode_content=False):
+def download_zip_file(url, dataset_name: str, suffix='.zip', decode_content=False):
+    dir_ = f'/tmp/{dataset_name}'  # noqa: S108
     with tempfile.NamedTemporaryFile(
+        dir=dir_,
         suffix=suffix,
     ) as tmp_file, requests.get(url, stream=True, timeout=10) as r:
         copyfileobj(r.raw, tmp_file, decode_content)
         # Extracting the zip file
         with zipfile.ZipFile(tmp_file.name, 'r') as zipf:
             zipf.extractall(os.path.dirname(tmp_file.name))
+        safely_add_to_hdfs(dir_)
         yield os.path.dirname(tmp_file.name)


@@ -148,3 +151,21 @@ def select_for_interval_reference_dataset(
         **additional_selects,
     )
     return ht.key_by('interval')
+
+
+def safely_add_to_hdfs(file_name: str):
+    if os.getenv('HAIL_DATAPROC') != '1':
+        return
+    subprocess.run(
+        [  # noqa: S603
+            '/usr/bin/hdfs',
+            'dfs',
+            '-copyFromLocal',
+            '-f',
+            f'file://{file_name}',
+            file_name,
+        ],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
2 changes: 1 addition & 1 deletion v03_pipeline/lib/reference_datasets/mitimpact.py
@@ -11,7 +11,7 @@ def get_ht(
     reference_genome: ReferenceGenome,
 ) -> hl.Table:
     extracted_filename = url.removesuffix('.zip').split('/')[-1]
-    with download_zip_file(url, suffix='.txt.zip') as unzipped_dir:
+    with download_zip_file(url, 'mitimpact', suffix='.txt.zip') as unzipped_dir:
         ht = hl.import_table(
             os.path.join(
                 unzipped_dir,
2 changes: 2 additions & 0 deletions v03_pipeline/lib/reference_datasets/screen.py
@@ -6,6 +6,7 @@
 
 from v03_pipeline.lib.model import ReferenceGenome
 from v03_pipeline.lib.reference_datasets.misc import (
+    safely_add_to_hdfs,
     select_for_interval_reference_dataset,
 )

@@ -20,6 +21,7 @@ def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table:
         timeout=10,
     ) as r:
         shutil.copyfileobj(r.raw, tmp_file)
+    safely_add_to_hdfs(tmp_file.name)
     ht = hl.import_table(
         tmp_file.name,
         no_header=True,
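For the zip-based datasets (dbnsfp, local_constraint_mito, mitimpact) the same sync happens at directory granularity: download_zip_file now extracts into a per-dataset /tmp/{dataset_name} directory and pushes that whole directory to HDFS before yielding. A sketch of a hypothetical caller in the style of mitimpact.py; the URL, dataset name, and member file name are invented for illustration.

import os

import hail as hl

from v03_pipeline.lib.reference_datasets.misc import download_zip_file

# Hypothetical inputs; the real callers pass dataset-specific values.
url = 'https://example.com/annotations.txt.zip'

with download_zip_file(url, 'example_dataset', suffix='.txt.zip') as unzipped_dir:
    # By this point the archive is extracted under /tmp/example_dataset and
    # that directory has been mirrored into HDFS, so the path resolves on Dataproc.
    ht = hl.import_table(os.path.join(unzipped_dir, 'annotations.txt'))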
