Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding GATK concordance script (connected to #167) #173

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
83 changes: 83 additions & 0 deletions src/toil_scripts/gatk_concordance/gatk_concordance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python2.7

import argparse
import multiprocessing
import os
import subprocess

from toil.job import Job

from toil_scripts import download_from_s3_url
from toil_scripts.batch_alignment.bwa_alignment import upload_or_move, docker_call
from toil_scripts.gatk_germline.germline import download_url, read_from_filestore_hc

def build_parser():
    """
    Build the command-line parser for the GATK concordance pipeline.

    Returns an argparse.ArgumentParser with the reference genome, the two
    VCFs (eval/comp) plus optional labels, and exactly-one-of
    --output_dir / --s3_dir output destinations.
    """
    # (short flag, long flag, required, help text) for every pipeline option.
    argument_table = [
        ('-r', '--reference', True, 'Reference Genome URL'),
        ('-i', '--reference_index', True, 'Reference Genome index (.fai) URL'),
        ('-d', '--reference_dict', True, 'Reference Genome sequence dictionary (.dict) URL'),
        ('-1', '--eval_vcf', True, 'VCF file URL to evaluate'),
        ('-y', '--eval_label', False, 'Optional label for VCF file to evaluate, defaults to file name'),
        ('-2', '--comp_vcf', True, 'VCF file URL to compare against'),
        ('-z', '--comp_label', False, 'Optional label for VCF file to compare against, defaults to file name'),
        ('-o', '--output_dir', False, 'Local output directory, use one of this option or --s3_dir'),
        ('-3', '--s3_dir', False, 'S3 output directory, starting with bucket name, use one of this option or --output_dir'),
    ]
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, is_required, help_text in argument_table:
        parser.add_argument(short_flag, long_flag, required=is_required,
                            default=None, help=help_text)
    return parser


def batch_start(job, input_args):
    """
    Root job: download every shared input file as a child job, then run the
    concordance evaluation as a follow-on once all downloads are done.

    job: Toil Job instance
    input_args: dict of pipeline configuration; must map each of the shared
        file names below to a source URL.
    """
    file_names = ('ref.fa', 'ref.fa.fai', 'ref.dict', 'eval.vcf', 'comp.vcf')
    # Promise (.rv()) of each download's FileStore ID, keyed by file name.
    shared_ids = {name: job.addChildJobFn(download_url, input_args[name], name).rv()
                  for name in file_names}
    job.addFollowOnJobFn(concordance, shared_ids, input_args)


def concordance(job, shared_ids, input_args):
    """
    Evaluate concordance between two VCFs with GATK GenotypeConcordance.

    job: Toil Job instance
    shared_ids: dict mapping each shared file name ('ref.fa', 'ref.fa.fai',
        'ref.dict', 'eval.vcf', 'comp.vcf') to its FileStore ID
    input_args: dict of pipeline configuration (file URLs, optional labels,
        'cpu_count', 'sudo', output destination keys)

    The concordance report is uploaded or moved to the configured output
    destination (--output_dir or --s3_dir).
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'eval.vcf', 'comp.vcf']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)

    # Fall back to the VCF's file name when no label was supplied.  The label
    # keys are always present in input_args (argparse stores None when the
    # flag is omitted), so dict.get's default alone would never apply; use
    # `or` so a None/empty label also falls through to the file name.
    eval_name = input_args.get('eval_label') or os.path.basename(input_args['eval.vcf'])
    comp_name = input_args.get('comp_label') or os.path.basename(input_args['comp.vcf'])
    output = '%s_vs_%s.concordance' % (eval_name, comp_name)
    command = ['-T', 'GenotypeConcordance',
               '-R', 'ref.fa',
               '-nct', input_args['cpu_count'],
               '--eval', 'eval.vcf',
               '--comp', 'comp.vcf',
               '-o', output]

    docker_call(work_dir = work_dir,
                tool_parameters = command,
                tool = 'quay.io/ucsc_cgl/gatk',
                sudo = input_args['sudo'])

    upload_or_move(work_dir, input_args, output)


if __name__ == '__main__':
    # Combine the pipeline's own CLI options with Toil's runner options.
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    options = parser.parse_args()

    # Workflow-wide configuration handed to every job.  Keys with dots are
    # the canonical shared file names used by batch_start/concordance.
    inputs = {
        'ref.fa': options.reference,
        'ref.fa.fai': options.reference_index,
        'ref.dict': options.reference_dict,
        'eval.vcf': options.eval_vcf,
        'eval_label': options.eval_label,
        'comp.vcf': options.comp_vcf,
        'comp_label': options.comp_label,
        'output_dir': options.output_dir,
        's3_dir': options.s3_dir,
        'uuid': None,
        # todo: this is req'd by gatk and by upload_to_s3
        # NOTE(review): this is evaluated on the Toil leader, so it reflects
        # the leader's core count, not necessarily the workers' — confirm.
        'cpu_count': str(multiprocessing.cpu_count()),
        'ssec': None,
        'sudo': False,
    }

    Job.Runner.startToil(Job.wrapJobFn(batch_start, inputs), options)