Accommodating pipes in programs.py

Adding in docker pipes unit test Added docs to docker_call and fixed docker_call unit test Changed datasize for docker_call pipes unit test to 1GB
BD2KGenomics · Jul 30, 2016 · 887292f · 887292f
1 parent f3df8ff
commit 887292f
Show file tree

Hide file tree

Showing 2 changed files with 112 additions and 16 deletions.
diff --git a/src/toil_scripts/lib/programs.py b/src/toil_scripts/lib/programs.py
@@ -15,7 +15,8 @@ def mock_mode():
     return True if int(os.environ.get('TOIL_SCRIPTS_MOCK_MODE', '0')) else False
 
 
-def docker_call(tool,
+def docker_call(tool='',
+                tools=[],
                 parameters=None,
                 work_dir='.',
                 rm=True,
@@ -25,11 +26,14 @@ def docker_call(tool,
                 outputs=None,
                 docker_parameters=None,
                 check_output=False,
+                return_stderr=False,
                 mock=None):
     """
     Calls Docker, passing along parameters and tool.
 
-    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
+    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools) 
+    :param list[str] tools: str list of names of the Docker images and order to be used in
+                     adding piped commands to docker. (e.g. ['quay.io/ucsc_cgl/samtools', 'ubuntu'])
     :param list[str] parameters: Command line arguments to be passed to the tool
     :param str work_dir: Directory to mount into the container via `-v`. Destination convention is /data
     :param bool rm: Set to True to pass `--rm` flag.
@@ -41,8 +45,33 @@ def docker_call(tool,
                                   or a url. The value is only used if mock=True
     :param dict[str,str] docker_parameters: Parameters to pass to docker
     :param bool check_output: When True, this function returns docker's output
+    :param bool return_stderr: When True, this function includes stderr in docker's output
     :param bool mock: Whether to run in mock mode. If this variable is unset, its value will be determined by
                       the environment variable.
+
+    Piping docker commands can be done in one of two ways depending on use case:
+    Running a pipe in docker in 'pipe-in-single-container' mode produces command structure
+        docker '... | ... | ...' where each '...' command corresponds to each element in the 'parameters'
+        argument that uses a docker container. This is the most efficient method if you want to run a pipe of
+        commands where each command uses the same docker container.
+    
+    Running a pipe in docker in 'pipe-of-containers' mode produces command structure
+        docker '...' | docker '...' | docker '...' where each '...' command corresponds to each element in
+        the 'parameters' argument that uses a docker container and each 'docker' tool in the pipe
+        corresponds to each element in the 'tool' argument
+    
+    Examples for running command 'head -c 1M </dev/urandom | tee >(md5sum 1>&2) | gzip | gunzip | md5sum 1>&2':
+        Running 'pipe-in-single-container' mode:
+            command= ['head -c 1M /dev/urandom | tee >(md5sum 1>&2)', 'gzip', 'gunzip', 'md5sum 1>&2']
+            work_dir=curr_work_dir
+            docker_tools=['ubuntu']
+            stdout = docker_call(work_dir=docker_work_dir, parameters=command, tool=docker_tools, check_output=True)
+        
+        Running 'pipe-of-containers' mode:
+            command= ['head -c 1M /dev/urandom | tee >(md5sum 1>&2)', 'gzip', 'gunzip', 'md5sum 1>&2']
+            work_dir=curr_work_dir
+            docker_tools=['ubuntu', 'ubuntu', 'ubuntu', 'ubuntu']
+            stdout = docker_call(work_dir=docker_work_dir, parameters=command, tool=docker_tools, check_output=True)
     """
     from toil_scripts.lib.urls import download_url
 
@@ -83,36 +112,73 @@ def docker_call(tool,
     if env:
         for e, v in env.iteritems():
             base_docker_call.extend(['-e', '{}={}'.format(e, v)])
+
     if docker_parameters:
         base_docker_call += docker_parameters
+
+    docker_call = []
+
+    run_pipe = False
 
-    _log.debug("Calling docker with %s." % " ".join(base_docker_call + [tool] + parameters))
-
-    docker_call = base_docker_call + [tool] + parameters
+    if bool(tools) == bool(tool):
+        raise Exception('Either "tool" or "tools" must contain a value, but not both.')
+    if not tools:
+        tools = [ tool ]
+    else:
+        run_pipe = True
+
+    # Pipe functionality
+    #   each element in the parameters list must represent a sub-pipe command
+    shell_flag = True      # Flag for running subprocess with string command or list command
+    if run_pipe:
+        command_list = []
+        if len(tools) > 1:
+            # If tool is a list containing multiple docker container name strings
+            #   then format the docker call in the 'pipe-of-containers' mode
+            docker_call.extend(base_docker_call + ['--entrypoint /bin/bash', tools[0], '-c \'{}\''.format(parameters[0])])
+            for i in xrange(1, len(tools)):
+                docker_call.extend(['|'] + base_docker_call + ['-i --entrypoint /bin/bash', tools[i], '-c \'{}\''.format(parameters[i])])
+            docker_call = " ".join(docker_call)
+            _log.debug("Calling docker with %s." % docker_call)
+
+        elif len(tools) == 1:
+            # If tool is a list containing a single docker container name string
+            #   then format the docker call in the 'pipe-in-single-container' mode
+            docker_call.extend(base_docker_call + ['--entrypoint /bin/bash', tools[0], '-c \'{}\''.format(" | ".join(parameters))])
+            docker_call = " ".join(docker_call)
+            _log.debug("Calling docker with %s." % docker_call)
+
+    else:        
+        docker_call = " ".join(base_docker_call + tools + parameters)
+        _log.debug("Calling docker with %s." % docker_call)
 
+
     try:
         if outfile:
-            subprocess.check_call(docker_call, stdout=outfile)
+            subprocess.check_call(docker_call, stdout=outfile, shell=shell_flag)
+        elif check_output and return_stderr:
+            return subprocess.check_output(docker_call, shell=shell_flag, stderr=subprocess.STDOUT)
+        elif check_output:
+            return subprocess.check_output(docker_call, shell=shell_flag)
+        elif return_stderr:
+            return subprocess.check_call(docker_call, stderr=subprocess.STDOUT, shell=shell_flag)
         else:
-            if check_output:
-                return subprocess.check_output(docker_call)
-            else:
-                subprocess.check_call(docker_call)
+            subprocess.check_call(docker_call, shell=shell_flag)
     # Fix root ownership of output files
     except:
         # Panic avoids hiding the exception raised in the try block
         with panic():
-            _fix_permissions(base_docker_call, tool, work_dir)
+            _fix_permissions(base_docker_call, tools, work_dir)
     else:
-        _fix_permissions(base_docker_call, tool, work_dir)
+        _fix_permissions(base_docker_call, tools, work_dir)
 
     for filename in outputs.keys():
         if not os.path.isabs(filename):
             filename = os.path.join(work_dir, filename)
         assert(os.path.isfile(filename))
 
 
-def _fix_permissions(base_docker_call, tool, work_dir):
+def _fix_permissions(base_docker_call, tools, work_dir):
     """
     Fix permission of a mounted Docker directory by reusing the tool
 
@@ -122,5 +188,15 @@ def _fix_permissions(base_docker_call, tool, work_dir):
     """
     base_docker_call.append('--entrypoint=chown')
     stat = os.stat(work_dir)
-    command = base_docker_call + [tool] + ['-R', '{}:{}'.format(stat.st_uid, stat.st_gid), '/data']
-    subprocess.check_call(command)
+    command = []
+    command_list = []
+    for tool in tools:
+        command = base_docker_call + [tool] + ['-R', '{}:{}'.format(stat.st_uid, stat.st_gid), '/data']
+        command_list.append(command)
+
+    for command in command_list:
+        subprocess.check_call(command)
+
+
+
+
diff --git a/src/toil_scripts/lib/test/test_programs.py b/src/toil_scripts/lib/test/test_programs.py
@@ -1,5 +1,5 @@
 import os
-
+import re
 
 def test_docker_call(tmpdir):
     from toil_scripts.lib.programs import docker_call
@@ -12,3 +12,23 @@ def test_docker_call(tmpdir):
     with open(fpath, 'w') as f:
         docker_call(tool='ubuntu', env=dict(foo='bar'), parameters=['printenv', 'foo'], outfile=f)
     assert open(fpath).read() == 'bar\n'
+
+    # Test pipe functionality
+    # download ubuntu docker image
+    docker_call(work_dir=work_dir, tool="ubuntu")
+    command1 = ['head -c 1G /dev/urandom | tee /data/first', 'gzip', 'gunzip', 'md5sum 1>&2']
+    command2 = ['md5sum /data/first 1>&2']
+    # Test 'pipe-in-single-container' mode
+    docker_tools1=['ubuntu']
+    stdout1 = docker_call(work_dir=work_dir, parameters=command1, tools=docker_tools1, check_output=True, return_stderr=True)
+    stdout2 = docker_call(work_dir=work_dir, parameters=command2, tool='ubuntu', check_output=True, return_stderr=True)
+    test1 = re.findall(r"([a-fA-F\d]{32})", stdout1)
+    test2 = re.findall(r"([a-fA-F\d]{32})", stdout2)
+    assert test1[0] == test2[0]
+    # Test 'pipe-of-containers' mode
+    docker_tools2=['ubuntu', 'ubuntu', 'ubuntu', 'ubuntu']
+    stdout1 = docker_call(work_dir=work_dir, parameters=command1, tools=docker_tools2, check_output=True, return_stderr=True)
+    stdout2 = docker_call(work_dir=work_dir, parameters=command2, tool='ubuntu', check_output=True, return_stderr=True)
+    test1 = re.findall(r"([a-fA-F\d]{32})", stdout1)
+    test2 = re.findall(r"([a-fA-F\d]{32})", stdout2)
+    assert test1[0] == test2[0]