Skip to content

Commit a8bb40c

Browse files
committed
Merge branch 'master' of https://github.com/mskcc/cmo
2 parents 6badb9c + afb1f51 commit a8bb40c

File tree

2 files changed

+13
-48
lines changed

2 files changed

+13
-48
lines changed

bin/cmo_split_reads

+8-46
Original file line number | Diff line number | Diff line change
@@ -1,41 +1,20 @@
1-
#!/opt/common/CentOS_6-dev/python/python-2.7.10/bin/python
1+
#!/ifs/work/pi/roslin-pipelines/roslin-core/2.0.6/config/variant/2.4.2/virtualenv/bin/python
22

33
import argparse, os, sys, signal, subprocess, math, gzip, io
44
import cmo
55
import multiprocessing
66

7-
def chunk(fastq, platform_unit, lines_per_chunk, num_pieces):
7+
def chunk(fastq, platform_unit, lines_per_chunk):
88
logger = cmo.util.get_logger()
9-
output_prefix = os.path.basename(fastq).split(".", 1)[0] + "."
9+
output_prefix = os.path.basename(fastq).split(".", 1)[0] + ".chunk"
1010
if(platform_unit != None):
1111
exploded = output_prefix.split("_")
1212
exploded[0]=platform_unit
1313
output_prefix = "-".join(exploded)
14-
while lines_per_chunk % 4 != 0:
15-
lines_per_chunk +=1
16-
fh = io.BufferedReader(gzip.open(fastq, "rb"))
17-
output_file_count = 0
18-
output_file_lines = 0
14+
# output_prefix = output_prefix+"chunk"
15+
logger.info("Opening %s and writing reads..." % (fastq))
1916
#these aren't really gz but trimgalore doesn't like files not named gz...great work trimgalore
20-
filename = output_prefix + "chunk{:0>3d}".format(output_file_count) + ".fastq.gz"
21-
logger.info("Opening %s and writing reads..." % (filename))
22-
ofh = gzip.open(filename, "wb", 1)
23-
lines = list()
24-
for line in os.popen("zcat < " + fastq):
25-
lines.append(line)
26-
output_file_lines+=1
27-
if output_file_lines == lines_per_chunk:
28-
if(output_file_count < int(num_pieces)-1):
29-
output_file_lines=0
30-
ofh.write("".join(lines))
31-
ofh.close()
32-
lines = list()
33-
output_file_count +=1
34-
filename = output_prefix + "chunk{:0>3d}".format(output_file_count) + ".fastq.gz"
35-
logger.info("Opening %s and writing reads..." % (filename))
36-
ofh = gzip.open(filename, "wb", 1)
37-
ofh.write("".join(lines))
38-
ofh.close()
17+
os.popen('zcat %s | split -l %d -d --additional-suffix=.fastq.gz --suffix-length 3 - %s' % (fastq, lines_per_chunk, output_prefix))
3918
return True
4019

4120

@@ -44,34 +23,17 @@ if __name__ =='__main__':
4423
parser = argparse.ArgumentParser(description="split files into chunks based on filesize")
4524
parser.add_argument('-f1', "--fastq1", action='store', help="filename to split", required=True)
4625
parser.add_argument('-f2', "--fastq2", action='store', help="filename2 to split")
47-
# parser.add_argument('-s', "--sample", action='store', help="sample ID", required=True)
4826
parser.add_argument('-p', "--platform-unit", action='store', help="RG/PU ID", required=True)
4927
args = parser.parse_args()
5028
fastqs = [args.fastq1]
5129
if args.fastq2:
5230
fastqs.append(args.fastq2)
5331
filesize = os.path.getsize(fastqs[0])
5432
logger.info("Fastq1 Filesize: %sGB" % ("{:.2f}".format(float(filesize)/1000000000)))
55-
num_pieces = math.ceil(float(filesize)/2800000000)
56-
logger.info("Splitting into %s pieces" % "{:.0f}".format(num_pieces))
57-
num_lines = sum(1 for line in os.popen("zcat " + fastqs[0]))
58-
lines_per_chunk = math.ceil(float(num_lines) / int(num_pieces))
33+
lines_per_chunk = 182560840
5934
logger.info("%s lines per chunk" % str(lines_per_chunk))
6035
pool=multiprocessing.Pool(processes=2)
6136
for fastq in fastqs:
62-
result = pool.apply_async(chunk, args=(fastq, args.platform_unit, lines_per_chunk, num_pieces, ))
37+
result = pool.apply_async(chunk, args=(fastq, args.platform_unit, lines_per_chunk ))
6338
pool.close()
6439
pool.join()
65-
66-
67-
68-
69-
70-
71-
72-
73-
74-
75-
76-
77-

bin/cmo_vcf2maf

+5-2
Original file line number | Diff line number | Diff line change
@@ -28,9 +28,10 @@ if __name__ =='__main__':
2828
preparser.add_argument("--version", help="Version of tool to run", choices=cmo.util.programs['vcf2maf'].keys(), default="default")
2929
preparser.add_argument("--vep-release", help="Version of VEP and its cache to use", choices=cmo.util.programs['vep'].keys(), default="default")
3030
preparser.add_argument("--species", help="Species of variants in input", choices=["homo_sapiens","mus_musculus"], default="homo_sapiens")
31-
preparser.add_argument("--ncbi-build", help="Genome build of variants in input", choices=["GRCh37","GRCh38","GRCm38"], default="GRCh37")
31+
preparser.add_argument("--ncbi-build", help="Genome build of variants in input", choices=["GRCh37","GRCh38","GRCm38","GRCh37_mm10"], default="GRCh37")
3232
options, _ = preparser.parse_known_args()
33-
33+
if options.ncbi_build == 'GRCh37_mm10':
34+
options.ncbi_build = 'GRCh37'
3435
# Figure out the path to the actual Perl script that this Python wrapper will run
3536
script_path = cmo.util.programs['vcf2maf'][options.version] + "vcf2maf.pl"
3637
# Extract arguments and their defaults, by parsing the --help output
@@ -62,6 +63,8 @@ if __name__ =='__main__':
6263

6364
# Now run the argparse instance, which will parse and execute, or print help text if requested
6465
args = parser.parse_args()
66+
if args.ncbi_build == 'GRCh37_mm10':
67+
args.ncbi_build = 'GRCh37'
6568
args_dict = vars(args)
6669

6770
# Unless user-defined, assume that sample IDs are the same in VCF and MAF

0 commit comments

Comments
 (0)