Skip to content

Commit aa6f24f

Browse files
authored
Split reads (#24)
* helper script to index at start of module3 (could be used for all modules) * increment version for cmo_index, etc. * initial commit of cmo_index * logging * cmo_split_reads now opens and emits gzips; slow because it uses native Python gzip instead of subprocess zcat * add logger function * use io.BufferedReader for speed improvement on gzip * increment version * version increment AGAIN
1 parent 2f999e8 commit aa6f24f

File tree

2 files changed

+6
-6
lines changed

2 files changed

+6
-6
lines changed

bin/cmo_split_reads

+5-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/opt/common/CentOS_6-dev/python/python-2.7.10/bin/python
22

3-
import argparse, os, sys, signal, subprocess, math, gzip
3+
import argparse, os, sys, signal, subprocess, math, gzip, io
44
import cmo
55

66
if __name__ =='__main__':
@@ -16,19 +16,19 @@ if __name__ =='__main__':
1616
logger.info("Fastq1 Filesize: %sGB" % ("{:.2f}".format(float(filesize)/1000000000)))
1717
num_pieces = math.ceil(float(filesize)/350000000)
1818
logger.info("Splitting into %s pieces" % "{:.0f}".format(num_pieces))
19-
num_lines = sum(1 for line in gzip.open(fastqs[0]))
19+
num_lines = sum(1 for line in io.BufferedReader(gzip.open(fastqs[0])))
2020
lines_per_chunk = math.ceil(float(num_lines) / int(num_pieces))
2121
logger.info("%s lines per chunk" % str(lines_per_chunk))
2222
for fastq in fastqs:
2323
output_prefix = os.path.basename(fastq).split(".", 1)[0] + "."
2424
while lines_per_chunk % 4 != 0:
2525
lines_per_chunk +=1
26-
fh = gzip.open(fastq, "rb")
26+
fh = io.BufferedReader(gzip.open(fastq, "rb"))
2727
output_file_count = 0
2828
output_file_lines = 0
2929
filename = output_prefix + "chunk{:0>3d}".format(output_file_count) + ".fastq.gz"
3030
logger.info("Opening %s and writing reads..." % (filename))
31-
ofh = gzip.open(filename, "wb")
31+
ofh = io.BufferedWriter(gzip.open(filename, "wb"))
3232
while(1):
3333
line = fh.readline()
3434
if not line:
@@ -42,7 +42,7 @@ if __name__ =='__main__':
4242
output_file_count +=1
4343
filename = output_prefix + "chunk{:0>3d}".format(output_file_count) + ".fastq.gz"
4444
logger.info("Opening %s and writing reads..." % (filename))
45-
ofh = gzip.open(filename, "wb")
45+
ofh = io.BufferedWriter(gzip.open(filename, "wb"))
4646
ofh.close()
4747

4848

cmo/_version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
# This file is originally generated from Git information by running 'setup.py
33
# version'. Distribution tarballs contain a pre-generated copy of this file.
44

5-
__version__ = '1.2.0'
5+
__version__ = '1.3.0'

0 commit comments

Comments
 (0)