1
- #!/opt/common/CentOS_6-dev/python/python-2.7.10 /bin/python
1
+ #!/ifs/work/pi/roslin-pipelines/roslin-core/2.0.6/config/variant/2.4.2/virtualenv /bin/python
2
2
3
3
import argparse , os , sys , signal , subprocess , math , gzip , io
4
4
import cmo
5
5
import multiprocessing
6
6
7
def chunk(fastq, platform_unit, lines_per_chunk):
    """Split a gzipped FASTQ into numbered chunks of lines_per_chunk lines each.

    Output files are named <prefix>.chunk<NNN>.fastq.gz, where NNN is the
    zero-padded 3-digit index produced by ``split -d --suffix-length 3``.
    If platform_unit is given it replaces the first '_'-delimited token of
    the prefix, and the prefix is re-joined with '-'.

    :param fastq:           path to the input .fastq.gz file
    :param platform_unit:   RG/PU id substituted into the output prefix, or None
    :param lines_per_chunk: lines per output chunk (callers must pass a
                            multiple of 4 so no read is split across chunks)
    :returns: True on success
    :raises subprocess.CalledProcessError: if the shell pipeline exits non-zero
    """
    logger = cmo.util.get_logger()
    output_prefix = os.path.basename(fastq).split(".", 1)[0] + ".chunk"
    if platform_unit is not None:  # idiomatic None test (was `!= None`)
        exploded = output_prefix.split("_")
        exploded[0] = platform_unit
        output_prefix = "-".join(exploded)
    logger.info("Opening %s and writing reads..." % (fastq))
    # these aren't really gz but trimgalore doesn't like files not named
    # gz... great work trimgalore
    #
    # Bug fix: os.popen() returned immediately without waiting for the
    # pipeline, so the function could report success before any chunk file
    # existed and failures (missing input, full disk) were silently ignored.
    # subprocess.check_call waits for completion and raises on a non-zero
    # exit status.  NOTE(review): paths are interpolated into a shell
    # string — callers must not pass untrusted filenames.
    subprocess.check_call(
        'zcat %s | split -l %d -d --additional-suffix=.fastq.gz'
        ' --suffix-length 3 - %s' % (fastq, lines_per_chunk, output_prefix),
        shell=True)
    return True
19
41
20
if __name__ == '__main__':
    logger = cmo.util.get_logger()
    parser = argparse.ArgumentParser(description="split files into chunks based on filesize")
    parser.add_argument('-f1', "--fastq1", action='store', help="filename to split", required=True)
    parser.add_argument('-f2', "--fastq2", action='store', help="filename2 to split")
    parser.add_argument('-p', "--platform-unit", action='store', help="RG/PU ID", required=True)
    args = parser.parse_args()
    fastqs = [args.fastq1]
    if args.fastq2:
        fastqs.append(args.fastq2)
    filesize = os.path.getsize(fastqs[0])
    logger.info("Fastq1 Filesize: %sGB" % ("{:.2f}".format(float(filesize) / 1000000000)))
    # Fixed chunk size; a multiple of 4 so a FASTQ record (4 lines) is never
    # split across two chunks (182560840 lines == 45,640,210 reads).
    lines_per_chunk = 182560840
    logger.info("%s lines per chunk" % str(lines_per_chunk))
    # One worker per mate file: fastq1 and (optionally) fastq2 are chunked
    # in parallel.
    pool = multiprocessing.Pool(processes=2)
    # Bug fix: the AsyncResult handles were discarded, so any exception
    # raised by chunk() in a worker was silently swallowed and the script
    # exited 0 with missing output.  Keep the handles and .get() them after
    # join() so worker failures are re-raised here.
    results = [pool.apply_async(chunk, args=(fastq, args.platform_unit, lines_per_chunk))
               for fastq in fastqs]
    pool.close()
    pool.join()
    for result in results:
        result.get()
65
-
66
-
67
-
68
-
69
-
70
-
71
-
72
-
73
-
74
-
75
-
76
-
77
-
0 commit comments