-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
91 lines (67 loc) · 3.67 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
__author__ = "Johannes Alneberg"
__license__ = "MIT"
import os
import glob
import sys
configfile: "config.json"
megahit_coassembly_contigs = "assembly/BARM.fa.gz"
if not os.path.isfile(megahit_coassembly_contigs):
sys.stderr.write("No assembly file {} found. Please download and place the BARM assembly on the intended location\n".format(megahit_coassembly_contigs))
sys.exit("-1")
sample_groups = {}
config["bowtie2_quant_rules"]["samples"] = {}
config["bowtie2_quant_rules"]["rna_samples"] = []
# Add reads for mapping
found_reads = False
for i, read_file in enumerate(glob.glob("finished_reads/*na/*/*.fq.gz")):
found_reads = True
read_basename = os.path.basename(read_file)
read_name = read_basename.replace(".fq.gz", "")
sample_name = read_name.replace("_R1", "").replace("_R2", "")
sample_type = read_file.split('/')[1]
if sample_type not in ['dna','rna']:
continue
elif sample_type == 'rna':
# RNA-samples have slightly different handling
config["bowtie2_quant_rules"]["rna_samples"].append(sample_name)
# Add sample to sample group
sample_group = read_file.split('/')[2]
if sample_group not in sample_groups:
sample_groups[sample_group] = []
if sample_name not in sample_groups[sample_group]:
sample_groups[sample_group].append(sample_name)
if sample_name in config["bowtie2_quant_rules"]["samples"]:
config["bowtie2_quant_rules"]["units"][sample_name].append(read_file)
config["bowtie2_quant_rules"]["units"][sample_name].sort()
else:
config["bowtie2_quant_rules"]["units"][sample_name] = [read_file]
config["bowtie2_quant_rules"]["samples"][sample_name] = [sample_name]
if not found_reads:
sys.stderr.write("No read files were found, please place them in the path 'finished_reads/{d,r}na/<sample_group>/<sample_name>.fq.gz'\n")
sys.exit(-1)
else:
sys.stderr.write("Found {} read files in {} sample groups\n".format(i+1, len(sample_groups)))
config["bowtie2_quant_rules"]["sample_groups"] = sample_groups
config["bowtie2_quant_rules"]["references"]["BARM"] = megahit_coassembly_contigs
config["bowtie2_quant_rules"]["reference_for_ref_set"]["BARM"] = "BARM"
config["bowtie2_quant_rules"]["count_units"] = ['raw_counts', 'tpm']
config["bowtie2_rules"]["references"] = config["bowtie2_quant_rules"]["references"]
config["bowtie2_rules"]["units"] = config["bowtie2_quant_rules"]["units"]
config["bowtie2_rules"]["samples"] = config["bowtie2_quant_rules"]["samples"]
config["bowtie2_rules"]["mapping_params"] = config["bowtie2_quant_rules"]["mapping_params"]
config["prodigal_rules"] = {}
config["prodigal_rules"]["dbs"] = ['EggNOG', 'dbCAN', 'EC', 'pfam']
config["taxonomic_annotation"] = {}
config["taxonomic_annotation"]["sample_sets"] = config['bowtie2_quant_rules']["split_ref_sets"]
WORKFLOW_DIR = "snakemake-workflows/"
include: os.path.join(WORKFLOW_DIR, "bio/ngs/rules/mapping/bowtie2.rules")
include: os.path.join(WORKFLOW_DIR, "bio/ngs/rules/mapping/samtools.rules")
include: os.path.join(WORKFLOW_DIR, "bio/ngs/rules/quantification/bowtie2.rules")
# The index is large, need to use .bt2l indices
ruleorder: bowtie2_map_large > bowtie2_map
rule link_quantification:
input: "quantification/bowtie2_genes/local/BARM/annotations/{sample_group}/{annotation}.{normalization}.annotated.tsv.gz"
output: "summary_tables/{sample_group}/{annotation}.{normalization}.annotated.tsv.gz"
shell: "ln -s {input} {output}"
rule quantify_all:
input: expand("summary_tables/{sample_group}/{annotation}.{normalization}.annotated.tsv.gz", sample_group = sample_groups, annotation = config["prodigal_rules"]["dbs"], normalization = config["bowtie2_quant_rules"]["count_units"])