-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
123 lines (98 loc) · 4.18 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# ╭───────────────────────────────────────────────────────────────────────────╮
# METAGENOMIC SEARCHES FOR PRD1-LIKE PHAGES IN WASTEWATER
#
# author: Natalia Quinones-Olvera
# email: [email protected]
# ╰───────────────────────────────────────────────────────────────────────────╯
import pandas as pd
import os
# -----------------------------------------------------------------------------
# setup
# -----------------------------------------------------------------------------
# main directories
# main directories:
# BAYM_DIR holds persistent project data; SCRATCH_DIR is scratch storage
# for the bulky downloaded read files.
BAYM_DIR = 'data/'
SCRATCH_DIR = '/n/scratch3/users/n/nq10/wastewater/'


def main_dir(child):
    """Return the path of *child* inside the persistent data directory."""
    joined = os.path.join(BAYM_DIR, child)
    return joined


def scra_dir(child):
    """Return the path of *child* inside the scratch directory."""
    joined = os.path.join(SCRATCH_DIR, child)
    return joined
# samples to run:
# the metadata table has one row per SRA run; its boolean 'snakemake'
# column flags which runs this workflow should process.
# (read_csv with sep='\t' is the modern equivalent of the legacy
# read_table alias, where the extra sep argument was redundant.)
df = pd.read_csv(main_dir('wastewater_metadata.tsv'), sep='\t')

# run accessions of the flagged rows; this Series drives the expand()
# calls in rule all below.
samples = df.loc[df['snakemake'], 'run_acc']
# -----------------------------------------------------------------------------
# RULE ALL
# -----------------------------------------------------------------------------
# Target rule: the workflow is done when, for every selected sample, both
# mate files of alphatectivirus-matching reads and the per-sample kraken
# summary table exist under the persistent data directory.
rule all:
    input:
        expand(main_dir('kraken_results/fastq/{sample}_1.alphatecti.fastq'), sample=samples),
        expand(main_dir('kraken_results/fastq/{sample}_2.alphatecti.fastq'), sample=samples),
        expand(main_dir('kraken_results/summary/{sample}.summary.tsv'), sample=samples)
# -----------------------------------------------------------------------------
# sra_download:
#     download read files from SRA, given run accession from dataframe
# -----------------------------------------------------------------------------
rule sra_download:
    output:
        # original paired-end read files from SRA, written to scratch
        # (the _1/_2 mate split is produced by fasterq-dump)
        r1 = scra_dir('reads/{sample}_1.fastq'),
        r2 = scra_dir('reads/{sample}_2.fastq')
    params:
        # output dir: scratch3 — used for both the temp files (-t)
        # and the final fastq output (-O)
        outdir = scra_dir('reads')
    conda:
        'envs/sratools.yml'
    shell:
        # {wildcards.sample} is the SRA run accession taken from the
        # metadata table's 'run_acc' column
        'fasterq-dump '\
        '-t {params.outdir} '\
        '-O {params.outdir} '\
        '{wildcards.sample}'
# -----------------------------------------------------------------------------
# kraken_viral:
#     run kraken with viral/tecti database
# -----------------------------------------------------------------------------
rule kraken_viral:
    input:
        # paired reads downloaded by sra_download
        r1 = scra_dir('reads/{sample}_1.fastq'),
        r2 = scra_dir('reads/{sample}_2.fastq')
    output:
        # per-read classifications (captured from stdout) and the
        # per-taxon report (--report)
        kraken_results = main_dir('kraken_results/{sample}.kraken'),
        kraken_report = main_dir('kraken_results/{sample}.report')
    params:
        # custom viral/tecti kraken2 database
        db = main_dir('kraken_db/ww_viral')
    conda:
        'envs/kraken2.yml'
    shell:
        # kraken2 writes per-read results to stdout, hence the redirect
        'kraken2 '\
        '--paired '\
        '--report {output.kraken_report} '\
        '--db {params.db} '\
        '{input.r1} '\
        '{input.r2} '\
        '> {output.kraken_results}'
# -----------------------------------------------------------------------------
# extract_reads:
#     extract reads matching alphatectivirus by kraken
# -----------------------------------------------------------------------------
rule extract_reads:
    input:
        # original reads plus the per-read kraken classifications
        r1 = scra_dir('reads/{sample}_1.fastq'),
        r2 = scra_dir('reads/{sample}_2.fastq'),
        kraken_results = main_dir('kraken_results/{sample}.kraken')
    output:
        # mate files restricted to reads classified under the target
        # taxids, plus a per-sample summary table
        r1_extract = main_dir('kraken_results/fastq/{sample}_1.alphatecti.fastq'),
        r2_extract = main_dir('kraken_results/fastq/{sample}_2.alphatecti.fastq'),
        summary = main_dir('kraken_results/summary/{sample}.summary.tsv')
    params:
        # table of taxids to extract (presumably alphatectivirus ids —
        # see scripts/get_kraken_reads.py for the expected format)
        taxids = main_dir('taxids.tsv')
    conda:
        'envs/mykrakentools.yml'
    shell:
        # custom script: filters both mates by kraken classification
        # and writes a summary of matched read counts
        'python scripts/get_kraken_reads.py '\
        '-1 {input.r1} '\
        '-2 {input.r2} '\
        '-o1 {output.r1_extract} '\
        '-o2 {output.r2_extract} '\
        '-t {params.taxids} '\
        '-s {output.summary} '\
        '{input.kraken_results}'