-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathwes_pipeline.sh
executable file
·240 lines (201 loc) · 9.67 KB
/
wes_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/bin/bash
# Gregory Way 2017
# pdx_exomeseq
# wes_pipeline.sh
#
# Pipeline for tumor-only WES variant calling. Each script submits a job to the
# DISCOVERY compute cluster at Dartmouth College and will run the specified
# step for all samples

# Fail fast: abort on the first failed command (-e), error on unset variables
# (-u), and propagate failures through pipelines (-o pipefail) so a broken
# submission step cannot silently cascade into later pipeline stages.
# NOTE(review): -e only catches failures of the submission commands themselves,
# not of the cluster jobs they launch — job success must still be checked via
# the scheduler.
set -euo pipefail

###################
# STEP 1 - Quality Control
###################
# FastQC: per-sample read-quality reports on the raw FASTQ files in 'data'
python pdx_exomeseq.py fastqc --input_directory 'data' \
--output_directory 'results/fastqc_raw' \
--walltime '01:30:00' --nodes 1 --cores 4
# Run MultiQC on FastQC results to aggregate per-sample reports into one HTML
python pdx_exomeseq.py multiqc --input_directory 'results/fastqc_raw/' \
--html_file 'results/multiqc_report_raw.html'
# Run TrimGalore to cut adapters and filter low quality reads
# TrimGalore is run in `--paired` mode, which performs additional filtering on
# low-quality read pairs between samples. Both pairs of the reads must have
# greater than 20 high quality sequences between them.
python pdx_exomeseq.py trimgalore --input_directory 'data' \
--output_directory 'processed/trimmed' \
--fastqc_results_dir 'results/fastqc_trimmed/' \
--walltime '04:00:00' --nodes 1 --cores 4
# Run MultiQC again on Trimmed FastQC results output from the trimgalore step
python pdx_exomeseq.py multiqc --input_directory 'results/fastqc_trimmed/' \
--html_file 'results/multiqc_report_trimmed.html'
###################
# STEP 2 - Alignment
###################
# Align reads to human reference genome (g1k_v37)
python pdx_exomeseq.py bwa --genome 'hg' --input_directory 'processed/trimmed' \
--output_directory 'processed/sam' \
--walltime '05:00:00' --nodes 1 --cores 8
# We also need to align reads to mouse genome (mm9) to disambiguate species
# (the mouse alignments feed the ngs-disambiguate step in STEP 3, which
# separates human tumor reads from mouse reads)
python pdx_exomeseq.py bwa --genome 'mm' --input_directory 'processed/trimmed' \
--output_directory 'processed/sam_mouse' \
--walltime '05:00:00' --nodes 1 --cores 8
# NOTE: to save space, delete intermediate files after sorting and compression
# (the SAM outputs here are superseded by the BAMs produced in STEP 3)
###################
# STEP 3 - Remove mouse reads with ngs-disambiguate
###################
# Sort SAM and convert to BAM (the 'sort_name' subcommand indicates sorting by
# read name; presumably required by the disambiguate step below — confirm in
# pdx_exomeseq.py)
python pdx_exomeseq.py samtools --sub_command 'sort_name' --genome 'hg' \
--input_directory 'processed/sam' \
--output_directory 'processed/bam' \
--walltime '06:00:00' --nodes 2 --cores 12
# Also need to sort SAM and convert to BAM for mouse
python pdx_exomeseq.py samtools --sub_command 'sort_name' --genome 'mm' \
--input_directory 'processed/sam_mouse' \
--output_directory 'processed/bam_mouse' \
--walltime '03:00:00' --nodes 2 --cores 12
# Use ngs_disambiguate to identify high confidence human-based reads by
# comparing each read pair's human alignment against its mouse alignment
python pdx_exomeseq.py disambiguate \
--human_dir 'processed/bam' --mouse_dir 'processed/bam_mouse' \
--output_directory 'processed/bam_disambiguate' \
--walltime '06:00:00' --nodes 2 --cores 8
###################
# STEP 4A - Data Conversion and Processing (after disambiguation)
###################
# Prep for duplicate removal by cleaning up readpair tags
python pdx_exomeseq.py samtools --sub_command 'fixmate' \
--input_directory 'processed/bam_disambiguate' \
--output_directory 'processed/bam_fixmate' \
--walltime '02:30:00' --nodes 2 --cores 4
# Prep for duplicate removal by sorting tagged bam files by position
python pdx_exomeseq.py samtools --sub_command 'sort_position' \
--input_directory 'processed/bam_fixmate' \
--output_directory 'processed/bam_sort_position' \
--walltime '02:00:00' --nodes 2 --cores 8
# Remove duplicate reads
python pdx_exomeseq.py samtools --sub_command 'rmdup' \
--input_directory 'processed/bam_sort_position' \
--output_directory 'processed/bam_rmdup' \
--walltime '03:30:00' --nodes 1 --cores 4
# Create BAM index for duplicate removal
# (input and output directories are the same: the .bai index files are written
# alongside the BAMs they index)
python pdx_exomeseq.py samtools --sub_command 'index_bam' \
--input_directory 'processed/bam_rmdup' \
--output_directory 'processed/bam_rmdup' \
--walltime '03:30:00' --nodes 1 --cores 4
# Assign read groups (required metadata for the GATK tools used in STEP 5)
python pdx_exomeseq.py variant --sub_command 'add_read_groups' \
--input_directory 'processed/bam_rmdup' \
--output_directory 'processed/gatk_bam' \
--walltime '03:00:00' --nodes 1 --cores 8
###################
# STEP 4B - Data Conversion and Processing (human only for 004-M and 005-M)
###################
# This branch mirrors STEP 4A and the merged-sample portion of STEP 5, but
# starts from the human alignments in 'processed/bam' (skipping the mouse
# disambiguation of STEP 3) and passes --humanonly to every command.
# NOTE(review): presumably --humanonly restricts processing to the 004-M and
# 005-M samples named in the header — confirm in pdx_exomeseq.py.
# Prep for duplicate removal by cleaning up readpair tags
python pdx_exomeseq.py samtools --sub_command 'fixmate' \
--humanonly \
--input_directory 'processed/bam' \
--output_directory 'processed/bam_fixmate_humanonly' \
--walltime '02:30:00' --nodes 2 --cores 4
# Prep for duplicate removal by sorting tagged bam files by position
python pdx_exomeseq.py samtools --sub_command 'sort_position' \
--humanonly \
--input_directory 'processed/bam_fixmate_humanonly' \
--output_directory 'processed/bam_sort_position_humanonly' \
--walltime '02:00:00' --nodes 2 --cores 8
# Remove duplicate reads
python pdx_exomeseq.py samtools --sub_command 'rmdup' \
--humanonly \
--input_directory 'processed/bam_sort_position_humanonly' \
--output_directory 'processed/bam_rmdup_humanonly' \
--walltime '03:30:00' --nodes 1 --cores 4
# Create BAM index for duplicate removal (indexes written alongside the BAMs)
python pdx_exomeseq.py samtools --sub_command 'index_bam' \
--humanonly \
--input_directory 'processed/bam_rmdup_humanonly' \
--output_directory 'processed/bam_rmdup_humanonly' \
--walltime '03:30:00' --nodes 1 --cores 4
# Merge bam files
python pdx_exomeseq.py samtools --sub_command 'merge' \
--humanonly \
--input_directory 'processed/bam_rmdup_humanonly' \
--output_directory 'processed/gatk_merged_bam_humanonly' \
--walltime '05:00:00' --nodes 1 --cores 8
# Assign read groups (required metadata for the GATK tools below)
python pdx_exomeseq.py variant --sub_command 'add_read_groups' \
--humanonly \
--input_directory 'processed/gatk_merged_bam_humanonly' \
--output_directory 'processed/gatk_merged_rg_bam_humanonly' \
--walltime '03:00:00' --nodes 1 --cores 8
# Create merged index files
python pdx_exomeseq.py samtools --sub_command 'index_bam_gatk' \
--humanonly \
--input_directory 'processed/gatk_merged_rg_bam_humanonly' \
--output_directory 'processed/gatk_merged_rg_bam_humanonly' \
--walltime '05:00:00' --nodes 1 --cores 8
# Run MuTect2 on merged bams to call somatic variants for the humanonly set
python pdx_exomeseq.py variant --sub_command 'mutect2' \
--input_directory 'processed/gatk_merged_rg_bam_humanonly' \
--output_directory 'processed/gatk_merged_vcf_humanonly' \
--walltime '10:00:00' --nodes 2 --cores 8
###################
# STEP 5 - Variant Calling
###################
# NOTE: Realigning around indels was NOT performed. They are legacy functions
# that are no longer best-practices for GATK HaplotypeCaller pipelines
# (see https://software.broadinstitute.org/gatk/blog?id=7847)
# 1) Call Variants using each replicate individually
# Create index for read group files (indexes are written alongside the BAMs,
# hence identical input and output directories)
python pdx_exomeseq.py samtools --sub_command 'index_bam_gatk' \
--input_directory 'processed/gatk_bam' \
--output_directory 'processed/gatk_bam' \
--walltime '03:30:00' --nodes 1 --cores 4
# Call variants with MuTect2 (tumor-only mode; one VCF per replicate)
python pdx_exomeseq.py variant --sub_command 'mutect2' \
--input_directory 'processed/gatk_bam' \
--output_directory 'results/gatk_vcf' \
--walltime '05:00:00' --nodes 1 --cores 8
# 2) Call Variants with a merged file across replicates of the same sample
python pdx_exomeseq.py samtools --sub_command 'merge' \
--input_directory 'processed/bam_rmdup' \
--output_directory 'processed/gatk_merged_bam' \
--walltime '05:00:00' --nodes 1 --cores 8
# Assign read groups to merged bam (required metadata for GATK)
python pdx_exomeseq.py variant --sub_command 'add_read_groups' \
--input_directory 'processed/gatk_merged_bam' \
--output_directory 'processed/gatk_merged_rg_bam' \
--walltime '04:00:00' --nodes 1 --cores 8
# Create merged index files
python pdx_exomeseq.py samtools --sub_command 'index_bam_gatk' \
--input_directory 'processed/gatk_merged_rg_bam' \
--output_directory 'processed/gatk_merged_rg_bam' \
--walltime '05:00:00' --nodes 1 --cores 8
# Run MuTect2 on merged bams
python pdx_exomeseq.py variant --sub_command 'mutect2' \
--input_directory 'processed/gatk_merged_rg_bam' \
--output_directory 'processed/gatk_merged_vcf' \
--walltime '10:00:00' --nodes 2 --cores 8
###################
# STEP 6 - Sequencing Summary Statistics
###################
# mosdepth: depth-of-coverage summaries for the merged, read-grouped BAMs
python pdx_exomeseq.py mosdepth \
--input_directory 'processed/gatk_merged_rg_bam' \
--output_directory 'results/wes_stats' \
--walltime '05:00:00' --nodes 2 --cores 8
# samtools flagstat: read counts and mapping statistics per merged BAM
python pdx_exomeseq.py samtools --sub_command 'flagstat' \
--input_directory 'processed/gatk_merged_rg_bam' \
--output_directory 'results/read_counts' \
--walltime '01:00:00' --nodes 1 --cores 4
###################
# STEP 7 - Annotate Variants
###################
# First, download ANNOVAR and associated databases
# Guide: http://annovar.openbioinformatics.org/en/latest/user-guide/startup/
# Db: https://github.com/WGLab/doc-ANNOVAR/blob/master/user-guide/download.md
# The databases we will use are:
# refGene,cosmic70,gnomad_exome,dbnsfp30a
# First use `convert2annovar` to convert MuTect2 derived VCF files to annovar
# compatible files and then use `table_annovar` to add annotations as columns
# to the converted VCF
# Annotate the per-replicate VCFs (default mode of the annotation script)
python scripts/annotate_variants.py
# Annotate variants in merged vcf files
python scripts/annotate_variants.py --merged
# Also, make sure to run ANNOVAR on the human only samples
# (the VCFs produced by the humanonly branch in STEP 4B)
python scripts/annotate_variants.py --humanonly