forked from molgenis/ngs-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
concatFastQ.sh
129 lines (105 loc) · 3.11 KB
/
concatFastQ.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env bash
# Strict mode: stop at the first command that fails and treat the expansion
# of an unset variable as an error.
set -o errexit
set -o nounset
#
# Print the command line help (required and optional arguments with their
# documented defaults) to stdout. The text is framed by one empty line
# before and one after.
#
usage() {
  # Quoted delimiter: the text is emitted literally, nothing is expanded.
  cat <<'USAGE_TEXT'

Arguments
Required:
-n|--name name of FastQ (SequencingStartData_Sequencer_Run_Flowcell, e.g. 150803_SN163_0661_AHKYM5ADXX)
Optional:
-t|--tmp where to write intermediate files (default: /gcc/groups/gaf/tmp03/tmp/)
-p|--prm location of the rawdata/ngs directory on the permanent storage (default:/gcc/groups/gaf/prm02/rawdata/ngs)
-o|--output outputfolder (default: /gcc/groups/gaf/tmp03/rawdata/ngs/)

USAGE_TEXT
}
#
# Parse the command line with getopt (the enhanced version that supports
# --long options).
#
# Arguments: the script's command line ("$@").
# Globals written:
#   PARSED_OPTIONS  normalised option string produced by getopt
#   NAME            value of -n|--name   (only set when a non-empty value is given)
#   TMP             value of -t|--tmp    (only set when a non-empty value is given)
#   PRM             value of -p|--prm    (only set when a non-empty value is given)
#   OUTPUT          value of -o|--output (only set when a non-empty value is given)
# Exits with status 1, after printing the usage text, when getopt rejects the
# command line.
#
# NOTE: because parsing happens inside a function, the script's own positional
# parameters are left untouched; only the variables above carry the result.
#
parse_arguments() {
  #
  # Long option names must be separated by commas; each trailing ':' marks a
  # required argument. getopt runs as the condition of the 'if' so that a
  # non-zero exit status reaches the error branch below instead of aborting
  # the script silently when 'set -e' is active.
  #
  if ! PARSED_OPTIONS=$(getopt -n "$0" -o n:t:p:o: --long "name:,tmp:,prm:,output:" -- "$@"); then
    #
    # Bad arguments, something has gone wrong with the getopt command.
    #
    usage
    echo "FATAL: Wrong arguments."
    exit 1
  fi

  # getopt emits a shell-quoted string; eval restores it as positional parameters.
  eval set -- "${PARSED_OPTIONS}"

  #
  # Walk through the normalised options one at a time: $1 is the option,
  # $2 its value, and 'shift 2' moves on to the next pair. An empty value is
  # skipped so the corresponding variable stays unset.
  #
  while true; do
    case "$1" in
      -n|--name)
        [[ -z "$2" ]] || NAME=$2
        shift 2
        ;;
      -t|--tmp)
        [[ -z "$2" ]] || TMP=$2
        shift 2
        ;;
      -p|--prm)
        [[ -z "$2" ]] || PRM=$2
        shift 2
        ;;
      -o|--output)
        [[ -z "$2" ]] || OUTPUT=$2
        shift 2
        ;;
      --)
        # End-of-options marker inserted by getopt.
        shift
        break
        ;;
      *)
        echo "Internal error!"
        exit 1
        ;;
    esac
  done
}

parse_arguments "$@"
#
# Check required options were provided.
#
if [[ -z "${NAME-}" ]]; then
  usage
  echo "FATAL: missing required parameter."
  exit 1
fi

#
# Fall back to the defaults documented in the usage text for every optional
# parameter that was not supplied. A value given with -t|--tmp is honoured
# and no longer overwritten further down.
#
TMP="${TMP:-/gcc/groups/gaf/tmp03/tmp/}"
PRM="${PRM:-/gcc/groups/gaf/prm02/rawdata/ngs}"
OUTPUT="${OUTPUT:-/gcc/groups/gaf/tmp03/rawdata/ngs/}"

FASTQ="${NAME}"
FASTQDIR="${PRM}/${FASTQ}"
RAWDATATMP="${OUTPUT}/${FASTQ}_combined/"

if [[ ! -d "${RAWDATATMP}" ]]; then
  mkdir -p "${RAWDATATMP}"
  echo "mkdir -p ${RAWDATATMP}"
fi

#
# Keep the intermediate barcode lists in a private directory below TMP so
# that concurrent runs cannot clobber each other's files, and make sure the
# directory is removed on every exit path.
#
workdir=$(mktemp -d "${TMP%/}/concatFastQ.XXXXXX") || {
  echo "FATAL: cannot create a working directory in ${TMP}" >&2
  exit 1
}
trap 'rm -rf -- "${workdir}"' EXIT

#
# Collect the FastQ files with a glob (no parsing of ls output, no
# word-splitting of the directory name).
#
shopt -s nullglob
fastq_files=("${FASTQDIR}"/*.fq.gz)
shopt -u nullglob
if (( ${#fastq_files[@]} == 0 )); then
  echo "FATAL: no *.fq.gz files found in ${FASTQDIR}" >&2
  exit 1
fi

#
# Derive the barcode from each file name. The concatenation below expects
# names of the form <name>_L<lane>_<barcode>_<read>.fq.gz, so the barcode is
# what remains after stripping the directory, the "<name>_" prefix, the lane
# field and the trailing "_<read>.fq.gz". Working on the base name keeps the
# result independent of underscores in the PRM path.
#
for fastq_file in "${fastq_files[@]}"; do
  base="${fastq_file##*/}"
  base="${base#"${FASTQ}_"}"
  barcode="${base#*_}"
  barcode="${barcode%_*}"
  printf '%s\n' "${barcode}"
done > "${workdir}/allBarcodes.txt"

sort -u "${workdir}/allBarcodes.txt" > "${workdir}/allUniqBarcodes.txt"

#
# For every barcode, concatenate lane 1 and lane 2 per read direction
# (_1 first, then _2). Concatenated gzip files form a valid gzip stream.
#
while IFS= read -r barcode; do
  for read_nr in 1 2; do
    combined="${RAWDATATMP}/${FASTQ}_combined_${barcode}_${read_nr}.fq.gz"
    cat "${FASTQDIR}/${FASTQ}_L1_${barcode}_${read_nr}.fq.gz" \
        "${FASTQDIR}/${FASTQ}_L2_${barcode}_${read_nr}.fq.gz" > "${combined}"
    echo "${combined} done"
  done
done < "${workdir}/allUniqBarcodes.txt"

# Normal completion: clean up explicitly and disarm the exit trap.
rm -rf -- "${workdir}"
trap - EXIT
echo "removed intermediate files"
echo "all combined fastq files can be found in ${RAWDATATMP}"