-
Notifications
You must be signed in to change notification settings - Fork 0
/
entities_ranks.awk
executable file
·145 lines (128 loc) · 5.43 KB
/
entities_ranks.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#! /usr/bin/gawk -f
###############################################################################
# script name: entities_ranks.awk
# path on oxygen: /data/databases/scripts/prego_statistics/
# developed by: Savvas Paragkamian
# framework: PREGO - WP4
###############################################################################
# GOAL:
# The aim of this script is to count the entities that are included in
# higher levels across sources, channels and files: phyla for taxa, the second
# level of KEGG orthology metabolism, and custom selections for ENVO and
# Biological Process.
# NOTE: this script covers ALL associations regardless of their score!!!
###############################################################################
#
# usage: ./entities_ranks.awk
# /data/dictionary/prego_unicellular_ncbi.tsv \
# /data/dictionary/ncbi/ncbi_taxonomy/nodes.dmp\
# /data/dictionary/database_preferred.tsv \
# /data/dictionary/database_groups.tsv \
# /data/experiments/database_pairs.tsv
###############################################################################
BEGIN {
    # Tab splits the input records and is also reused by the END rule to
    # join the output columns.
    FS = "\t"

    # Labels for the prokaryotic superkingdoms, keyed by their group id
    # (2 and 2157 are presumably the NCBI Taxonomy ids of Bacteria and
    # Archaea -- confirm).  Unicellular taxa that map to neither of these are
    # reported as "eukaryotes" by the rule that loads the association files.
    superkingdoms["2157"] = "archaea"
    superkingdoms["2"]    = "bacteria"
}
# First input file (prego_unicellular_ncbi.tsv in the usage above): remember
# every id found in column 2, so that the later rules count only microbes.
# Only membership in unicellular_taxa is ever tested; the value 1 is a dummy.
ARGIND == 1 { unicellular_taxa[$2] = 1 }
# Second input file (nodes.dmp in the usage above), used for the higher
# taxonomy: collect the ids ($1) of every node whose rank ($5) is "phylum" or
# "family".  A bare array reference is enough to create the element; only
# membership is tested later on.
# NOTE(review): with FS set to a single tab the "|" delimiters of nodes.dmp
# presumably occupy the even-numbered fields, which is why the rank is read
# from $5 -- confirm against the file.
ARGIND == 2 && $5 == "phylum" {
    phyla[$1]
}
ARGIND == 2 && $5 == "family" {
    families[$1]
}
# Third input file (database_preferred.tsv in the usage above): map column 2
# to column 3.  The END rule looks phylum ids up in this table to fill the
# phylum_name column (presumably $2 is the entity id and $3 its preferred
# name -- confirm against the file).
ARGIND == 3 { names[$2] = $3 }
# Fourth input file (database_groups.tsv in the usage above): each record
# links a member ($2) to a group it belongs to ($4).  The three tests below
# are independent of each other.
ARGIND == 4 {
    # Unicellular taxa that sit under bacteria or archaea get that label.
    if (($4 in superkingdoms) && ($2 in unicellular_taxa)) {
        unicellular_superkingdom[$2] = superkingdoms[$4]
    }

    # Keep the link only when the group is a phylum.  Because only phyla were
    # kept from the higher taxa, each member is expected to end up with a
    # single phylum (a later record would overwrite an earlier one).
    if ($4 in phyla) {
        child_phylum[$2] = $4
    }

    # Same for families.
    if ($4 in families) {
        child_family[$2] = $4
    }
}
# Load all the remaining files (the association files given after
# database_groups.tsv on the command line).
#
# A record is counted only when its first entity is a taxon (type -2 in $1;
# only taxa have a rank for the moment) whose id ($2) is in unicellular_taxa.
# Every such record is registered twice: once under the "all"/"all"
# aggregate and once under its own file and channel.  Text mining files do
# not have a source field, so their channel is the literal "textmining"
# (detected from the word textmining in the file path); for any other file
# the channel is read from $5.
ARGIND > 4 && ($2 in unicellular_taxa) && $1 == -2 {
    channel = (FILENAME ~ /textmining/) ? "textmining" : $5

    # Test membership before reading: merely referencing child_phylum[$2]
    # would create an empty element for every taxon without a phylum.
    phylum = ($2 in child_phylum) ? child_phylum[$2] : ""

    if ($2 in unicellular_superkingdom) {
        kingdom = unicellular_superkingdom[$2]
        # NOTE(review): the aggregate files its "no rank" row under the
        # superkingdom key "no rank", whereas the per-file rows use the real
        # superkingdom.  Kept exactly as it was -- confirm it is intended.
        count_association("all", "all", phylum, kingdom, "no rank")
        count_association(FILENAME, channel, phylum, kingdom, kingdom)
    } else {
        # Unicellular taxa outside bacteria/archaea are labelled eukaryotes;
        # no "no rank" row is written for them.
        count_association("all", "all", phylum, "eukaryotes", "")
        count_association(FILENAME, channel, phylum, "eukaryotes", "")
    }
}

# Register the current record in the global entities array.
#
#   src             first-level key: the file name or "all"
#   chan            second-level key: the channel or "all"
#   phylum          phylum id of the taxon in $2 ("" when it has none)
#   kingdom         superkingdom label of the taxon in $2
#   no_rank_kingdom superkingdom key for the extra "no rank" row of the
#                   associated entity; "" means that row is not written
#
# Reads $1..$4 of the current record.  The innermost keys are entity ids, so
# length() of the innermost array is the number of distinct entities.
function count_association(src, chan, phylum, kingdom, no_rank_kingdom) {
    # The taxon itself, per phylum and over all phyla.
    entities[src][chan][$1][phylum][kingdom][$2] = 1
    entities[src][chan][$1]["all"][kingdom][$2] = 1

    # Count the entities associated with -2. There are cases where the
    # associations are not symmetric between entities.
    entities[src][chan][$3][phylum][kingdom][$4] = 1
    if (no_rank_kingdom != "") {
        entities[src][chan][$3]["no rank"][no_rank_kingdom][$4] = 1
    }
}
# Print the statistics for each source: one tab-separated row per
# (file, channel, type, phylum, superkingdom) combination, giving the number
# of distinct entities recorded under it.  The phylum_name column is empty
# when the phylum key has no entry in names (e.g. "all" or "no rank").
# Rows come out in whatever order gawk traverses the arrays.
END {
    print "file" FS "channel" FS "type" FS "phylum_id" FS "superkingdom" FS "phylum_name" FS "no_entities"

    for (src in entities) {
        for (chan in entities[src]) {
            for (kind in entities[src][chan]) {
                for (pid in entities[src][chan][kind]) {
                    for (sk in entities[src][chan][kind][pid]) {
                        # Number of ids stored under this combination.
                        n_entities = length(entities[src][chan][kind][pid][sk])
                        row = src FS chan FS kind FS pid FS sk FS names[pid] FS n_entities
                        print row
                    }
                }
            }
        }
    }
}