-
Notifications
You must be signed in to change notification settings - Fork 0
/
associations_intersection.awk
executable file
·139 lines (103 loc) · 3.94 KB
/
associations_intersection.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#! /usr/bin/gawk -f
###############################################################################
# script name: associations_intersection.awk
# path on oxygen: /data/databases/scripts/prego_statistics/
# developed by: Savvas Paragkamian
# framework: PREGO - WP4
###############################################################################
# GOAL:
# The aim of this script is to count the association pairs found in all the
# prediction channels: text mining, experiments and knowledge.
# The pairs involve NCBI ids, ENVO ids and GO ids as well as their associations.
# NOTE: this script counts ALL associations regardless of their score!
###############################################################################
#
# usage:./associations_intersection.awk \
# /path/to/prego_unicellular_ncbi.tsv \
# /path/to/ncbi/ncbi_taxonomy/nodes.dmp \
# /path/to/database_pairs.tsv /path/to/database_pairs.tsv \
# /path/to/database_pairs.tsv
#
# Execution time: 56 minutes
###############################################################################
# Split every input record on tab characters.
BEGIN { FS = "\t" }
# Load the data in associative arrays.
# First file on the command line: the list of taxa to keep.
# Column 2 is used as the key, so membership can later be tested with
# "id in unicellular_taxa"; the stored value (1) is only a marker.
ARGIND == 1 {
    unicellular_taxa[$2] = 1
}
# Load the second file, the NCBI taxonomy dump file (nodes.dmp) with NCBI ids and ranks.
# Second file on the command line: remember, for every id in column 1, the
# value found in column 5 (used as the taxonomic rank when printing results).
# NOTE(review): presumably column 5 is the rank because the "|" separators of
# nodes.dmp occupy the even-numbered tab-delimited fields -- confirm.
ARGIND == 2 {
    rank[$1] = $5
}
#Load all the database pairs files from all sources and channels of PREGO
# Third and later files: the database pairs of every PREGO source/channel.
#
# Columns used: $1 = type of the first entity, $2 = id of the first entity,
#               $3 = type of the second entity, $4 = id of the second entity.
# Types seen here: -2 taxa, -27 environments, -21 biological processes.
#
# Each counter array is indexed as [channel][type of second entity][first id]
# and holds the number of associations of that id, with that partner type,
# in that channel.
(ARGIND > 2) {
    # The channel is the name of the directory that directly contains the
    # current file. It only changes when a new file starts, so it is derived
    # once per file rather than once per record. Splitting on "/" (instead of
    # an unanchored regex substitution) keeps relative paths such as
    # "./data/knowledge/database_pairs.tsv" from leaking the text before the
    # first "/" into the channel name.
    if (FNR == 1) {
        n_path_parts = split(FILENAME, path_parts, "/")
        if (n_path_parts >= 2 && path_parts[n_path_parts - 1] != "") {
            channels = path_parts[n_path_parts - 1]
        }
        else {
            # No parent directory in the path: fall back to the path itself.
            channels = FILENAME
        }
    }

    # When the second entity is a taxon, keep the pair only if that taxon is
    # unicellular; pairs whose second entity is not a taxon are always kept.
    if (($3 != -2) || ($4 in unicellular_taxa)) {
        if ($1 == "-27") {
            # environments associations
            environments_associations[channels][$3][$2]++
        }
        else if ($1 == "-21") {
            # biological processes associations
            processes_associations[channels][$3][$2]++
        }
        else if ($1 == "-2" && ($2 in unicellular_taxa)) {
            # taxa associations: the first taxon must be unicellular as well
            taxa_associations[channels][$3][$2]++
        }
    }
}
# Print the results
# Print one tab-separated row per counter stored in assoc, which is indexed
# as [channel][type of second entity][entity id]. Row layout:
#   entity type, entity id, number of associations, type of second entity,
#   channel, rank (or "na" when with_rank is false).
# The names after the extra spaces are local variables.
function print_associations(assoc, entity_type, with_rank,    chan, partner, id, last_col) {
    for (chan in assoc) {
        for (partner in assoc[chan]) {
            for (id in assoc[chan][partner]) {
                last_col = with_rank ? rank[id] : "na"
                print entity_type "\t" id "\t" assoc[chan][partner][id] "\t" partner "\t" chan "\t" last_col
            }
        }
    }
}

# Report taxa first (with their rank), then environments, then processes.
END {
    print_associations(taxa_associations, "-2", 1)
    print_associations(environments_associations, "-27", 0)
    print_associations(processes_associations, "-21", 0)
}