-
Notifications
You must be signed in to change notification settings - Fork 0
/
associations_statistics.awk
executable file
·108 lines (88 loc) · 3.66 KB
/
associations_statistics.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#! /usr/bin/gawk -f
###############################################################################
# script name: associations_statistics.awk
# path on oxygen: /data/databases/scripts/prego_statistics
# developed by: Savvas Paragkamian
# framework: PREGO - WP4
###############################################################################
# GOAL:
# Aim of this script is to calculate the contents of the associations pairs
# from all the prediction channels, text mining, experiments and knowledge.
# There are terms of NCBI ids, ENVO ids and GO ids, as well as their associations.
# NOTE: this script is for ALL associations regardless of their score!!!
###############################################################################
#
# usage:
# ./associations_statistics.awk /data/dictionary/prego_unicellular_ncbi.tsv \
# /data/dictionary/ncbi/ncbi_taxonomy/nodes.dmp \
# /data/textmining/database_pairs.tsv /data/experiments/database_pairs.tsv \
# /data/knowledge/database_pairs.tsv
# NOTE this script doesn't take into account score
###############################################################################
# Every input file is read as tab-separated; set the separator before any
# record is parsed.
BEGIN {
    FS = "\t"
}
# First file on the command line: remember the id found in the second column
# of every record. The later rules test membership in this array so that only
# these (microbial) NCBI ids are counted in the per-rank statistics.
ARGIND == 1 {
    unicellular_taxa[$2] = 1
}
# Second file on the command line (the NCBI taxonomy dump with ids and ranks):
# map the first field of each record (the id) to its fifth field (the rank).
ARGIND == 2 {
    rank[$1] = $5
}
# All remaining files are database_pairs files from the PREGO sources.
# For every record the association is counted twice:
#   * globally, keyed by type 1 ($1) and type 2 ($3);
#   * per file and channel, keyed by file name, channel, type 1 and type 2.
# When type 1 is -2 and the id in $2 is one of the loaded unicellular taxa,
# the same two counts are also kept per taxonomic rank of that id.
ARGIND > 2 {
    # The textmining database_pairs file has no channel field, so a file whose
    # name matches "textmining" gets a fixed channel label; every other file
    # takes its channel from the fifth field.
    file = FILENAME
    channel_label = (file ~ /textmining/) ? "textmining" : $5

    # counts over all records, regardless of taxonomy
    total_associations[$1][$3]++
    associations[file][channel_label][$1][$3]++

    # taxonomy counts
    if ($1 == -2 && ($2 in unicellular_taxa)) {
        taxon_rank = rank[$2]
        total_associations_taxonomy[$1][$3][taxon_rank]++
        associations_taxonomy[file][channel_label][$1][$3][taxon_rank]++
    }
}
# Print one report row: the six column values joined by the field separator.
function print_count_row(col_file, col_channel, col_type1, col_type2, col_taxonomy, col_count) {
    print col_file FS col_channel FS col_type1 FS col_type2 FS col_taxonomy FS col_count
}

# After all input is read, write the report: a header row, then the counts
# over all files ("all"/"all"), then the counts per file and channel.
# Each type pair gets a "total" row; pairs whose type 1 is -2 additionally
# get one row per taxonomic rank.
END {
    print_count_row("file", "channel", "type 1", "type 2", "taxonomy", "# associations")

    # totals across every file and channel
    for (kind_a in total_associations) {
        for (kind_b in total_associations[kind_a]) {
            print_count_row("all", "all", kind_a, kind_b, "total", total_associations[kind_a][kind_b])
            # the rank breakdown exists only for type 1 == -2
            if (kind_a != -2)
                continue
            for (rank_name in total_associations_taxonomy[kind_a][kind_b])
                print_count_row("all", "all", kind_a, kind_b, rank_name, total_associations_taxonomy[kind_a][kind_b][rank_name])
        }
    }

    # the same report, split by input file and channel
    for (src in associations) {
        for (chan in associations[src]) {
            for (kind_a in associations[src][chan]) {
                for (kind_b in associations[src][chan][kind_a]) {
                    print_count_row(src, chan, kind_a, kind_b, "total", associations[src][chan][kind_a][kind_b])
                    # the rank breakdown exists only for type 1 == -2
                    if (kind_a != -2)
                        continue
                    for (rank_name in associations_taxonomy[src][chan][kind_a][kind_b])
                        print_count_row(src, chan, kind_a, kind_b, rank_name, associations_taxonomy[src][chan][kind_a][kind_b][rank_name])
                }
            }
        }
    }
}