forked from SEACrowd/seacrowd-datahub
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtree.py
115 lines (109 loc) · 3 KB
/
tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""\
Tree Schema
This schema assumes a document whose elements are nodes with subnodes,
arranged in a tree hierarchy.
For example:
                        SUBNODE1 - word1
                      //
            NODE1 - SUBNODE2 - word2
          //
    ROOT - NODE2 - SUBNODE3 - word3
          \\
            NODE3 - SUBNODE4 - word4
                      \\
                        SUBNODE5 - word5
Schema structure:
"id": sentence_id,
"passage": {
"id": sentence_id,
"type": None,
"text": "word1 word2 word3 word4 word5",
"offsets": [0, 29]
},
"nodes": [
{
"id": 0,
"type": ROOT,
"text": "word1 word2 word3 word4 word5",
"offsets": [0, 29],
"subnodes": [1, 2, 3]
},
{
"id": 1,
"type": NODE1,
"text": "word1 word2",
"offsets": [0, 11],
"subnodes": [4, 5]
},
{
"id": 2,
"type": NODE2,
"text": "word3",
"offsets": [12, 17],
"subnodes": [6]
},
{
"id": 3,
"type": NODE3,
"text": "word4 word5",
"offsets": [18, 29],
"subnodes": [7, 8]
},
{
"id": 4,
"type": SUBNODE1,
"text": "word1",
"offsets": [0, 5],
"subnodes": []
},
{
"id": 5,
"type": SUBNODE2,
"text": "word2",
"offsets": [6, 11],
"subnodes": []
},
{
"id": 6,
"type": SUBNODE3,
"text": "word3",
"offsets": [12, 17],
"subnodes": []
},
{
"id": 7,
"type": SUBNODE4,
"text": "word4",
"offsets": [18, 23],
"subnodes": []
},
{
"id": 8,
"type": SUBNODE5,
"text": "word5",
"offsets": [24, 29],
"subnodes": []
}
]
"""
import datasets
def _scalar(dtype):
    """Return a new scalar feature of the given dtype."""
    return datasets.Value(dtype)


def _sequence_of(dtype):
    """Return a new sequence feature whose items have the given dtype."""
    return datasets.Sequence(datasets.Value(dtype))


# Passage-level fields. "text" is declared as a sequence of strings and
# "offsets" as a sequence of int32 values.
_passage_schema = {
    "id": _scalar("string"),
    "type": _scalar("string"),
    "text": _sequence_of("string"),
    "offsets": _sequence_of("int32"),
}

# Fields of a single tree node. "subnodes" holds the ids of the node's
# subnodes, stored as strings like the node's own "id".
_node_schema = {
    "id": _scalar("string"),
    "type": _scalar("string"),
    "text": _scalar("string"),
    "offsets": _sequence_of("int32"),
    "subnodes": _sequence_of("string"),
}

# Full tree schema: a record id, the passage, and a list of nodes.
# Key order is kept the same as the field order inside each mapping above.
features = datasets.Features(
    {
        "id": _scalar("string"),
        "passage": _passage_schema,
        "nodes": [_node_schema],
    }
)