-
Notifications
You must be signed in to change notification settings - Fork 24
/
metadata.jl
236 lines (204 loc) · 8.13 KB
/
metadata.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import Dates: Date, DateTime
"""NumPy array protocol type string (typestr) format
A string providing the basic type of the homogeneous array. The basic string format
consists of 3 parts: a character describing the byteorder of the data
(<: little-endian, >: big-endian, |: not-relevant), a character code giving the basic
type of the array, and an integer providing the number of bytes the type uses.
https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
"""
include("MaxLengthStrings.jl")
using .MaxLengthStrings: MaxLengthString
# One-byte character type; `typemap` pairs it with the NumPy `S1` typestr.
primitive type ASCIIChar <: AbstractChar 8 end

# Wrap a raw byte as an `ASCIIChar` without touching its bits.
function ASCIIChar(byte::UInt8)
    return reinterpret(ASCIIChar, byte)
end

# Any other integer is first converted to `UInt8`.
function ASCIIChar(n::Integer)
    return ASCIIChar(UInt8(n))
end

# Recover the raw byte stored in an `ASCIIChar`.
function UInt8(c::ASCIIChar)
    return reinterpret(UInt8, c)
end

# The code point of an `ASCIIChar` is its byte value.
Base.codepoint(c::ASCIIChar) = UInt8(c)

# Display the character by printing its `Char` equivalent.
function Base.show(io::IO, c::ASCIIChar)
    return print(io, Char(c))
end

# The zero element (for a value or for the type) is the NUL byte.
Base.zero(::Union{ASCIIChar,Type{ASCIIChar}}) = ASCIIChar(0x00)
using Dates: Period, TimeType, Date, DateTime, Dates
import Base.==
"""
    DateTime64{P}

Time stamp stored as an `Int64` count `i` of periods of type `P` (for example
`Dates.Day` or `Dates.Millisecond`) counted from the start of 1970. It is the element
type produced for NumPy `datetime64` (`M8`) typestrs.
"""
struct DateTime64{P} <: TimeType
    i::Int64
end

# Conversions to the Dates types add the stored period count to the 1970 epoch.
Base.convert(::Type{Date}, t::DateTime64{P}) where P = Date(1970) + P(t.i)
Base.convert(::Type{DateTime}, t::DateTime64{P}) where P = DateTime(1970) + P(t.i)

# Printed as `DateTime64[<period type>]: <DateTime representation>`.
Base.show(io::IO, t::DateTime64{P}) where P =
    print(io, "DateTime64[", P, "]: ", string(DateTime(t)))

# Values sharing the same period type are ordered and compared by their raw counts.
Base.isless(x::DateTime64{P}, y::DateTime64{P}) where P = isless(x.i, y.i)
==(x::DateTime64{P}, y::DateTime64{P}) where P = x.i == y.i

# Pairs of (Dates period type, NumPy datetime64 unit code).
strpairs = [Dates.Year => "Y", Dates.Month => "M", Dates.Week => "W", Dates.Day => "D",
            Dates.Hour => "h", Dates.Minute => "m", Dates.Second => "s",
            Dates.Millisecond => "ms", Dates.Microsecond => "us", Dates.Nanosecond => "ns"]

# Unit code => period type, used when parsing a typestr.
const jlperiod = Dict{String,Any}()
# Period type => unit code, used when writing a typestr.
const pdt64string = Dict{Any, String}()
for (period, unit) in strpairs
    jlperiod[unit] = period
    pdt64string[period] = unit
end

# Conversions into DateTime64 express the offset from the 1970 epoch in units of `P`.
Base.convert(::Type{DateTime64{P}}, t::Date) where P =
    DateTime64{P}(Dates.value(P(t - Date(1970))))
Base.convert(::Type{DateTime64{P}}, t::DateTime) where P =
    DateTime64{P}(Dates.value(P(t - DateTime(1970))))
# Re-express a count of `Q` periods as a count of `P` periods.
Base.convert(::Type{DateTime64{P}}, t::DateTime64{Q}) where {P,Q} =
    DateTime64{P}(Dates.value(P(Q(t.i))))

# The zero element is a count of 0. For a value, go through its type: calling the
# instance itself (`t(0)`) is not defined and would throw a MethodError.
Base.zero(t::DateTime64) = zero(typeof(t))
Base.zero(::Type{T}) where {T<:DateTime64} = T(0)
# Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{Date}) = Date
# Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{DateTime}) = DateTime
# Base.promote_rule(::Type{<:DateTime64{<:Dates.TimePeriod}}, ::Type{Date}) = DateTime
# Base.promote_rule(::Type{<:DateTime64{<:Dates.TimePeriod}}, ::Type{DateTime}) = DateTime
# Typestr for a Julia element type: byte-order character, type code, size in bytes.
# Types without a more specific method get the `V` code.
typestr(t::Type) = "<V$(sizeof(t))"
# Element types that admit `missing` are described by their non-missing part.
typestr(t::Type{>:Missing}) = typestr(Base.nonmissingtype(t))
typestr(t::Type{Bool}) = "<b$(sizeof(t))"
typestr(t::Type{<:Signed}) = "<i$(sizeof(t))"
typestr(t::Type{<:Unsigned}) = "<u$(sizeof(t))"
typestr(t::Type{Complex{T}} where T<:AbstractFloat) = "<c$(sizeof(t))"
typestr(t::Type{<:AbstractFloat}) = "<f$(sizeof(t))"
# For strings the number after the code is the maximum length `N`, not a byte count.
typestr(::Type{MaxLengthString{N,UInt32}}) where N = "<U$(N)"
typestr(::Type{MaxLengthString{N,UInt8}}) where N = "<S$(N)"
# Arrays as elements are written as object arrays.
typestr(::Type{<:Array}) = "|O"
# datetime64: fixed 8-byte size plus the unit code looked up from the period type.
typestr(::Type{<:DateTime64{P}}) where P = string("<M8[", pdt64string[P], "]")
# Pattern for a typestr. Capture groups, in order: byte-order character (`<`, `|` or `>`),
# one-letter type code, size digits (the pattern allows them to be empty) and an
# optional bracketed suffix such as `[ns]`.
const typestr_regex = r"^([<|>])([tbiufcmMOSUV])(\d*)(\[\w+\])?$"
# Lookup table from (type code, size) to the Julia element type. Only the three
# single-unit entries are listed here; the numeric types are added right after the
# helper functions below.
const typemap = Dict{Tuple{Char, Int}, DataType}(
('b', 1) => Bool,
('S', 1) => ASCIIChar,
('U', 1) => Char,
)
# Size component of the `typemap` key for a numeric type: its size in bytes.
sizemapf(x::Type{<:Number}) = sizeof(x)
# Complex typestrs count the bytes of the whole value (real + imaginary part), which is
# also what `typestr(::Type{<:Complex})` writes. Using the component size here made
# "<c8" resolve to Complex{Float64} instead of Complex{Float32} and left "<c16" unmapped.
sizemapf(x::Type{<:Complex{T}}) where T = sizeof(x)

# Type-code component of the `typemap` key for a numeric type.
typecharf(::Type{<:Signed}) = 'i'
typecharf(::Type{<:Unsigned}) = 'u'
typecharf(::Type{<:AbstractFloat}) = 'f'
typecharf(::Type{<:Complex}) = 'c'
# Register every supported numeric type under its (type code, size) key.
for numtype in (Float16, Float32, Float64,
                Int8, Int16, Int32, Int64, Int128,
                UInt8, UInt16, UInt32, UInt64, UInt128,
                Complex{Float16}, Complex{Float32}, Complex{Float64})
    typemap[(typecharf(numtype), sizemapf(numtype))] = numtype
end
"""
    typestr(s::AbstractString, filterlist=nothing)

Parse the NumPy typestr `s` and return the corresponding Julia element type.

- Object arrays (`O`) return `Vector{sourcetype(first(filterlist))}` and therefore
  need a `filterlist`.
- `U`/`S` with a size greater than 1 return a `MaxLengthString` of that length, backed
  by `UInt32` (`U`) or `UInt8` (`S`).
- `M8[unit]` returns `DateTime64{P}` with `P` looked up from the unit.
- Everything else is looked up in `typemap` by (type code, size).

# Throws
- `ArgumentError` if `s` does not match the typestr pattern, has no size, is
  big-endian, is an object array without `filterlist`, or is a datetime64 without a
  unit.
- `KeyError` if the (type code, size) pair or the datetime unit is not in the lookup
  tables.
"""
function typestr(s::AbstractString, filterlist=nothing)
    m = match(typestr_regex, s)
    if m === nothing
        throw(ArgumentError("$s is not a valid numpy typestr"))
    end
    byteorder, typecode, typesize, typespec = m.captures
    if byteorder == ">"
        throw(ArgumentError("Big-endian data not yet supported"))
    end
    if typecode == "O"
        if filterlist === nothing
            throw(ArgumentError("Object array can only be parsed when an appropriate filter is defined"))
        end
        return Vector{sourcetype(first(filterlist))}
    end
    isempty(typesize) && throw(ArgumentError("$s is not a valid numpy typestr"))
    # Single-character type code and integer size used as lookup key below.
    tc, ts = first(typecode), parse(Int, typesize)
    if (tc in ('U', 'S')) && ts > 1
        return MaxLengthString{ts, tc == 'U' ? UInt32 : UInt8}
    end
    if tc == 'M' && ts == 8
        # We have a datetime64 value. The unit group is optional in the pattern, so it
        # can be absent; report that as an invalid typestr instead of failing inside
        # `String(nothing)`.
        if typespec === nothing
            throw(ArgumentError("$s is not a valid numpy typestr: datetime64 requires a unit, e.g. <M8[ns]"))
        end
        # Strip the surrounding brackets to get the unit code.
        return DateTime64{jlperiod[String(typespec)[2:end-1]]}
    end
    return typemap[(tc, ts)]
end
"""Metadata configuration of the stored array

Each array requires essential configuration metadata to be stored, enabling correct
interpretation of the stored data. This metadata is encoded using JSON and stored as the
value of the “.zarray” key within an array store.

Type parameters: `T` is the element type, `N` the number of dimensions, `C` the
compressor type and `F` the type of the filters field.

https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata
"""
struct Metadata{T, N, C, F}
    zarr_format::Int
    # Wrapped in a `Ref`; presumably so the shape can be updated in place on this
    # immutable struct -- confirm against callers.
    shape::Ref{NTuple{N, Int}}
    chunks::NTuple{N, Int}
    dtype::String # structured data types not yet supported
    compressor::C
    fill_value::Union{T, Nothing}
    order::Char
    filters::F # not yet supported
end

#To make unit tests pass with ref shape
import Base.==
"""
    ==(m1::Metadata, m2::Metadata)

Field-by-field equality. The shape is compared by the value held in the `Ref`, not by
the identity of the `Ref`. Fill values also count as equal when they are `isequal`, so
two records with a `NaN` fill value compare equal even though `NaN != NaN`.
"""
function ==(m1::Metadata, m2::Metadata)
    m1.zarr_format == m2.zarr_format &&
    m1.shape[] == m2.shape[] &&
    m1.chunks == m2.chunks &&
    m1.dtype == m2.dtype &&
    m1.compressor == m2.compressor &&
    (isequal(m1.fill_value, m2.fill_value) || m1.fill_value == m2.fill_value) &&
    m1.order == m2.order &&
    m1.filters == m2.filters
end
"""
    Metadata(A::AbstractArray, chunks; zarr_format=2, compressor=BloscCompressor(),
             fill_value=nothing, order='C', filters=nothing)

Build the `Metadata` for array `A` with chunk sizes `chunks`. The shape is `size(A)` and
the dtype string is `typestr(eltype(A))`. When a `fill_value` is given, the element-type
parameter becomes `Union{T,Missing}`; otherwise it is `T`.
"""
function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int};
        zarr_format::Integer=2,
        compressor::C=BloscCompressor(),
        fill_value::Union{T, Nothing}=nothing,
        order::Char='C',
        filters::Nothing=nothing
    ) where {T, N, C}
    if fill_value === nothing
        elemtype = T
    else
        elemtype = Union{T,Missing}
    end
    return Metadata{elemtype, N, C, typeof(filters)}(
        zarr_format, size(A), chunks, typestr(eltype(A)),
        compressor, fill_value, order, filters
    )
end
# Parse JSON from a string or stream and build the Metadata from the resulting dictionary.
function Metadata(s::Union{AbstractString, IO})
    return Metadata(JSON.parse(s))
end
"""
    Metadata(d::AbstractDict)

Build `Metadata` from a dictionary with the keys `"compressor"`, `"dtype"`, `"shape"`,
`"chunks"`, `"fill_value"`, `"zarr_format"` and `"order"` (filters are obtained via
`getfilters(d)`). `"shape"` and `"chunks"` are stored in reversed order. When the decoded
fill value is not `nothing`, the element-type parameter becomes `Union{T,Missing}`.
"""
function Metadata(d::AbstractDict)
    compressor = getCompressor(d["compressor"])
    filters = getfilters(d)
    elemtype = typestr(d["dtype"], filters)
    ndim = length(d["shape"])
    fillval = fill_value_decoding(d["fill_value"], elemtype)
    paramtype = fillval === nothing ? elemtype : Union{elemtype,Missing}

    format = d["zarr_format"]
    shape = reverse(NTuple{ndim, Int}(d["shape"]))
    chunks = reverse(NTuple{ndim, Int}(d["chunks"]))
    dtype = d["dtype"]
    # Only the first character of the order entry is kept.
    order = first(d["order"])

    return Metadata{paramtype, ndim, typeof(compressor), typeof(filters)}(
        format, shape, chunks, dtype, compressor, fillval, order, filters,
    )
end
"""
    JSON.lower(md::Metadata)

Describe how `Metadata` is lowered to a JSON-compatible dictionary, used in
`json(::Metadata)`. Shape and chunks are written in reversed order and the fill value
goes through `fill_value_encoding`.
"""
function JSON.lower(md::Metadata)
    shape = reverse(md.shape[])
    chunks = reverse(md.chunks)
    fillval = fill_value_encoding(md.fill_value)
    return Dict{String, Any}(
        "zarr_format" => md.zarr_format,
        "shape" => shape,
        "chunks" => chunks,
        "dtype" => md.dtype,
        "compressor" => md.compressor,
        "fill_value" => fillval,
        "order" => md.order,
        "filters" => md.filters
    )
end
# Fill value encoding and decoding as described in
# https://zarr.readthedocs.io/en/stable/spec/v2.html#fill-value-encoding

# No fill value stays `nothing`; values without a special rule pass through unchanged.
fill_value_encoding(::Nothing) = nothing
fill_value_encoding(v) = v

# Non-finite floats are written as the strings "NaN", "Infinity" and "-Infinity";
# finite floats pass through unchanged.
function fill_value_encoding(v::AbstractFloat)
    isnan(v) && return "NaN"
    isinf(v) || return v
    return v > 0 ? "Infinity" : "-Infinity"
end
# The element type of a Metadata record is its first type parameter.
function Base.eltype(::Metadata{T}) where {T}
    return T
end
# Decode a fill value `v` read from JSON into the element type `T`.
# String-encoded numbers go through `parse`;
# this correctly parses "NaN" and "Infinity"
fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v)
# A missing fill value stays `nothing` for every element type.
fill_value_decoding(v::Nothing, ::Any) = v
# Everything else is converted by calling the element type as a constructor.
fill_value_decoding(v, T) = T(v)
# For ASCIIChar data an empty string means "no fill value".
# NOTE(review): a non-empty `v` is returned unconverted here -- confirm this is intended.
fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v
# Resolves the ambiguity between the `(::Nothing, ::Any)` and `(::Any, ::Type{ASCIIChar})`
# methods, which otherwise makes `fill_value_decoding(nothing, ASCIIChar)` a MethodError.
fill_value_decoding(v::Nothing, ::Type{ASCIIChar}) = v