Skip to content

Commit

Permalink
Add HDF5 serialization, tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
AntonOresten committed Oct 2, 2024
1 parent 7233496 commit 0d29d96
Show file tree
Hide file tree
Showing 8 changed files with 134 additions and 26 deletions.
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ version = "0.3.0"
AssigningSecondaryStructure = "8ed43e74-60fb-4e11-99b9-91deed37aef7"
Backboner = "9ac9c2a2-1cfe-46d3-b3fd-6fa470ea56a7"
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
Backboner = "0.12"
BioStructures = "4"
HDF5 = "0.17"
LinearAlgebra = "1"
PeriodicTable = "1"
julia = "1"
Expand Down
2 changes: 2 additions & 0 deletions src/ProteinChains.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,6 @@ export writecif, writepdb
export PDBFormat, MMCIFFormat
export pdbentry, @pdb_str

include("h5.jl")

end
20 changes: 11 additions & 9 deletions src/atom.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
using PeriodicTable: elements

const ELEMENT_SYMBOL_TO_ATOMIC_NUMBER = Dict(uppercase(elements[atomic_number].symbol) => atomic_number for atomic_number in 1:118)
const ATOMIC_NUMBER_TO_ELEMENT_SYMBOL = Dict(n => s for (s, n) in ELEMENT_SYMBOL_TO_ATOMIC_NUMBER)
const ELEMENT_SYMBOL_TO_number = Dict(uppercase(elements[number].symbol) => number for number in 1:118)
const number_TO_ELEMENT_SYMBOL = Dict(n => s for (s, n) in ELEMENT_SYMBOL_TO_number)

element_symbol_to_atomic_number(element_symbol::AbstractString) = ELEMENT_SYMBOL_TO_ATOMIC_NUMBER[uppercase(strip(element_symbol))]
atomic_number_to_element_symbol(atomic_number::Integer) = ATOMIC_NUMBER_TO_ELEMENT_SYMBOL[atomic_number]
element_symbol_to_number(element_symbol::AbstractString) = ELEMENT_SYMBOL_TO_number[uppercase(strip(element_symbol))]
number_to_element_symbol(number::Integer) = number_TO_ELEMENT_SYMBOL[number]

pad_atom_name(name::AbstractString, element_symbol::AbstractString) = rpad(" "^(2-length(strip(element_symbol)))*strip(name), 4)

Expand All @@ -13,19 +13,21 @@ decode_atom_name(name::UInt32) = String(reinterpret(UInt8, [name]))

struct Atom{T<:AbstractFloat}
name::UInt32
atomic_number::Int8
number::Int8
x::T
y::T
z::T
end

Atom(name::UInt32, atomic_number::Integer, x::T, y::T, z::T) where T = Atom{T}(name, atomic_number, x, y, z)
Atom{T}(atom::Atom) where T = Atom(atom.name, atom.number, T(atom.x), T(atom.y), T(atom.z))

Atom(name::UInt32, number::Integer, x::T, y::T, z::T) where T = Atom{T}(name, number, x, y, z)
Atom(name::AbstractString, element_symbol::AbstractString, coords::AbstractVector{T}) where T =
Atom(encode_atom_name(name, element_symbol), element_symbol_to_atomic_number(element_symbol), coords...)
Atom(encode_atom_name(name, element_symbol), element_symbol_to_number(element_symbol), coords...)

coords(atom::Atom) = SVector(atom.x, atom.y, atom.z)

Base.summary(atom::Atom) = "$(elements[atom.atomic_number].name) atom at [$(atom.x), $(atom.y), $(atom.z)])"
Base.summary(atom::Atom) = "$(elements[atom.number].name) atom at [$(atom.x), $(atom.y), $(atom.z)])"

Base.show(io::IO, atom::Atom{T}) where T = print(io,
"Atom(\"$(decode_atom_name(atom.name))\", \"$(atomic_number_to_element_symbol(atom.atomic_number))\", $(coords(atom)))")
"Atom(\"$(decode_atom_name(atom.name))\", \"$(number_to_element_symbol(atom.number))\", $(coords(atom)))")
22 changes: 13 additions & 9 deletions src/chain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,13 @@ function Base.show(io::IO, chain::ProteinChain)
print(io, "ProteinChain(")
for fieldname in fieldnames(ProteinChain)
show(io, getproperty(chain, fieldname))
fieldname == :properties || print(", ")
fieldname == :properties || print(io, ", ")
end
print(")")
print(io, ")")
end

function Base.show(io::IO, ::MIME"text/plain", chain::ProteinChain)
print(io, summary(chain))
end

function get_atoms(backbone_coords::Array{T,3}) where T
Expand Down Expand Up @@ -94,16 +98,16 @@ end
using AssigningSecondaryStructure: assign_secondary_structure

calculate_property(x, name::Symbol, args...) = calculate_property(x, Val(name), args...)
annotate(chain::ProteinChain, names::Vararg{Symbol}) = annotate(chain; NamedTuple{names}(calculate_property(chain, name) for name in names)...)

calculate_property(chain::ProteinChain, ::Val{:backbone}) = get_backbone(chain) |> ResidueProperty
calculate_property(chain::ProteinChain, ::Val{:secondary_structure}) = Int8.(assign_secondary_structure(get_backbone(chain))) |> ResidueProperty
calculate_property(chain::ProteinChain, ::Val{:residue_rotations}) = Backboner.Frames(Backbone(get_backbone(chain)), chain.ideal_residue).rotations |> ResidueProperty
calculate_property(chain::ProteinChain, ::Val{:residue_translations}) = dropdims(Backboner.centroid(get_backbone(chain); dims=2); dims=2) |> ResidueProperty
calculate_property(chain::ProteinChain{T}, ::Val{:residue_torsion_angles}) where T = [reshape(calculate_property(chain, :torsion_angles)[], 3, :) fill(T(NaN), 3)] |> ResidueProperty

calculate_property(chain::ProteinChain, ::Val{:ideal_residue}) = collect(STANDARD_RESIDUE) |> ChainProperty
calculate_property(chain::ProteinChain, ::Val{:bond_lengths}) = Backboner.get_bond_lengths(Backboner.Backbone(get_backbone(chain))) |> ChainProperty
calculate_property(chain::ProteinChain, ::Val{:bond_angles}) = Backboner.get_bond_angles(Backboner.Backbone(get_backbone(chain))) |> ChainProperty
calculate_property(chain::ProteinChain, ::Val{:torsion_angles}) = Backboner.get_torsion_angles(Backboner.Backbone(get_backbone(chain))) |> ChainProperty
calculate_property(chain::ProteinChain, ::Val{:is_knotted}) = Backboner.is_knotted(Backboner.Backbone(get_backbone(chain)[:,2,:])) |> ChainProperty

annotate(chain::ProteinChain, names::Vararg{Symbol}) = annotate(chain; NamedTuple{names}(calculate_property(chain, name) for name in names)...)
calculate_property(chain::ProteinChain, ::Val{:backbone}) = get_backbone(chain) |> ResidueProperty
calculate_property(chain::ProteinChain, ::Val{:secondary_structure}) = Int8.(assign_secondary_structure(get_backbone(chain))) |> ResidueProperty
calculate_property(chain::ProteinChain, ::Val{:residue_rotations}) = Backboner.Frames(Backbone(get_backbone(chain)), hasproperty(chain, :ideal_residue) ? chain.ideal_residue : STANDARD_RESIDUE).rotations |> ResidueProperty
calculate_property(chain::ProteinChain, ::Val{:residue_translations}) = dropdims(Backboner.centroid(get_backbone(chain); dims=2); dims=2) |> ResidueProperty
calculate_property(chain::ProteinChain{T}, ::Val{:residue_torsion_angles}) where T = [reshape(calculate_property(chain, :torsion_angles)[], 3, :) fill(T(NaN), 3)] |> ResidueProperty
95 changes: 95 additions & 0 deletions src/h5.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import HDF5

function Base.write(parent::Union{HDF5.File,HDF5.Group}, chain::ProteinChain{T}, path::AbstractString=chain.id) where T
chain_group = HDF5.create_group(parent, path)
HDF5.attributes(chain_group)["T"] = string(T)

chain_group["id"] = chain.id
chain_group["atom_chunk_sizes"] = map(UInt8length, chain.atoms)
chain_group["atoms_flattened"] = reduce(vcat, chain.atoms)
chain_group["sequence"] = chain.sequence
chain_group["numbering"] = map(UInt32, chain.numbering)

indexable = HDF5.create_group(chain_group, "indexable")
persistent = HDF5.create_group(chain_group, "persistent")
for (name, property) in pairs(chain.properties)
g = property isa ResidueProperty ? indexable : persistent
g[string(name)] = property[]
end

return parent
end

function Base.read(group::Union{Union{HDF5.File,HDF5.Group}}, ::Type{ProteinChain})
T = eval(Symbol(read(HDF5.attributes(group)["T"])))
id = read(group["id"])
atom_chunk_sizes = read(group["atom_chunk_sizes"])
atoms_flattened = [Atom(nt.name, nt.number, nt.x, nt.y, nt.z) for nt in read(group["atoms_flattened"])]
sequence = read(group["sequence"])
numbering = read(group["numbering"])

atoms = Vector{Atom{T}}[]
for (i, k) in zip(Iterators.accumulate(+, atom_chunk_sizes), atom_chunk_sizes)
residue_atoms = atoms_flattened[i-k+1:i]
push!(atoms, residue_atoms)
end

indexable = group["indexable"]
persistent = group["persistent"]
properties = merge(
NamedTuple((Symbol(key) => ResidueProperty(read(indexable[key])) for key in keys(group["indexable"]))),
NamedTuple((Symbol(key) => ChainProperty(read(persistent[key])) for key in keys(group["persistent"])))
)

ProteinChain(id, atoms, sequence, numbering, properties)
end

function Base.write(parent::Union{HDF5.File,HDF5.Group}, structure::ProteinStructure{T}, path::AbstractString=structure.name) where T
structure_group = HDF5.create_group(parent, path)
HDF5.attributes(structure_group)["T"] = string(T)

structure_group["name"] = structure.name
structure_group["atoms"] = structure.atoms
structure_group["numbering"] = structure.numbering

chains_group = HDF5.create_group(structure_group, "chains")

for (chain, number) in zip(structure.chains, structure.numbering)
write(chains_group, chain, string(number))
end

return parent
end

function Base.read(group::Union{Union{HDF5.File,HDF5.Group}}, ::Type{ProteinStructure})
T = eval(Symbol(read(HDF5.attributes(group)["T"])))
name = read(group["name"])
atoms = [Atom(nt.name, nt.number, nt.x, nt.y, nt.z) for nt in read(group["atoms"])]
numbering = read(group["numbering"])

chains_group = group["chains"]
chains = ProteinChain{T}[]
for i in numbering
chain = read(chains_group[string(i)], ProteinChain)
push!(chains, chain)
end

return ProteinStructure(name, atoms, chains, numbering)
end

function serialize(path::AbstractString, structures::AbstractVector{<:ProteinStructure}; mode="w")
HDF5.h5open(path, mode) do f
foreach(structure -> write(f, structure), structures)
end
return path
end

function deserialize(path::AbstractString)
structures = ProteinStructure[]
HDF5.h5open(path, "r") do f
for key in keys(f)
push!(structures, read(f[key], ProteinStructure))
end
end
return structures
end
12 changes: 6 additions & 6 deletions src/io/read.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,17 @@ function ProteinChain(residues::Vector{BioStructures.AbstractResidue})
return ProteinChain(id, atoms, sequence, numbering)
end

function ProteinChain(chain::BioStructures.Chain, selector=backbone_residue_selector)
residues = BioStructures.collectresidues(chain, selector)
isempty(residues) && return ProteinChain(BioStructures.chainid(chain), "", zeros(3, 3, 0), Int[], Vector{Atom{Float64}}[])
function ProteinChain(chain::BioStructures.Chain, backbone_residue_selector=backbone_residue_selector)
residues = BioStructures.collectresidues(chain, backbone_residue_selector)
isempty(residues) && return ProteinChain(BioStructures.chainid(chain), Vector{Atom{Float64}}[], "", Int[])
return ProteinChain(residues)
end

function ProteinStructure(struc::BioStructures.MolecularStructure, selector=backbone_residue_selector; mmcifdict=nothing)
function ProteinStructure(struc::BioStructures.MolecularStructure, backbone_residue_selector=backbone_residue_selector; mmcifdict=nothing)
proteinchains = ProteinChain{Float64}[]
atoms = Atom.(BioStructures.collectatoms(BioStructures.collectresidues(struc, !selector)))
atoms = Atom.(BioStructures.collectatoms(BioStructures.collectresidues(struc, !backbone_residue_selector)))
for model in struc, chain in model
push!(proteinchains, ProteinChain(chain, selector))
push!(proteinchains, ProteinChain(chain, backbone_residue_selector))
end
proteinstructure = ProteinStructure(struc.name, atoms, proteinchains)
mmcifdict isa BioStructures.MMCIFDict && renumber!(proteinstructure, mmcifdict)
Expand Down
2 changes: 1 addition & 1 deletion src/io/write.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ function BioStructures.Chain(proteinchain::ProteinChain, model::BioStructures.Mo
atom_serial += 1
name = decode_atom_name(atom.name)
coords = [atom.x, atom.y, atom.z]
element = atomic_number_to_element_symbol(atom.atomic_number)
element = number_to_element_symbol(atom.number)
atom = BioStructures.Atom(atom_serial, name, ' ', coords, 1.0, 0.0, element, " ", residue)
push!(atom_list, atom.name)
atoms[atom.name] = atom
Expand Down
5 changes: 4 additions & 1 deletion src/structure.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,7 @@ function map_atoms!(f::Function, structure::ProteinStructure, args...)
structure.atoms[i] = f(structure.atoms[i], args...)
end
return structure
end
end

annotate(structure::ProteinStructure, names::Vararg{Symbol}) =
ProteinStructure(structure.name, structure.atoms, annotate.(structure.chains, names...), structure.numbering)

0 comments on commit 0d29d96

Please sign in to comment.