Skip to content

Commit

Permalink
Add ProteinDataset, fix atom chunk size accumulation integer overflow
Browse files Browse the repository at this point in the history
  • Loading branch information
AntonOresten committed Oct 3, 2024
1 parent 0d29d96 commit bc36e46
Show file tree
Hide file tree
Showing 10 changed files with 89 additions and 59 deletions.
46 changes: 16 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,51 +16,37 @@ Pkg.add("ProteinChains")

## Examples

The `ProteinChain` type is meant to only store a set of quintessential fields, from which most other properties can be derived.
The `ProteinChain` type is meant to only store a basic set of fields, from which some other properties might be derived.

```julia
julia> using ProteinChains

julia> structure = pdb"1EYE" # string macro to fetch proteins from the PDB
[ Info: Downloading file from PDB: 1EYE
1-chain ProteinStructure "1EYE.cif"
256-residue ProteinChain{Float64} (A)
256-residue ProteinChain{Float64, @NamedTuple{}} (A)

julia> propertynames(chain)
(:id, :sequence, :backbone, :numbering, :atoms)
(:id, :atoms, :sequence, :numbering)
```
To store additional properties, `AnnotatedProteinChain` can be used to add dynamic properties to the chain:
To store additional properties, `annotate` can be used to attach persistent chain-level properties or indexable residue-level properties:
```julia
julia> annotated_chain = annotate(chain; model=1)
256-residue AnnotatedProteinChain{Float64} (A):
6 fields:
id::String = "A"
sequence::String = <exceeds max length>
backbone::Array{Float64,3} = <exceeds max length>
numbering::Vector{Int64} = <exceeds max length>
atoms::Vector{Vector{ProteinChains.Atom{Float64}}} = <exceeds max length>
indexable_properties::Vector{Symbol} = Symbol[]
1 property:
model::Int64 = 1
```
julia> chain = structure["A"]
256-residue ProteinChain{Float64, @NamedTuple{}} (A)

For properties of type `<:AbstractArray` that represent residue-level information, `annotate_indexable!` will index the last dimension of the property when the chain is indexed:
julia> annotated_chain = annotate(chain; taxid=ChainProperty(83332))
256-residue ProteinChain{Float64, @NamedTuple{taxid::ChainProperty{Int64}}} (A)

```julia
julia> annotate_indexable!(annotated_chain; secondary_structure=assign_secondary_structure(annotated_chain)
256-residue AnnotatedProteinChain{Float64} (A):
6 fields:
id::String = "A"
sequence::String = <exceeds max length>
backbone::Array{Float64,3} = <exceeds max length>
numbering::Vector{Int64} = <exceeds max length>
atoms::Vector{Vector{ProteinChains.Atom{Float64}}} = <exceeds max length>
indexable_properties::Vector{Symbol} = [:secondary_structure]
2 properties:
model::Int64 = 1
secondary_structure::Vector{Int64} = [1, 1, 3, 3, 3, 3, 3, 3, 3, 1 2, 2, 2, 2, 2, 2, 2, 1, 1, 1]
julia> annotated_chain = annotate(annotated_chain; some_residue_property=ResidueProperty(rand(3,256))) # last dimension gets indexed
256-residue ProteinChain{Float64, @NamedTuple{ORGANISM_TAXID::ChainProperty{Int64}, some_residue_property::ResidueProperty{Matrix{Float64}}}} (A)

julia> annotated_chain[1:100].some_residue_property
3×100 Matrix{Float64}:
0.273545 0.639173 0.92708 0.459441 0.196407 0.880034
0.981498 0.70263 0.279264 0.552049 0.89274 0.0328866
0.169268 0.117848 0.732741 0.301921 0.187094 0.281187
```
## See also
Expand Down
7 changes: 5 additions & 2 deletions src/ProteinChains.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,28 @@ export prepend_residue
include("atom.jl")
export Atom
export encode_atom_name, decode_atom_name
export coords

include("properties.jl")
export ChainProperty, ResidueProperty

include("chain.jl")
export ProteinChain
export annotate
export map_atoms!
export psi_angles, omega_angles, phi_angles
export get_atoms, get_backbone

include("structure.jl")
export ProteinStructure

include("dataset.jl")
export ProteinDataset

include("io/io.jl")
export readcif, readpdb
export writecif, writepdb
export PDBFormat, MMCIFFormat
export pdbentry, @pdb_str

include("h5.jl")

end
16 changes: 9 additions & 7 deletions src/atom.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
using PeriodicTable: elements

const ELEMENT_SYMBOL_TO_number = Dict(uppercase(elements[number].symbol) => number for number in 1:118)
const number_TO_ELEMENT_SYMBOL = Dict(n => s for (s, n) in ELEMENT_SYMBOL_TO_number)
const ELEMENT_SYMBOL_TO_NUMBER = Dict(uppercase(elements[number].symbol) => number for number in 1:118)
const number_TO_ELEMENT_SYMBOL = Dict(n => s for (s, n) in ELEMENT_SYMBOL_TO_NUMBER)

element_symbol_to_number(element_symbol::AbstractString) = ELEMENT_SYMBOL_TO_number[uppercase(strip(element_symbol))]
element_symbol_to_number(element_symbol::AbstractString) = ELEMENT_SYMBOL_TO_NUMBER[uppercase(strip(element_symbol))]
number_to_element_symbol(number::Integer) = number_TO_ELEMENT_SYMBOL[number]

pad_atom_name(name::AbstractString, element_symbol::AbstractString) = rpad(" "^(2-length(strip(element_symbol)))*strip(name), 4)
Expand All @@ -19,11 +19,13 @@ struct Atom{T<:AbstractFloat}
z::T
end

Atom{T}(atom::Atom) where T = Atom(atom.name, atom.number, T(atom.x), T(atom.y), T(atom.z))
Atom{T}(atom::Atom) where T = Atom{T}(atom.name, atom.number, atom.x, atom.y, atom.z)

Atom(name::UInt32, number::Integer, x::T, y::T, z::T) where T = Atom{T}(name, number, x, y, z)
Atom(name::AbstractString, element_symbol::AbstractString, coords::AbstractVector{T}) where T =
Atom(encode_atom_name(name, element_symbol), element_symbol_to_number(element_symbol), coords...)
@inline Atom(name::UInt32, number::Integer, x::T, y::T, z::T) where T = Atom{T}(name, number, x, y, z)
@inline Atom(name::AbstractString, element_symbol::AbstractString, x::T, y::T, z::T) where T =
Atom(encode_atom_name(name, element_symbol), element_symbol_to_number(element_symbol), x, y, z)

@inline Atom(name, number, coords::AbstractVector{T}) where T = Atom(name, number, coords...)

coords(atom::Atom) = SVector(atom.x, atom.y, atom.z)

Expand Down
8 changes: 7 additions & 1 deletion src/chain.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
"""
ProteinChain{T<:Real,Ps<:NamedProperties}
"""
struct ProteinChain{T<:Real,Ps<:NamedProperties}
id::String
atoms::Vector{Vector{Atom{T}}}
Expand All @@ -20,12 +23,15 @@ end

ProteinChain(id, atoms, sequence, numbering) = ProteinChain(id, atoms, sequence, numbering, (;))

Base.:(==)(chain1::ProteinChain, chain2::ProteinChain) = propertynames(chain1) == propertynames(chain2) &&
!any(getproperty(chain1, name) != getproperty(chain2, name) for name in propertynames(chain1, false))

Base.length(chain::ProteinChain) = length(chain.atoms)

Base.getproperty(chain::ProteinChain, name::Symbol) =
name in fieldnames(ProteinChain) ? getfield(chain, name) : getfield(getfield(chain, :properties), name)[]

Base.propertynames(chain::ProteinChain, private::Bool) = union(fieldnames(ProteinChain), propertynames(chain.properties))
Base.propertynames(chain::ProteinChain, private::Bool=false) = (setdiff(fieldnames(ProteinChain), private ? () : (:properties,))..., propertynames(chain.properties)...)

function Base.getindex(chain::ProteinChain, i::AbstractVector)
properties = map(p -> p[i], chain.properties)
Expand Down
13 changes: 13 additions & 0 deletions src/dataset.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""
ProteinDataset <: AbstractDict{String,ProteinStructure}
"""
struct ProteinDataset <: AbstractDict{String,ProteinStructure}
dict::Dict{String,ProteinStructure}
end

ProteinDataset(structures::AbstractVector{<:ProteinStructure}) =
ProteinDataset(Dict(structure.name => structure for structure in structures))

Base.length(d::ProteinDataset) = length(d.dict)
Base.iterate(d::ProteinDataset, args...) = iterate(d.dict, args...)
Base.:(==)(d1::ProteinDataset, d2::ProteinDataset) = d1.dict == d2.dict
28 changes: 14 additions & 14 deletions src/ideal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,21 @@ the N, Ca, and C atom positions of a residue positioned at the origin.
struct IdealResidue{T<:AbstractFloat} <: AbstractMatrix{T}
backbone_geometry::BackboneGeometry
N_Ca_C_coords::Matrix{T}
end

function IdealResidue{T}(backbone_geometry::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY; template=nothing) where T
N_Ca_C_coords = Matrix{T}(undef, 3, 3)
N_pos, Ca_pos, C_pos = eachcol(N_Ca_C_coords)
Θ = backbone_geometry.N_Ca_C_angle - π/2
N_pos .= [0, 0, 0]
Ca_pos .= [backbone_geometry.N_Ca_length, 0, 0]
C_pos .= Ca_pos + backbone_geometry.Ca_C_length * [sin(Θ), cos(Θ), 0]
N_Ca_C_coords .-= Backboner.centroid(N_Ca_C_coords; dims=2)
if !isnothing(template)
wanted_orientation, current_offset, wanted_offset = Backboner.kabsch_algorithm(N_Ca_C_coords, template)
N_Ca_C_coords .= wanted_orientation * (N_Ca_C_coords .- current_offset) .+ wanted_offset
end
new{T}(backbone_geometry, N_Ca_C_coords)
function IdealResidue{T}(backbone_geometry::BackboneGeometry=DEFAULT_BACKBONE_GEOMETRY; template=nothing) where T
N_Ca_C_coords = Matrix{T}(undef, 3, 3)
N_pos, Ca_pos, C_pos = eachcol(N_Ca_C_coords)
Θ = backbone_geometry.N_Ca_C_angle - π/2
N_pos .= [0, 0, 0]
Ca_pos .= [backbone_geometry.N_Ca_length, 0, 0]
C_pos .= Ca_pos + backbone_geometry.Ca_C_length * [sin(Θ), cos(Θ), 0]
N_Ca_C_coords .-= Backboner.centroid(N_Ca_C_coords; dims=2)
if !isnothing(template)
wanted_orientation, current_offset, wanted_offset = Backboner.kabsch_algorithm(N_Ca_C_coords, template)
N_Ca_C_coords .= wanted_orientation * (N_Ca_C_coords .- current_offset) .+ wanted_offset
end
IdealResidue{T}(backbone_geometry, N_Ca_C_coords)
end

Base.size(::IdealResidue) = (3,3)
Expand All @@ -65,7 +65,7 @@ Thus, we can use this residue as a template for aligning other residues with ver
geometry to it.
```jldocotest
julia> IdealResidue{Float64}(BackboneGeometry(N_Ca_C_angle = 1.93); template=ProteinChains.STANDARD_RESIDUE)
julia> IdealResidue{Float64}(BackboneGeometry(N_Ca_C_angle = 1.93); template=ProteinChains.STANDARD_RESIDUE_TEMPLATE)
3×3 IdealResidue{Float64}:
-1.06447 -0.199174 1.26364
0.646303 -0.529648 -0.116655
Expand Down
4 changes: 3 additions & 1 deletion src/io/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ const AMINOACIDS = Set("ACDEFGHIKLMNPQRSTVWY")
const BACKBONE_ATOM_NAMES = ["N", "CA", "C"]
const BACKBONE_ATOM_SYMBOLS = ["N", "C", "C"]

const aa_to_threeletter = Dict{Char, String}([Char(v) => k for (k, v) in BioStructures.threeletter_to_aa])
const aa_to_threeletter = Dict{Char,String}([Char(v) => k for (k, v) in BioStructures.threeletter_to_aa])
threeletter_resname(aa::Char) = get(aa_to_threeletter, aa, "XXX")
oneletter_resname(threeletter::AbstractString) = Char(get(BioStructures.threeletter_to_aa, threeletter, 'X'))
oneletter_resname(residue::BioStructures.AbstractResidue) = oneletter_resname(BioStructures.resname(residue))
Expand All @@ -18,3 +18,5 @@ include("renumber.jl")
include("read.jl")
include("write.jl")
include("download.jl")

include("serialization.jl")
11 changes: 7 additions & 4 deletions src/h5.jl → src/io/serialization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ function Base.write(parent::Union{HDF5.File,HDF5.Group}, chain::ProteinChain{T},
HDF5.attributes(chain_group)["T"] = string(T)

chain_group["id"] = chain.id
chain_group["atom_chunk_sizes"] = map(UInt8length, chain.atoms)
chain_group["atom_chunk_sizes"] = map(UInt32length, chain.atoms)
chain_group["atoms_flattened"] = reduce(vcat, chain.atoms)
chain_group["sequence"] = chain.sequence
chain_group["numbering"] = map(UInt32, chain.numbering)
Expand Down Expand Up @@ -77,19 +77,22 @@ function Base.read(group::Union{Union{HDF5.File,HDF5.Group}}, ::Type{ProteinStru
return ProteinStructure(name, atoms, chains, numbering)
end

function serialize(path::AbstractString, structures::AbstractVector{<:ProteinStructure}; mode="w")
function serialize(path::AbstractString, dataset::ProteinDataset; mode="w")
HDF5.h5open(path, mode) do f
foreach(structure -> write(f, structure), structures)
foreach(((key, structure),) -> write(f, structure, key), dataset)
end
return path
end

serialize(path::AbstractString, structures::AbstractVector{<:ProteinStructure}; kwargs...) =
serialize(path, ProteinDataset(structures), kwargs...)

function deserialize(path::AbstractString)
structures = ProteinStructure[]
HDF5.h5open(path, "r") do f
for key in keys(f)
push!(structures, read(f[key], ProteinStructure))
end
end
return structures
return ProteinDataset(structures)
end
3 changes: 3 additions & 0 deletions src/structure.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
"""
ProteinStructure{T} <: AbstractVector{ProteinChain{T}}
"""
struct ProteinStructure{T} <: AbstractVector{ProteinChain{T}}
name::String
atoms::Vector{Atom{T}}
Expand Down
12 changes: 12 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,16 @@ using Test

end

@testset "serialization" begin
mktempdir() do dir
filename = "dataset.h5"
path = joinpath(dir, filename)
dataset1 = ProteinDataset([annotate(pdb"1EYE", :backbone)])
ProteinChains.serialize(path, dataset1)
dataset2 = ProteinChains.deserialize(path)
@test dataset1 == dataset2
@test hasproperty(first(values(dataset1))["A"], :backbone)
end
end

end

0 comments on commit bc36e46

Please sign in to comment.