Skip to content

Commit

Permalink
Merge pull request #9 from padix-key/main
Browse files Browse the repository at this point in the history
Ensure compatibility with Biotite 0.37
  • Loading branch information
padix-key authored May 30, 2023
2 parents 367c945 + 46127bf commit c704735
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 110 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fastpdb"
version = "1.1.0"
version = "1.2.0"
edition = "2018"

[dependencies]
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "fastpdb"
version = "1.1.0"
version = "1.2.0"
description = "A high performance drop-in replacement for Biotite's PDBFile."
readme = "README.rst"
requires-python = ">=3.7"
Expand All @@ -25,7 +25,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
dependencies = [
"biotite >= 0.29"
"biotite >= 0.37"
]

[project.urls]
Expand Down
21 changes: 11 additions & 10 deletions python-src/fastpdb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
__name__ = "fastpdb"
__author__ = "Patrick Kunzmann"
__all__ = ["PDBFile"]
__version__ = "1.1.0"
__version__ = "1.2.0"

import os
import numpy as np
Expand Down Expand Up @@ -33,6 +33,7 @@ def read(file):

# Synchronize with PDB file representation in Rust
pdb_file.lines = pdb_file._pdb_file.lines
pdb_file._index_models_and_atoms()
return pdb_file

def get_model_count(self):
Expand Down Expand Up @@ -203,14 +204,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=None, include_b
bond_list = struc.BondList(
atoms.array_length(), self._pdb_file.parse_bonds(atom_id)
)
bond_list = bond_list.merge(struc.connect_via_residue_names(
atoms,
# The information for non-hetero residues and water
# are not part of CONECT records
(~atoms.hetero) | struc.filter_solvent(atoms)
))
# Remove bond order from inter residue bonds for consistency
bond_list.remove_bond_order()
bond_list = bond_list.merge(struc.connect_via_residue_names(atoms))
atoms.bonds = bond_list


Expand Down Expand Up @@ -318,4 +312,11 @@ def set_structure(self, atoms):
)

# Synchronize with PDB file representation in Rust
self.lines = self._pdb_file.lines
self.lines = self._pdb_file.lines
self._index_models_and_atoms()


def _index_models_and_atoms(self):
self._pdb_file.index_models_and_atoms()
self._model_start_i = self._pdb_file.model_start_i
self._atom_line_i = self._pdb_file.atom_line_i
227 changes: 130 additions & 97 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ mod biotite {
#[derive(Debug)]
enum CoordArray {
Single(Array<f32, Ix2>),
Multi(Array<f32, Ix3>),
Multi(Array<f32, Ix3>)
}


Expand All @@ -35,7 +35,9 @@ enum CoordArray {
struct PDBFile {
/// Lines of text from the PDB file.
#[pyo3(get)]
lines: Vec<String>
lines: Vec<String>,
model_start_i: Vec<usize>,
atom_line_i: Vec<usize>
}


Expand All @@ -48,7 +50,13 @@ impl PDBFile {
#[new]
fn new(lines: Vec<String>) -> Self {
//let ljust_lines = lines.iter().map(|line| format!("{:<80}", line)).collect();
PDBFile { lines: lines }
let mut pdb_file = PDBFile {
lines: lines,
model_start_i: Vec::new(),
atom_line_i: Vec::new()
};
pdb_file.index_models_and_atoms();
pdb_file
}


Expand All @@ -59,14 +67,42 @@ impl PDBFile {
let contents = fs::read_to_string(file_path).map_err(
|_| exceptions::PyOSError::new_err(format!("'{}' cannot be read", file_path))
)?;
let lines = contents.lines().map(|line| format!("{:<80}", line)).collect();
Ok(PDBFile { lines: lines })
let lines = contents.lines().map(
|line| format!("{:<80}", line)
).collect();
let mut pdb_file = PDBFile {
lines: lines,
model_start_i: Vec::new(),
atom_line_i: Vec::new()
};
pdb_file.index_models_and_atoms();
Ok(pdb_file)
}


#[getter]
fn model_start_i(&self) -> PyResult<Py<PyArray<i64, Ix1>>> {
Python::with_gil(|py| {
Ok(PyArray::from_iter(
py, self.model_start_i.iter().map(|x| *x as i64)
).to_owned())
})
}


#[getter]
fn atom_line_i(&self) -> PyResult<Py<PyArray<i64, Ix1>>> {
Python::with_gil(|py| {
Ok(PyArray::from_iter(
py, self.atom_line_i.iter().map(|x| *x as i64)
).to_owned())
})
}


/// Get the number of models contained in the file.
fn get_model_count(&self) -> usize {
self.get_model_start_indices().len()
self.model_start_i.len()
}


Expand Down Expand Up @@ -174,10 +210,7 @@ impl PDBFile {
Option<Py<PyArray<f64, Ix1>>>,
Option<Py<PyArray<f64, Ix1>>>,
Option<Py<PyArray<i64, Ix1>>>)> {
let model_start_i = self.get_model_start_indices();
let (model_start, model_stop) = self.get_model_boundaries(model, &model_start_i)?;

let atom_line_i: Vec<usize> = self.get_atom_indices(model_start, model_stop);
let atom_line_i: Vec<usize> = self.get_atom_indices(model)?;

let mut chain_id: Array<u32, Ix2> = Array::zeros((atom_line_i.len(), 4));
let mut res_id: Array<i64, Ix1> = Array::zeros(atom_line_i.len());
Expand Down Expand Up @@ -475,6 +508,25 @@ impl PDBFile {
Ok(())
})
}

/// Index lines in the file that correspond to starts of new models and to
/// `ATOM` or `HETATM` records.
/// Must be called after the content of the file has been changed.
fn index_models_and_atoms(&mut self) {
self.atom_line_i = self.lines.iter().enumerate()
.filter(|(_i, line)| line.starts_with("ATOM") || line.starts_with("HETATM"))
.map(|(i, _line)| i)
.collect();
self.model_start_i= self.lines.iter().enumerate()
.filter(|(_i, line)| line.starts_with("MODEL"))
.map(|(i, _line)| i)
.collect();
// It could be an empty file or a file with a single model,
// where the 'MODEL' line is missing
if self.model_start_i.is_empty() && !self.atom_line_i.is_empty(){
self.model_start_i = vec![0]
}
}
}


Expand All @@ -485,123 +537,95 @@ impl PDBFile {
/// The number of returned dimensions is based on whether a `model` is
/// given.
fn parse_coord(&self, model: Option<isize>) -> PyResult<CoordArray> {
let model_start_i = self.get_model_start_indices();

let (model_start, model_stop) = match model {
Some(model_number) => self.get_model_boundaries(model_number, &model_start_i)?,
None => (0, self.lines.len())
};

let atom_line_i: Vec<usize> = self.get_atom_indices(model_start, model_stop);

let mut coord = Array::zeros((atom_line_i.len(), 3));
for (atom_i, line_i) in atom_line_i.iter().enumerate() {
let line = &self.lines[*line_i];
if line.len() < 80 {
return Err(biotite::InvalidFileError::new_err("Line is too short"))
}
coord[[atom_i, 0]] = line[30..38].trim().parse().map_err(|_|
biotite::InvalidFileError::new_err(format!(
"'{}' cannot be parsed into a float", line[30..38].trim()
))
)?;
coord[[atom_i, 1]] = line[38..46].trim().parse().map_err(|_|
biotite::InvalidFileError::new_err(format!(
"'{}' cannot be parsed into a float", line[38..46].trim()
))
)?;
coord[[atom_i, 2]] = line[46..54].trim().parse().map_err(|_|
biotite::InvalidFileError::new_err(format!(
"'{}' cannot be parsed into a float", line[46..54].trim()
))
)?;
}

match model {
Some(_) => Ok(CoordArray::Single(coord)),
Some(model_number) => {
let atom_line_i = self.get_atom_indices(model_number)?;
let mut coord = Array::zeros((atom_line_i.len(), 3));
for (atom_i, line_i) in atom_line_i.iter().enumerate() {
let line = &self.lines[*line_i];
if line.len() < 80 {
return Err(biotite::InvalidFileError::new_err("Line is too short"))
}
coord[[atom_i, 0]] = parse_float_from_string(line, 30, 38)?;
coord[[atom_i, 1]] = parse_float_from_string(line, 38, 46)?;
coord[[atom_i, 2]] = parse_float_from_string(line, 46, 54)?;
}
Ok(CoordArray::Single(coord))
},

None => {
// Check whether all models have the same length
let length = self.get_model_length(&model_start_i, &atom_line_i)?;
let length = self.get_model_length()?;
let mut coord = Array::zeros((self.atom_line_i.len(), 3));
for (atom_i, line_i) in self.atom_line_i.iter().enumerate() {
let line = &self.lines[*line_i];
if line.len() < 80 {
return Err(biotite::InvalidFileError::new_err("Line is too short"))
}
coord[[atom_i, 0]] = parse_float_from_string(line, 30, 38)?;
coord[[atom_i, 1]] = parse_float_from_string(line, 38, 46)?;
coord[[atom_i, 2]] = parse_float_from_string(line, 46, 54)?;
}
Ok(
CoordArray::Multi(coord.into_shape((model_start_i.len(), length, 3))
CoordArray::Multi(coord.into_shape((self.model_start_i.len(), length, 3))
.expect("Model length is invalid"))
)
}
}
}


/// Get indices to lines of text containing `ATOM` and `HETATM` records.
/// The indices are limited to the range of a model given by the `model_start`
/// and the exclusive `model_stop`.
fn get_atom_indices(&self, model_start: usize, model_stop: usize) -> Vec<usize> {
self.lines[model_start..model_stop].iter().enumerate()
.filter(|(_i, line)| line.starts_with("ATOM") || line.starts_with("HETATM"))
.map(|(i, _line)| i + model_start)
.collect()
}


/// Get indices to lines of text containing `MODEL` records.
/// If no `MODEL` records is found, which occurs frequently in PDB files containing only one
/// model, a single start at the beginning of the file is returned.
fn get_model_start_indices(&self) -> Vec<usize> {
let mut model_start_i: Vec<usize> = self.lines.iter().enumerate()
.filter(|(_i, line)| line.starts_with("MODEL"))
.map(|(i, _line)| i)
.collect();
// Structures containing only one model may omit MODEL record
// In these cases model starting index is set to 0
if model_start_i.is_empty() {
model_start_i = vec![0]
}
model_start_i
}


/// Find the model start and stop index for the given `model` number (1-based).
/// `model_start_i` is the return value of [`get_model_start_indices()`].
fn get_model_boundaries(&self,
model: isize,
model_start_i: &[usize]) -> PyResult<(usize, usize)> {
/// Get indices to `ATOM` and `HETATM` records within the given model number.
fn get_atom_indices(&self, model: isize) -> PyResult<Vec<usize>> {
// Find the model index corresponding to the given model number
let model_i: isize;
match model.cmp(&0) {
Ordering::Greater => model_i = model - 1,
Ordering::Less => model_i = model_start_i.len() as isize + model,
Ordering::Less => model_i = self.model_start_i.len() as isize + model,
Ordering::Equal => return Err(exceptions::PyValueError::new_err(
"Model index must not be 0"
)),
};

if model_i >= model_start_i.len() as isize || model_i < 0 {
if model_i >= self.model_start_i.len() as isize || model_i < 0 {
return Err(exceptions::PyValueError::new_err(format!(
"The file has {} models, the given model {} does not exist",
model_start_i.len(), model
self.model_start_i.len(), model
)));
}

if model_i == model_start_i.len() as isize - 1 {

// Get the start and stop line index for this model index
let (model_start, model_stop) = match model_i.cmp(&(self.model_start_i.len() as isize - 1)){
Ordering::Less => (
self.model_start_i[model_i as usize],
self.model_start_i[(model_i+1) as usize]
),
// Last model -> Model reaches to end of file
Ok((model_start_i[model_i as usize], self.lines.len()))
}
else {
Ok((model_start_i[model_i as usize], model_start_i[(model_i+1) as usize]))
}
Ordering::Equal => (
self.model_start_i[model_i as usize],
self.lines.len()
),
// This case was excluded above
_ => panic!("This branch should not be reached")
};

// Get the atom records within these line boundaries
Ok(
self.atom_line_i.iter().copied()
.filter(|i| *i >= model_start && *i < model_stop)
.collect()
)
}


/// Get the number of atoms in each model of the PDB file.
/// A `PyErr` is returned if the number of atoms per model differ from each other.
fn get_model_length(&self,
model_start_i: &[usize],
atom_line_i: &[usize]) -> PyResult<usize> {
let n_models = model_start_i.len();
fn get_model_length(&self) -> PyResult<usize> {
let n_models = self.model_start_i.len();
let mut length: Option<usize> = None;
for model_i in 0..n_models {
let model_start: usize = model_start_i[model_i];
let model_stop: usize = if model_i+1 < n_models { model_start_i[model_i+1] }
let model_start: usize = self.model_start_i[model_i];
let model_stop: usize = if model_i+1 < n_models { self.model_start_i[model_i+1] }
else { self.lines.len() };
let model_length = atom_line_i.iter()
let model_length = self.atom_line_i.iter()
.filter(|&line_i| *line_i >= model_start && *line_i < model_stop)
.count();
match length {
Expand Down Expand Up @@ -672,6 +696,15 @@ fn parse_digit(digit: &u8) -> PyResult<i64> {
)
}

//
fn parse_float_from_string(line: &str, start: usize, stop: usize) -> PyResult<f32>{
line[start..stop].trim().parse().map_err(|_|
biotite::InvalidFileError::new_err(format!(
"'{}' cannot be parsed into a float", line[start..stop].trim()
))
)
}


/// If a given `id` exceeds `max_id` the returned ID restarts counting at 1.
/// Otherwise, `id` is returned.
Expand Down

0 comments on commit c704735

Please sign in to comment.