Skip to content

Commit

Permalink
Merge pull request #11 from padix-key/res-names
Browse files Browse the repository at this point in the history
Update to 5-character residue names.
  • Loading branch information
padix-key authored Jan 12, 2024
2 parents d46f055 + 8dcfe58 commit c8f187f
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 26 deletions.
27 changes: 21 additions & 6 deletions python-src/fastpdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=None, include_b
# Interpret uint32 arrays as unicode arrays
chain_id = np.frombuffer(chain_id, dtype="U4")
ins_code = np.frombuffer(ins_code, dtype="U1")
res_name = np.frombuffer(res_name, dtype="U3")
res_name = np.frombuffer(res_name, dtype="U5")
atom_name = np.frombuffer(atom_name, dtype="U6")
element = np.frombuffer(element, dtype="U2")
altloc_id = np.frombuffer(altloc_id, dtype="U1")
Expand Down Expand Up @@ -254,11 +254,11 @@ def set_structure(self, atoms):

# Write 'ATOM' and 'MODEL' records
# Convert Unicode arrays into uint32 arrays for usage in Rust
chain_id = np.frombuffer(atoms.chain_id, dtype=np.uint32).reshape(-1, 4)
ins_code = np.frombuffer(atoms.ins_code, dtype=np.uint32).reshape(-1, 1)
res_name = np.frombuffer(atoms.res_name, dtype=np.uint32).reshape(-1, 3)
atom_name = np.frombuffer(atoms.atom_name, dtype=np.uint32).reshape(-1, 6)
element = np.frombuffer(atoms.element, dtype=np.uint32).reshape(-1, 2)
chain_id = _convert_unicode_to_uint32(atoms.chain_id)
ins_code = _convert_unicode_to_uint32(atoms.ins_code)
res_name = _convert_unicode_to_uint32(atoms.res_name)
atom_name = _convert_unicode_to_uint32(atoms.atom_name)
element = _convert_unicode_to_uint32(atoms.element)

categories = atoms.get_annotation_categories()
atom_id = atoms.atom_id if "atom_id" in categories else None
Expand Down Expand Up @@ -320,3 +320,18 @@ def _index_models_and_atoms(self):
self._pdb_file.index_models_and_atoms()
self._model_start_i = self._pdb_file.model_start_i
self._atom_line_i = self._pdb_file.atom_line_i


def _convert_unicode_to_uint32(array):
"""
Convert a unicode string array into a 2D uint32 array.
The second dimension corresponds to the character position within a
string.
"""
dtype = array.dtype
if not np.issubdtype(dtype, np.str_):
raise TypeError("Expected unicode string array")
length = array.shape[0]
n_char = dtype.itemsize // 4
return np.frombuffer(array, dtype=np.uint32).reshape(length, n_char)
40 changes: 20 additions & 20 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ struct PDBFile {

#[pymethods]
impl PDBFile {

/// Create an new [`PDBFile`].
/// The lines of text are given to `lines`.
/// An empty `Vec` represents and empty PDB file.
Expand Down Expand Up @@ -99,7 +99,7 @@ impl PDBFile {
})
}


/// Get the number of models contained in the file.
fn get_model_count(&self) -> usize {
self.model_start_i.len()
Expand Down Expand Up @@ -180,7 +180,7 @@ impl PDBFile {
/// Unicode *NumPy* arrays are represented by 2D *NumPy* arrays with
/// `uint32` *dtype*.
/// The returned tuple contains the following annotations in the given order:
///
///
/// - `chain_id`
/// - `res_id`
/// - `ins_code`
Expand Down Expand Up @@ -211,16 +211,16 @@ impl PDBFile {
Option<Py<PyArray<f64, Ix1>>>,
Option<Py<PyArray<i64, Ix1>>>)> {
let atom_line_i: Vec<usize> = self.get_atom_indices(model)?;

let mut chain_id: Array<u32, Ix2> = Array::zeros((atom_line_i.len(), 4));
let mut res_id: Array<i64, Ix1> = Array::zeros(atom_line_i.len());
let mut ins_code: Array<u32, Ix2> = Array::zeros((atom_line_i.len(), 1));
let mut res_name: Array<u32, Ix2> = Array::zeros((atom_line_i.len(), 3));
let mut res_name: Array<u32, Ix2> = Array::zeros((atom_line_i.len(), 5));
let mut hetero: Array<bool, Ix1> = Array::default(atom_line_i.len());
let mut atom_name: Array<u32, Ix2> = Array::zeros((atom_line_i.len(), 6));
let mut element: Array<u32, Ix2> = Array::zeros((atom_line_i.len(), 2));
let mut altloc_id: Array<u32, Ix2> = Array::zeros((atom_line_i.len(), 1));

let mut atom_id: Array<i64, Ix1>;
if include_atom_id {
atom_id = Array::zeros(atom_line_i.len());
Expand Down Expand Up @@ -250,7 +250,7 @@ impl PDBFile {
else {
charge = Array::zeros(0);
}

// Iterate over ATOM and HETATM records to write annotation arrays
for (atom_i, line_i) in atom_line_i.iter().enumerate() {
let line = &self.lines[*line_i];
Expand Down Expand Up @@ -293,7 +293,7 @@ impl PDBFile {
}
}
}

Python::with_gil(|py| {
Ok((
PyArray::from_array(py, &chain_id ).to_owned(),
Expand Down Expand Up @@ -326,7 +326,7 @@ impl PDBFile {
atom_id_to_index.insert(*id, i as u32);
}
});

// Cannot preemptively determine number of bonds
// -> Memory allocation for all bonds is not possible
// -> No benefit in finding 'CONECT' record lines prior to iteration
Expand Down Expand Up @@ -409,11 +409,11 @@ impl PDBFile {
// This procedure aims to increase the performance is repetitive formatting is omitted
let mut prefix: Vec<String> = Vec::new();
let mut suffix: Vec<String> = Vec::new();

for i in 0..coord.shape()[1] {
let element_i = parse_string_from_array(&element, i)?;
let atom_name_i = parse_string_from_array(&atom_name, i)?;

prefix.push(format!(
"{:6}{:>5} {:4} {:>3} {:1}{:>4}{:1} ",
if hetero[i] { "HETATM" } else { "ATOM" },
Expand Down Expand Up @@ -449,7 +449,7 @@ impl PDBFile {
if is_multi_model {
self.lines.push(format!("MODEL {:>8}", model_i+1));
}
for atom_i in 0..coord.shape()[1] {
for atom_i in 0..coord.shape()[1] {
let coord_string = format!(
"{:>8.3}{:>8.3}{:>8.3}",
coord[[model_i, atom_i, 0]],
Expand All @@ -471,13 +471,13 @@ impl PDBFile {
/// array containing indices pointing to bonded atoms in the `AtomArray`.
/// The `atom_id` annotation array is required to map the atom IDs in `CONECT` records
/// to atom indices.
fn write_bonds(&mut self,
fn write_bonds(&mut self,
bonds: Py<PyArray<i32, Ix2>>,
atom_id: Py<PyArray<i64, Ix1>>) -> PyResult<()> {
Python::with_gil(|py| {
let bonds = bonds.as_ref(py).to_owned_array();
let atom_id = atom_id.as_ref(py).to_owned_array();

for (center_i, bonded_indices) in bonds.outer_iter().enumerate() {
let mut n_added: usize = 0;
let mut line: String = String::new();
Expand Down Expand Up @@ -552,7 +552,7 @@ impl PDBFile {
}
Ok(CoordArray::Single(coord))
},

None => {
let length = self.get_model_length()?;
let mut coord = Array::zeros((self.atom_line_i.len(), 3));
Expand Down Expand Up @@ -591,7 +591,7 @@ impl PDBFile {
self.model_start_i.len(), model
)));
}

// Get the start and stop line index for this model index
let (model_start, model_stop) = match model_i.cmp(&(self.model_start_i.len() as isize - 1)){
Ordering::Less => (
Expand All @@ -600,21 +600,21 @@ impl PDBFile {
),
// Last model -> Model reaches to end of file
Ordering::Equal => (
self.model_start_i[model_i as usize],
self.model_start_i[model_i as usize],
self.lines.len()
),
// This case was excluded above
_ => panic!("This branch should not be reached")
};

// Get the atom records within these line boundaries
Ok(
self.atom_line_i.iter().copied()
.filter(|i| *i >= model_start && *i < model_stop)
.collect()
)
}


/// Get the number of atoms in each model of the PDB file.
/// A `PyErr` is returned if the number of atoms per model differ from each other.
Expand All @@ -635,7 +635,7 @@ impl PDBFile {
)); }
};
}

match length {
None => panic!("Length cannot be 'None'"),
Some(l) => Ok(l)
Expand Down

0 comments on commit c8f187f

Please sign in to comment.