From 4fa5ea8935fd5449ca7de861d49e02cb642a9c95 Mon Sep 17 00:00:00 2001
From: Oliver Beckstein <orbeckst@gmail.com>
Date: Mon, 15 Jun 2015 22:27:13 -0700
Subject: [PATCH] HydrogenBondAnalysis: small improvements

- generate_table(): do not build recarray but just dtype array and removed superfluous code
- added tests for generate_table()
- import itertools at module level instead multiple times in methods
---
 .../analysis/hbonds/hbond_analysis.py         | 32 +++++++------------
 testsuite/MDAnalysisTests/test_analysis.py    | 28 ++++++++++++----
 2 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/package/MDAnalysis/analysis/hbonds/hbond_analysis.py b/package/MDAnalysis/analysis/hbonds/hbond_analysis.py
index ad170302814..f8df878e6b0 100644
--- a/package/MDAnalysis/analysis/hbonds/hbond_analysis.py
+++ b/package/MDAnalysis/analysis/hbonds/hbond_analysis.py
@@ -311,6 +311,7 @@ class HydrogenBondAnalysis_OtherFF(HydrogenBondAnalysis):
 """
 
 from collections import defaultdict
+import itertools
 import numpy
 
 from MDAnalysis import MissingDataWarning, NoDataError, SelectionError, SelectionWarning
@@ -942,8 +943,6 @@ def generate_table(self):
 
         .. _recsql: http://pypi.python.org/pypi/RecSQL
         """
-        from itertools import izip
-
         if self.timeseries is None:
             msg = "No timeseries computed, do run() first."
             warnings.warn(msg, category=MissingDataWarning)
@@ -951,31 +950,23 @@ def generate_table(self):
             return
 
         num_records = numpy.sum([len(hframe) for hframe in self.timeseries])
+        # build empty output table
         dtype = [
             ("time", float), ("donor_idx", int), ("acceptor_idx", int),
             ("donor_resnm", "|S4"), ("donor_resid", int), ("donor_atom", "|S4"),
             ("acceptor_resnm", "|S4"), ("acceptor_resid", int), ("acceptor_atom", "|S4"),
             ("distance", float), ("angle", float)]
-        self.table = numpy.recarray((num_records,), dtype=dtype)
-
         # according to Lukas' notes below, using a recarray at this stage is ineffective
-        # and speedups of ~x10 could be achieved by filling a standard array
-        # (perhaps at the cost of less clarity... but that might just be my code ;-) -- orbeckst)
+        # and speedups of ~x10 can be achieved by filling a standard array, like this:
+        out = numpy.empty((num_records,), dtype=dtype)
         cursor = 0  # current row
-        for t, hframe in izip(self.timesteps, self.timeseries):
-            if len(hframe) == 0:
-                continue  # not really necessary, should also work without
-            self.table[cursor:cursor + len(hframe)].time = t
+        for t, hframe in itertools.izip(self.timesteps, self.timeseries):
             for donor_idx, acceptor_idx, donor, acceptor, distance, angle in hframe:
-                r = self.table[cursor]
-                r.donor_idx = donor_idx
-                r.donor_resnm, r.donor_resid, r.donor_atom = parse_residue(donor)
-                r.acceptor_idx = acceptor_idx
-                r.acceptor_resnm, r.acceptor_resid, r.acceptor_atom = parse_residue(acceptor)
-                r.distance = distance
-                r.angle = angle
+                out[cursor] = (t, donor_idx, acceptor_idx) + parse_residue(donor) + \
+                    parse_residue(acceptor) + (distance, angle)
                 cursor += 1
         assert cursor == num_records, "Internal Error: Not all HB records stored"
+        self.table = out.view(numpy.recarray)
         logger.debug("HBond: Stored results as table with %(num_records)d entries.", vars())
 
     def save_table(self, filename="hbond_table.pickle"):
@@ -999,7 +990,6 @@ def count_by_time(self):
 
         :Returns: a class:`numpy.recarray`
         """
-        from itertools import izip, imap
 
         if self.timeseries is None:
             msg = "No timeseries computed, do run() first."
@@ -1008,7 +998,8 @@ def count_by_time(self):
             return
 
         out = numpy.empty((len(self.timesteps),), dtype=[('time', float), ('count', int)])
-        for cursor, time_count in enumerate(izip(self.timesteps, imap(len, self.timeseries))):
+        for cursor, time_count in enumerate(itertools.izip(self.timesteps,
+                                                           itertools.imap(len, self.timeseries))):
             out[cursor] = time_count
         return out.view(numpy.recarray)
 
@@ -1079,7 +1070,6 @@ def timesteps_by_type(self):
 
         :Returns: a class:`numpy.recarray`
         """
-        from itertools import izip
 
         if self.timeseries is None:
             msg = "No timeseries computed, do run() first."
@@ -1088,7 +1078,7 @@ def timesteps_by_type(self):
             return
 
         hbonds = defaultdict(list)
-        for (t, hframe) in izip(self.timesteps, self.timeseries):
+        for (t, hframe) in itertools.izip(self.timesteps, self.timeseries):
             for donor_idx, acceptor_idx, donor, acceptor, distance, angle in hframe:
                 donor_resnm, donor_resid, donor_atom = parse_residue(donor)
                 acceptor_resnm, acceptor_resid, acceptor_atom = parse_residue(acceptor)
diff --git a/testsuite/MDAnalysisTests/test_analysis.py b/testsuite/MDAnalysisTests/test_analysis.py
index aa69e5d9003..7d913c05f38 100644
--- a/testsuite/MDAnalysisTests/test_analysis.py
+++ b/testsuite/MDAnalysisTests/test_analysis.py
@@ -208,7 +208,11 @@ def setUp(self):
             'angle': 150.0,
         }
         # ideal helix with 1 proline:
-        self.num_bb_hbonds = u.atoms.numberOfResidues() - u.SYSTEM.PRO.numberOfResidues() - 4
+        self.values = {
+            'num_bb_hbonds':  u.atoms.numberOfResidues() - u.SYSTEM.PRO.numberOfResidues() - 4,
+            'donor_resid': numpy.array([5,  6,  8,  9, 10, 11, 12, 13]),
+            'acceptor_resnm': numpy.array(['ALA', 'ALA', 'ALA', 'ALA', 'ALA', 'PRO', 'ALA', 'ALA']),
+            }
 
     def _run(self, **kwargs):
         kw = self.kwargs.copy()
@@ -219,26 +223,36 @@ def _run(self, **kwargs):
 
     def test_helix_backbone(self):
         h = self._run()
-        assert_equal(len(h.timeseries[0]), self.num_bb_hbonds, "wrong number of backbone hydrogen bonds")
+        assert_equal(len(h.timeseries[0]),
+                     self.values['num_bb_hbonds'], "wrong number of backbone hydrogen bonds")
         assert_equal(h.timesteps, [0.0])
 
+    def test_generate_table(self):
+        h = self._run()
+        h.generate_table()
+        assert_equal(len(h.table),
+                     self.values['num_bb_hbonds'], "wrong number of backbone hydrogen bonds in table")
+        assert_(isinstance(h.table, numpy.core.records.recarray))
+        assert_array_equal(h.table.donor_resid, self.values['donor_resid'])
+        assert_array_equal(h.table.acceptor_resnm, self.values['acceptor_resnm'])
+
     # TODO: Expand tests because the following ones are a bit superficial
     #       because we should really run them on a trajectory
 
     def test_count_by_time(self):
         h = self._run()
         c = h.count_by_time()
-        assert_equal(c.tolist(), [(0.0, self.num_bb_hbonds)])
+        assert_equal(c.tolist(), [(0.0, self.values['num_bb_hbonds'])])
 
     def test_count_by_type(self):
         h = self._run()
         c = h.count_by_type()
-        assert_equal(c.frequency, self.num_bb_hbonds * [1.0])
+        assert_equal(c.frequency, self.values['num_bb_hbonds'] * [1.0])
 
     def test_count_by_type(self):
         h = self._run()
         t = h.timesteps_by_type()
-        assert_equal(t.time, self.num_bb_hbonds * [0.0])
+        assert_equal(t.time, self.values['num_bb_hbonds'] * [0.0])
 
     def tearDown(self):
         del self.universe
@@ -261,7 +275,9 @@ class TestHydrogenBondAnalysisHeavyFail(TestHydrogenBondAnalysisHeavy):
     def setUp(self):
         super(TestHydrogenBondAnalysisHeavyFail, self).setUp()
         self.kwargs["distance"] = 3.0
-        self.num_bb_hbonds = 0  # no H-bonds with a D-A distance < 3.0 A (they start at 3.05 A)
+        self.values['num_bb_hbonds'] = 0  # no H-bonds with a D-A distance < 3.0 A (they start at 3.05 A)
+        self.values['donor_resid'] = numpy.array([])
+        self.values['acceptor_resnm'] = numpy.array([], dtype="|S3")
 
 class TestHydrogenBondAnalysisChecking(object):
     def _setUp(self):