MAINT: Refactor Network.nsi_betweenness()
Changes:
- Cythonize outer loop over nodes
- remove repeated array allocations
- enable parallel outer loop using `multiprocess` package
  (naive, no shared memory)
- assert precondition on array sizes related to node degrees

Context:
- related: #142
- original Weave -> Cython port: 20ad40d
ntfrgl committed Oct 19, 2023
1 parent ed8b158 commit 9cbc0bc
Showing 5 changed files with 131 additions and 131 deletions.
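
As a quick orientation, here is a minimal usage sketch of the refactored method (illustration only, not part of the commit; it assumes pyunicorn's bundled toy graph `Network.SmallTestNetwork()`, which this commit's doctests also use):

    # Sketch: exercising the refactored nsi_betweenness(); not part of this commit.
    import numpy as np
    from pyunicorn import Network

    net = Network.SmallTestNetwork()

    # Serial path: a single Cython call iterates over all target nodes.
    betw_serial = net.nsi_betweenness()

    # Parallel path: targets are split into one batch per CPU core and
    # processed by separate `multiprocess` workers (naive, no shared memory).
    betw_parallel = net.nsi_betweenness(parallelize=True)

    # Both code paths should agree.
    assert np.allclose(betw_serial, betw_parallel)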
2 changes: 1 addition & 1 deletion .travis.yml
@@ -38,7 +38,7 @@ install:
   - eval "$(conda shell.bash hook)"
   - conda activate test-env
   - travis_retry conda install -c conda-forge python=$TRAVIS_PYTHON_VERSION
-  - travis_retry conda install -c conda-forge numpy scipy python-igraph h5netcdf tqdm
+  - travis_retry conda install -c conda-forge numpy scipy python-igraph h5netcdf multiprocess tqdm
   - travis_retry conda update -c conda-forge --all
 
   # testing dependencies
1 change: 1 addition & 0 deletions setup.cfg
@@ -47,6 +47,7 @@ install_requires =
     scipy >= 1.10
     igraph >= 0.10
     h5netcdf >= 1.1
+    multiprocess >= 0.70
     tqdm >= 4.66
 python_requires = >=3.8
 packages = find:
158 changes: 87 additions & 71 deletions src/pyunicorn/core/_ext/numerics.pyx
@@ -411,85 +411,101 @@ def _local_cliquishness_5thorder(
 
 def _nsi_betweenness(
         int N, ndarray[DWEIGHT_t, ndim=1] w,
-        ndarray[DEGREE_t, ndim=1] k, int j,
-        ndarray[DFIELD_t, ndim=1] betweenness_to_j,
-        ndarray[DFIELD_t, ndim=1] excess_to_j,
-        ndarray[NODE_t, ndim=1] offsets,
+        ndarray[DEGREE_t, ndim=1] k,
+        ndarray[NODE_t, ndim=1] targets,
         ndarray[NODE_t, ndim=1] flat_neighbors,
-        ndarray[MASK_t, ndim=1] is_source,
-        ndarray[NODE_t, ndim=1] flat_predecessors):
+        ndarray[MASK_t, ndim=1] is_source):
     """
     Performs Newman's algorithm. [Newman2001]_
     """
 
     cdef:
-        int qi, oi, queue_len, l_index, ql
+        long int E = len(flat_neighbors)
+        int j, qi, oi, queue_len, l_index, ql
         NODE_t l, i, next_d, dl, ol, fi
         DFIELD_t base_factor
-        ndarray[NODE_t, ndim=1] distances_to_j =\
-            2 * N * np.ones(N, dtype=NODE)
+        ndarray[NODE_t, ndim=1] offsets = np.zeros(N, dtype=NODE)
+        ndarray[NODE_t, ndim=1] distances_to_j = np.ones(N, dtype=NODE)
         ndarray[NODE_t, ndim=1] n_predecessors = np.zeros(N, dtype=NODE)
+        ndarray[NODE_t, ndim=1] flat_predecessors = np.zeros(E, dtype=NODE)
         ndarray[NODE_t, ndim=1] queue = np.zeros(N, dtype=NODE)
         ndarray[DFIELD_t, ndim=1] multiplicity_to_j = np.zeros(N, dtype=DFIELD)
+        ndarray[DFIELD_t, ndim=1] betweenness_to_j = np.zeros(N, dtype=DFIELD)
+        ndarray[DFIELD_t, ndim=1] excess_to_j = np.zeros(N, dtype=DFIELD)
+        ndarray[DFIELD_t, ndim=1] betweenness_times_w = np.zeros(N, dtype=DFIELD)
 
-    # init distances to j and queue of nodes by distance from j
-    for l in range(N):
-        # distances_to_j[l] = 2 * N
-        # n_predecessors[l] = 0
-        # multiplicity_to_j[l] = 0.0
-        # initialize contribution of paths ending in j to the betweenness of l
-        excess_to_j[l] = betweenness_to_j[l] = is_source[l] * w[l]
-
-    distances_to_j[j] = 0
-    queue[0] = j
-    queue_len = 1
-    multiplicity_to_j[j] = w[j]
-
-    # process the queue forward and grow it on the way: (this is the standard
-    # breadth-first search giving all the shortest paths to j)
-    qi = 0
-    while qi < queue_len:
-        #for qi in range(queue_len):
-        i = queue[qi]
-        if i == -1:
-            # this should never happen ...
-            print("Opps: %d,%d,%d\n" % qi, queue_len, i)
-            break
-        next_d = distances_to_j[i] + 1
-        #iterate through all neighbors l of i
-        oi = offsets[i]
-        for l_index in range(oi, oi+k[i]):
-            # if on a shortes j-l-path, register i as predecessor of l
-            l = flat_neighbors[l_index]
-            dl = distances_to_j[l]
-            if dl >= next_d:
-                fi = offsets[l] + n_predecessors[l]
-                n_predecessors[l] += 1
-                flat_predecessors[fi] = i
-                multiplicity_to_j[l] += w[l] * multiplicity_to_j[i]
-                if dl > next_d:
-                    distances_to_j[l] = next_d
-                    queue[queue_len] = l
-                    queue_len += 1
-        qi += 1
-
-    # process the queue again backward: (this is Newman's 2nd part where
-    # the contribution of paths ending in j to the betweenness of all nodes
-    # is computed recursively by traversing the shortest paths backwards)
-    for ql in range(queue_len-1, -1, -1):
-        l = queue[ql]
-        if l == -1:
-            print("Opps: %d,%d,%d\n" % ql, queue_len, l)
-            break
-        if l == j:
-            # set betweenness and excess to zero
-            betweenness_to_j[l] = excess_to_j[l] = 0
-        else:
-            # otherwise, iterate through all predecessors i of l:
-            base_factor = w[l] / multiplicity_to_j[l]
-            ol = offsets[l]
-            for fi in range(ol, ol+n_predecessors[l]):
-                # add betweenness to predecessor
-                i = flat_predecessors[fi]
-                betweenness_to_j[i] += betweenness_to_j[l] * base_factor * \
-                    multiplicity_to_j[i]
+    # init node offsets
+    # NOTE: We don't use k.cumsum() since that uses too much memory!
+    for i in range(1, N):
+        offsets[i] = offsets[i-1] + k[i-1]
+
+    for j in targets:
+        # init distances to j and queue of nodes by distance from j
+        distances_to_j.fill(2 * N)
+        n_predecessors.fill(0)
+        flat_predecessors.fill(0)
+        queue.fill(0)
+        multiplicity_to_j.fill(0)
+
+        # init contribution of paths ending in j to the betweenness of l
+        for l in range(N):
+            excess_to_j[l] = betweenness_to_j[l] = is_source[l] * w[l]
+
+        distances_to_j[j] = 0
+        queue[0] = j
+        queue_len = 1
+        multiplicity_to_j[j] = w[j]
+
+        # process the queue forward and grow it on the way: (this is the
+        # standard breadth-first search giving all the shortest paths to j)
+        qi = 0
+        while qi < queue_len:
+            i = queue[qi]
+            if i == -1:
+                # this should never happen ...
+                print("Oops: %d,%d,%d\n" % (qi, queue_len, i))
+                break
+            next_d = distances_to_j[i] + 1
+            # iterate through all neighbors l of i
+            oi = offsets[i]
+            for l_index in range(oi, oi+k[i]):
+                # if on a shortest j-l-path, register i as predecessor of l
+                l = flat_neighbors[l_index]
+                dl = distances_to_j[l]
+                if dl >= next_d:
+                    fi = offsets[l] + n_predecessors[l]
+                    n_predecessors[l] += 1
+                    flat_predecessors[fi] = i
+                    multiplicity_to_j[l] += w[l] * multiplicity_to_j[i]
+                    if dl > next_d:
+                        distances_to_j[l] = next_d
+                        queue[queue_len] = l
+                        queue_len += 1
+            qi += 1
+
+        # process the queue again backward: (this is Newman's 2nd part where the
+        # contribution of paths ending in j to the betweenness of all nodes is
+        # computed recursively by traversing the shortest paths backwards)
+        for ql in range(queue_len-1, -1, -1):
+            l = queue[ql]
+            if l == -1:
+                print("Oops: %d,%d,%d\n" % (ql, queue_len, l))
+                break
+            if l == j:
+                # set betweenness and excess to zero
+                betweenness_to_j[l] = excess_to_j[l] = 0
+            else:
+                # otherwise, iterate through all predecessors i of l:
+                base_factor = w[l] / multiplicity_to_j[l]
+                ol = offsets[l]
+                for fi in range(ol, ol+n_predecessors[l]):
+                    # add betweenness to predecessor
+                    i = flat_predecessors[fi]
+                    betweenness_to_j[i] += betweenness_to_j[l] * base_factor * \
+                        multiplicity_to_j[i]
+
+        betweenness_times_w += w[j] * (betweenness_to_j - excess_to_j)
+    return betweenness_times_w
 
 
 def _mpi_newman_betweenness(
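
For reference, the kernel's flat adjacency layout (the `offsets`/`flat_neighbors` pair, now built inside the Cython function) works like a CSR index. A small NumPy sketch, using a hypothetical 3-node triangle graph, not taken from the commit:

    import numpy as np

    # Triangle graph 0-1, 0-2, 1-2; each undirected link is stored twice.
    k = np.array([2, 2, 2])                        # node degrees
    flat_neighbors = np.array([1, 2, 0, 2, 0, 1])  # concatenated neighbour lists

    # Same loop as in the kernel; k.cumsum() is avoided to save memory.
    offsets = np.zeros(len(k), dtype=int)
    for i in range(1, len(k)):
        offsets[i] = offsets[i-1] + k[i-1]

    # Neighbours of node i live at flat_neighbors[offsets[i]:offsets[i]+k[i]].
    assert list(flat_neighbors[offsets[1]:offsets[1]+k[1]]) == [0, 2]

    # This is the precondition asserted in network.py before calling the kernel:
    assert k.sum() == len(flat_neighbors)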
77 changes: 34 additions & 43 deletions src/pyunicorn/core/network.py
@@ -45,6 +45,7 @@
 
 import igraph # high performance graph theory tools
 
+from multiprocess import Pool, cpu_count
 from ..utils import mpi # parallelized computations
 
 from ._ext.types import \
@@ -261,8 +262,8 @@ def __init__(self, adjacency=None, n_nodes=None, edge_list=None,
         elif edge_list is not None:
             self.set_edge_list(edge_list, n_nodes)
         else:
-            raise NetworkError("An adjacency matrix or edge list has to be \
-                given to initialize an instance of Network.")
+            raise NetworkError("An adjacency matrix or edge list has to be "
+                               "given to initialize an instance of Network.")
 
         self._set_node_weights(node_weights)
         self.degree()
@@ -854,7 +855,8 @@ def ErdosRenyi(n_nodes=100, link_probability=None, n_links=None,
             graph = igraph.Graph.Erdos_Renyi(n=n_nodes, m=n_links)
 
         else:
-            return None
+            raise ValueError("`ErdosRenyi()` requires either a "
+                             "`link_probability` or `n_links` argument.")
 
         # return adjacency matrix
         return np.array(graph.get_adjacency(type=2).data)
@@ -2681,7 +2683,7 @@ def assortativity(self):
         >>> r(Network.SmallTestNetwork().assortativity())
         -0.4737
 
-        :rtype: float between 0 and 1
+        :rtype: float
         """
         degrees = self.graph.degree()
         degrees_sq = [deg**2 for deg in degrees]
@@ -3067,7 +3069,7 @@ def link_betweenness(self):
         [ 0.   2.   0.   0.   3.   0. ] [ 3.5  3.5  0.   0.   0.   0. ]
         [ 5.5  2.5  3.   0.   0.   0. ] [ 5.   0.   0.   0.   0.   0. ]]
 
-        :rtype: square numpy array [node,node] of floats between 0 and 1
+        :rtype: square numpy array [node,node] of floats
         :return: Entry [i,j] is the betweenness of the link between i and j,
             or 0 if i is not linked to j.
         """
@@ -3106,7 +3108,7 @@ def edge_betweenness(self):
         [ 0.   2.   0.   0.   3.   0. ] [ 3.5  3.5  0.   0.   0.   0. ]
         [ 5.5  2.5  3.   0.   0.   0. ] [ 5.   0.   0.   0.   0.   0. ]]
 
-        :rtype: square numpy array [node,node] of floats between 0 and 1
+        :rtype: square numpy array [node,node] of floats
         :return: Entry [i,j] is the betweenness of the link between i and j,
             or 0 if i is not linked to j.
         """
@@ -3172,7 +3174,7 @@ def interregional_betweenness(self, sources=None, targets=None):
         :type targets: 1d numpy array or list of ints from 0 to n_nodes-1
         :arg targets: Set of target node indices.
 
-        :rtype: 1d numpy array [node] of floats between 0 and 1
+        :rtype: 1d numpy array [node] of floats
         """
         return self.nsi_betweenness(sources=sources, targets=targets,
                                     aw=0, silent=1)
@@ -3201,13 +3203,13 @@ def nsi_interregional_betweenness(self, sources, targets):
         Calculating interregional betweenness...
         array([ 1.,  1.,  0.,  0.,  1.,  0.])
 
-        :rtype: 1d numpy array [node] of floats between 0 and 1
+        :rtype: 1d numpy array [node] of floats
         """
         return self.nsi_betweenness(sources=sources, targets=targets, silent=1)
 
-    def nsi_betweenness(self, **kwargs):
+    def nsi_betweenness(self, parallelize: bool = False, **kwargs):
         """
-        For each node, return its n.s.i. betweenness.
+        For each node, return its n.s.i. betweenness. [Newman2001]_
 
         This measures roughly how many shortest paths pass through the node,
         taking node weights into account.
@@ -3232,7 +3234,7 @@ def nsi_betweenness(self, **kwargs):
         Calculating node betweenness...
         array([ 8.5,  1.5,  0. ,  1.5,  4.5,  0. ,  0. ])
 
-        :rtype: 1d numpy array [node] of floats between 0 and 1
+        :rtype: 1d numpy array [node] of floats
         """
         if self.silence_level <= 1:
             if "silent" not in kwargs:
@@ -3242,50 +3244,39 @@ def nsi_betweenness(self, **kwargs):
         if "aw" in kwargs:
             if kwargs["aw"] == 0:
                 w = np.ones_like(w)
 
         N, k = self.N, self.degree()
-        rN = range(0, N)
-        betweenness_times_w = np.zeros(N, dtype=DFIELD)
 
-        # initialize node lists:
+        # initialize node lists
         is_source = np.zeros(N, dtype=MASK)
         if "sources" in kwargs and kwargs["sources"] is not None:
             is_source[kwargs["sources"]] = 1
         else:
-            is_source[rN] = 1
+            is_source[range(0, N)] = 1
         if "targets" in kwargs and kwargs["targets"] is not None:
-            targets = kwargs["targets"]
+            targets = np.array(list(map(int, kwargs["targets"])))
         else:
-            targets = rN
+            targets = np.arange(0, N)
 
-        # node offsets for flat arrays:
-        # NOTE: We don't use k.cumsum() since that uses too much memory!
-        offsets = np.zeros(N, dtype=NODE)
-        for i in range(1, N):
-            offsets[i] = offsets[i-1] + k[i-1]
-
-        # sort links by node indices (contains each link twice!):
+        # sort links by node indices (contains each link twice!)
         links = nz_coords(self.sp_A)
 
-        # neighbours of each node:
+        # neighbours of each node
        flat_neighbors = to_cy(np.array(links)[:, 1], NODE)
-        E = len(flat_neighbors)
 
-        # this main loop might be parallelized:
-        for j0 in targets:
-            j = int(j0)
-
-            betweenness_to_j = to_cy(w, DFIELD)
-            excess_to_j = to_cy(w, DFIELD)
-            flat_predecessors = np.zeros(E, dtype=NODE)
-            _nsi_betweenness(
-                N, to_cy(w, DWEIGHT), to_cy(k, DEGREE), j,
-                betweenness_to_j, excess_to_j, offsets, flat_neighbors,
-                is_source, flat_predecessors)
-            del flat_predecessors
-            betweenness_times_w += w[j] * (betweenness_to_j - excess_to_j)
-
-        return betweenness_times_w / w
+        assert k.sum() == len(flat_neighbors) == 2 * self.n_links
+        w, k = to_cy(w, DWEIGHT), to_cy(k, DEGREE)
+
+        def worker(batch):
+            return _nsi_betweenness(N, w, k, batch, flat_neighbors, is_source)
+
+        if parallelize:
+            # (naively) parallelize loop over nodes
+            n_workers = cpu_count()
+            batches = np.array_split(to_cy(targets, NODE), n_workers)
+            pool = Pool()
+            betw_w = np.sum(pool.map(worker, batches), axis=0)
+        else:
+            betw_w = worker(to_cy(targets, NODE))
+        return betw_w / w
 
     def _eigenvector_centrality_slow(self, link_attribute=None):
         """
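
The parallel branch above follows a plain map-reduce pattern: each worker receives its own copy of all inputs (no shared memory), computes a full-length contribution vector for its batch of targets, and the partial results are summed. A standalone sketch with a hypothetical stand-in for the Cython kernel, not taken from the commit:

    import numpy as np
    from multiprocess import Pool, cpu_count

    N = 10  # number of nodes

    def worker(batch):
        # Stand-in for _nsi_betweenness(): returns a full-length (N,) vector
        # holding only this batch's contributions, so partials can be summed.
        out = np.zeros(N)
        out[batch] = 1.0
        return out

    targets = np.arange(N)
    batches = np.array_split(targets, cpu_count())
    with Pool() as pool:
        total = np.sum(pool.map(worker, batches), axis=0)
    assert np.allclose(total, np.ones(N))

Note that `multiprocess` (rather than the standard library's `multiprocessing`) is used because its dill-based serialization can ship closures like `worker` to the child processes.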
