From a3d9e42935fc3aa2e24b5d1a90c995d9910dc6c5 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 18 Dec 2020 12:28:12 +0100
Subject: [PATCH 1/8] pyln: Log mempool when we generate blocks in BitcoinD fixture

We sometimes have very specific sequences of tx broadcasts and blocks
being generated to confirm them. If a confirmation is missed, the test
can get completely out of sync. Make debugging this easier by logging
what we confirmed.
---
 contrib/pyln-testing/pyln/testing/utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index 9ab4e17437cc..2435df73dde0 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -401,6 +401,14 @@ def generate_block(self, numblocks=1, wait_for_mempool=0):
                 wait_for(lambda: all(txid in self.rpc.getrawmempool() for txid in wait_for_mempool))
             else:
                 wait_for(lambda: len(self.rpc.getrawmempool()) >= wait_for_mempool)
+
+        mempool = self.rpc.getrawmempool()
+        logging.debug("Generating {numblocks}, confirming {lenmempool} transactions: {mempool}".format(
+            numblocks=numblocks,
+            mempool=mempool,
+            lenmempool=len(mempool),
+        ))
+
         # As of 0.16, generate() is removed; use generatetoaddress.
         return self.rpc.generatetoaddress(numblocks, self.rpc.getnewaddress())

From 594b344eaf641c2d0c708f0bf22da4725871a871 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 18 Dec 2020 12:37:35 +0100
Subject: [PATCH 2/8] pyln: Add logging to bitcoind RPC calls

Useful when we need to debug interactions with bitcoind in more detail.
---
 contrib/pyln-testing/pyln/testing/utils.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index 2435df73dde0..d60c0eba10b0 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -321,7 +321,16 @@ def __getattr__(self, name):
         proxy = BitcoinProxy(btc_conf_file=self.__btc_conf_file__)
 
         def f(*args):
-            return proxy._call(name, *args)
+            logging.debug("Calling {name} with arguments {args}".format(
+                name=name,
+                args=args
+            ))
+            res = proxy._call(name, *args)
+            logging.debug("Result for {name} call: {res}".format(
+                name=name,
+                res=res,
+            ))
+            return res
 
         # Make debuggers show <function bitcoind.rpc.name> rather than <function
         # bitcoind.rpc.<locals>.f>

From 7e9f9cc621c3b976153005dedb2434efa7d537e7 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 11 Dec 2020 12:51:45 +0100
Subject: [PATCH 3/8] pytest: Add throttler to limit load on the test system

Both my machine and apparently the CI tester machines regularly run
into issues with load on the system, causing timeouts (and general
unresponsiveness). The throttler limits the speed with which new
instances of c-lightning get started, to avoid overloading the system.
Since the plugin used for test parallelism spawns multiple processes,
we need to lock on the filesystem. Since we have that file open
already, we'll also write a couple of performance metrics to it.
---
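
Not part of the commit itself: each call to `wait()` appends one line to
/tmp/ltest-throttler.csv containing the timestamp, the smoothed load, the
target load and the delay that was imposed, in that order. A throwaway
snippet along these lines (hypothetical helper, assuming the CSV already
exists) can be used to eyeball how much throttling actually happened:

```
import csv

# Columns written by Throttler.wait(): timestamp, smoothed load,
# target load, and the delay (in seconds) the caller was held back.
with open("/tmp/ltest-throttler.csv") as f:
    rows = [[float(field) for field in row] for row in csv.reader(f)]

delays = [row[3] for row in rows]
if delays:
    print("throttled {} times, total delay {:.1f}s, max {:.1f}s".format(
        len(delays), sum(delays), max(delays)))
```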
 contrib/pyln-testing/pyln/testing/fixtures.py | 12 +++-
 contrib/pyln-testing/pyln/testing/utils.py    | 56 ++++++++++++++++++-
 contrib/pyln-testing/requirements.txt         |  2 +
 tests/fixtures.py                             |  2 +-
 4 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/contrib/pyln-testing/pyln/testing/fixtures.py b/contrib/pyln-testing/pyln/testing/fixtures.py
index ea6ccdc5f1b6..ae5fcd51f50e 100644
--- a/contrib/pyln-testing/pyln/testing/fixtures.py
+++ b/contrib/pyln-testing/pyln/testing/fixtures.py
@@ -1,6 +1,6 @@
 from concurrent import futures
 from pyln.testing.db import SqliteDbProvider, PostgresDbProvider
-from pyln.testing.utils import NodeFactory, BitcoinD, ElementsD, env, DEVELOPER, LightningNode, TEST_DEBUG
+from pyln.testing.utils import NodeFactory, BitcoinD, ElementsD, env, DEVELOPER, LightningNode, TEST_DEBUG, Throttler
 from typing import Dict
 
 import logging
@@ -198,7 +198,12 @@ def teardown_checks(request):
 
 
 @pytest.fixture
-def node_factory(request, directory, test_name, bitcoind, executor, db_provider, teardown_checks, node_cls):
+def throttler():
+    yield Throttler()
+
+
+@pytest.fixture
+def node_factory(request, directory, test_name, bitcoind, executor, db_provider, teardown_checks, node_cls, throttler):
     nf = NodeFactory(
         request,
         test_name,
@@ -206,7 +211,8 @@ def node_factory(request, directory, test_name, bitcoind, executor, db_provider
         executor,
         directory=directory,
         db_provider=db_provider,
-        node_cls=node_cls
+        node_cls=node_cls,
+        throttler=throttler,
     )
     yield nf

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index d60c0eba10b0..66fe4c47dee7 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -6,6 +6,7 @@
 from collections import OrderedDict
 from decimal import Decimal
 from ephemeral_port_reserve import reserve  # type: ignore
+from filelock import FileLock
 from pyln.client import LightningRpc
 from pyln.client import Millisatoshi
 
@@ -14,6 +15,7 @@
 import lzma
 import math
 import os
+import psutil  # type: ignore
 import random
 import re
 import shutil
@@ -1038,10 +1040,59 @@ def passes_filters(hmsg, filters):
     return msgs
 
 
+class Throttler(object):
+    """Throttles the creation of system-processes to avoid overload.
+
+    There is no reason to overload the system with too many processes
+    being spawned or run at the same time. It causes timeouts by
+    aggressively preempting processes and swapping if the memory limit is
+    reached. In order to reduce this loss of performance we provide a
+    `wait()` method which will serialize the creation of processes, but
+    also delay if the system load is too high.
+
+    Notice that technically we are throttling too late, i.e., we react
+    to an overload, but chances are pretty good that some other
+    already running process is about to terminate, and so the overload
+    is short-lived. We throttle when the process object is first
+    created, not when restarted, in order to avoid delaying running
+    tests, which could cause more timeouts.
+
+    """
+    def __init__(self, target: float = 75):
+        """If specified we try to stick to a load of target (in percent).
+        """
+        self.target = target
+        self.lock = FileLock("/tmp/ltest.lock")
+        self.current_load = self.target  # Start slow
+        psutil.cpu_percent()  # Prime the internal load metric
+
+    def wait(self):
+        start_time = time.time()
+        with self.lock.acquire(poll_intervall=0.250):
+            # We just got the lock, assume someone else just released it
+            self.current_load = 100
+            while self.load() >= self.target:
+                time.sleep(1)
+
+            delay = time.time() - start_time
+            with open("/tmp/ltest-throttler.csv", "a") as f:
+                f.write("{}, {}, {}, {}\n".format(time.time(), self.load(), self.target, delay))
+            self.current_load = 100  # Back off slightly to avoid triggering right away
+
+    def load(self):
+        """An exponential moving average of the load
+        """
+        decay = 0.5
+        load = psutil.cpu_percent()
+        self.current_load = decay * load + (1 - decay) * self.current_load
+        return self.current_load
+
+
 class NodeFactory(object):
     """A factory to setup and start `lightningd` daemons.
     """
-    def __init__(self, request, testname, bitcoind, executor, directory, db_provider, node_cls):
+    def __init__(self, request, testname, bitcoind, executor, directory,
+                 db_provider, node_cls, throttler):
         if request.node.get_closest_marker("slow_test") and SLOW_MACHINE:
             self.valgrind = False
         else:
@@ -1055,6 +1106,7 @@ def __init__(self, request, testname, bitcoind, executor, directory, db_provider
         self.lock = threading.Lock()
         self.db_provider = db_provider
         self.node_cls = node_cls
+        self.throttler = throttler
 
     def split_options(self, opts):
         """Split node options from cli options
@@ -1115,7 +1167,7 @@ def get_node(self, node_id=None, options=None, dbfile=None,
                  feerates=(15000, 11000, 7500, 3750), start=True,
                  wait_for_bitcoind_sync=True, may_fail=False,
                  expect_fail=False, cleandir=True, **kwargs):
-
+        self.throttler.wait()
         node_id = self.get_node_id() if not node_id else node_id
         port = self.get_next_port()

diff --git a/contrib/pyln-testing/requirements.txt b/contrib/pyln-testing/requirements.txt
index b28809f2dbdf..a4c3c932491c 100644
--- a/contrib/pyln-testing/requirements.txt
+++ b/contrib/pyln-testing/requirements.txt
@@ -5,3 +5,5 @@ cheroot==8.2.1
 ephemeral-port-reserve==1.1.1
 python-bitcoinlib==0.10.2
 psycopg2-binary==2.8.4
+filelock==3.0.*
+psutil==5.7.*

diff --git a/tests/fixtures.py b/tests/fixtures.py
index 5fb6ad39d993..8a7e9c249b5a 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -1,5 +1,5 @@
 from utils import DEVELOPER, TEST_NETWORK  # noqa: F401,F403
-from pyln.testing.fixtures import directory, test_base_dir, test_name, chainparams, node_factory, bitcoind, teardown_checks, db_provider, executor, setup_logging  # noqa: F401,F403
+from pyln.testing.fixtures import directory, test_base_dir, test_name, chainparams, node_factory, bitcoind, teardown_checks, throttler, db_provider, executor, setup_logging  # noqa: F401,F403
 from pyln.testing import utils
 from utils import COMPAT

From e80310ace6ab86e8626e5e85109d8aaa0b875584 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Tue, 15 Dec 2020 11:03:16 +0100
Subject: [PATCH 4/8] pyln: Check that the stderr line is not None in pyln-testing tail

I stumbled over this in a test run and it seems benign.
---
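
For context, not part of the commit: `tail()` iterates with
`iter(self.proc.stderr.readline, '')`, and since stderr is read as bytes
here the `''` (str) sentinel never matches, so the explicit emptiness
check is what actually ends the loop. A minimal standalone sketch of the
pattern, using `io.BytesIO` as a stand-in for the process pipe:

```
import io

stream = io.BytesIO(b"first line\nsecond line\n")

# readline() returns b'' at EOF, which is not equal to the str sentinel '',
# so the break below is what actually terminates the loop.
for line in iter(stream.readline, ''):
    if line is None or len(line) == 0:
        break
    print(line.rstrip().decode('UTF-8', 'replace'))
```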
 contrib/pyln-testing/pyln/testing/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index 66fe4c47dee7..623fb5401148 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -227,7 +227,7 @@ def tail(self):
 
         if self.proc.stderr:
             for line in iter(self.proc.stderr.readline, ''):
-                if len(line) == 0:
+                if line is None or len(line) == 0:
                     break
                 self.err_logs.append(line.rstrip().decode('UTF-8', 'replace')).rstrip()
             self.proc.stderr.close()

From 78fc5b705d728f06a3e30ac992ce95d4a2e22a76 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 18 Dec 2020 10:42:00 +0100
Subject: [PATCH 5/8] pytest: Stabilize test_routing_gossip

We were getting "bad gossip" because some nodes discarded the channel
announcement for being in the future: at that point they were still
below the confirmation height. They would then also discard the
follow-up messages, since those were not preceded by a valid
announcement, and complain about it.
---
 tests/test_gossip.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_gossip.py b/tests/test_gossip.py
index b598a6d25612..121f64198102 100644
--- a/tests/test_gossip.py
+++ b/tests/test_gossip.py
@@ -510,6 +510,11 @@ def test_routing_gossip(node_factory, bitcoind):
         src.rpc.connect(dst.info['id'], 'localhost', dst.port)
         src.openchannel(dst, 25000)
 
+    # Avoid "bad gossip" caused by future announcements (a node below
+    # confirmation height receiving and ignoring the announcement,
+    # thus marking followup messages as bad).
+    sync_blockheight(bitcoind, nodes)
+
     # Allow announce messages.
     bitcoind.generate_block(5)

From df34ba779b04e23d0836376b339828cb9610b154 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 18 Dec 2020 10:50:49 +0100
Subject: [PATCH 6/8] pyln: Do not rstrip() the return value of .append()

This was causing the following error:

```
Exception in thread Thread-553:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/cirrus-ci-build/contrib/pyln-testing/pyln/testing/utils.py", line 232, in tail
    self.err_logs.append(line.rstrip().decode('UTF-8', 'replace')).rstrip()
AttributeError: 'NoneType' object has no attribute 'rstrip'
[gw5] [ 33%] FAILED tests/test_misc.py::test_bitcoin_failure
```

Notice the second call to `.rstrip()`, applied to the return value of
`.append()`.
---
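
Not part of the commit, just to spell the failure mode out: `list.append()`
mutates the list in place and always returns `None`, so any method chained
onto its return value blows up:

```
err_logs = []
result = err_logs.append("some stderr line")
print(result)  # None: append() mutates the list in place and returns nothing
# Chaining on it fails:
# result.rstrip()  ->  AttributeError: 'NoneType' object has no attribute 'rstrip'
```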
 contrib/pyln-testing/pyln/testing/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index 623fb5401148..310ebde5e221 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -227,9 +227,13 @@ def tail(self):
 
         if self.proc.stderr:
             for line in iter(self.proc.stderr.readline, ''):
+
                 if line is None or len(line) == 0:
                     break
-                self.err_logs.append(line.rstrip().decode('UTF-8', 'replace')).rstrip()
+
+                line = line.rstrip().decode('UTF-8', 'replace')
+                self.err_logs.append(line)
+
             self.proc.stderr.close()

From cc3a8e93a7d6d870bbd3abcc5c62b4fd2d1eb22f Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Mon, 21 Dec 2020 13:09:49 +0100
Subject: [PATCH 7/8] pytest: Disable test_funding_cancel_race on slow machines

The CI regularly trips up on this very taxing test (100 nodes), so only
run it if we have the required horsepower.
---
 tests/test_connection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_connection.py b/tests/test_connection.py
index 6988cf44c727..3f736eb2b729 100644
--- a/tests/test_connection.py
+++ b/tests/test_connection.py
@@ -11,6 +11,7 @@
     check_coin_moves, first_channel_id, account_balance, basic_fee,
     EXPERIMENTAL_FEATURES
 )
+from pyln.testing.utils import SLOW_MACHINE, VALGRIND
 from bitcoin.core import CMutableTransaction, CMutableTxOut
 
 import binascii
@@ -1042,6 +1043,7 @@ def test_funding_external_wallet_corners(node_factory, bitcoind):
     l1.rpc.close(l2.info['id'])
 
 
+@unittest.skipIf(SLOW_MACHINE and not VALGRIND, "Way too taxing on CI machines")
 def test_funding_cancel_race(node_factory, bitcoind, executor):
     l1 = node_factory.get_node()

From 336e972bf80028d2561f60b9685f888e73f0ae7f Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Mon, 21 Dec 2020 15:06:07 +0100
Subject: [PATCH 8/8] pytest: Really only check DB statements if the env-var is "1"

---
 tests/fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fixtures.py b/tests/fixtures.py
index 8a7e9c249b5a..c5a9aefadf17 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -19,7 +19,7 @@ def __init__(self, *args, **kwargs):
 
         # If we opted into checking the DB statements we will attach the dblog
         # plugin before starting the node
-        check_dblog = os.environ.get("TEST_CHECK_DBSTMTS", None) is not None
+        check_dblog = os.environ.get("TEST_CHECK_DBSTMTS", None) == "1"
         db = os.environ.get("TEST_DB_PROVIDER", "sqlite3")
         if db == 'sqlite3' and check_dblog:
             dblog = os.path.join(os.path.dirname(__file__), 'plugins', 'dblog.py')
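
To illustrate the behavioural change of this last patch (a standalone
sketch, not part of the diff): the old `is not None` check enabled the
dblog plugin as soon as the variable was set at all, even to `0` or an
empty string, whereas the new comparison only accepts the literal value `1`:

```
import os

os.environ["TEST_CHECK_DBSTMTS"] = "0"

old_check = os.environ.get("TEST_CHECK_DBSTMTS", None) is not None
new_check = os.environ.get("TEST_CHECK_DBSTMTS", None) == "1"

print(old_check)  # True  -> dblog plugin would have been attached anyway
print(new_check)  # False -> only TEST_CHECK_DBSTMTS=1 enables it now
```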