From a3d9e42935fc3aa2e24b5d1a90c995d9910dc6c5 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 18 Dec 2020 12:28:12 +0100
Subject: [PATCH 1/8] pyln: Log mempool when we generate blocks in BitcoinD fixture

We sometimes have very specific sequences of tx broadcasts and blocks
being generated to confirm them. If a confirmation is missed, the test
can get completely out of sync. Make debugging this easier by logging
what we confirmed.
---
 contrib/pyln-testing/pyln/testing/utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index 9ab4e17437cc..2435df73dde0 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -401,6 +401,14 @@ def generate_block(self, numblocks=1, wait_for_mempool=0):
                 wait_for(lambda: all(txid in self.rpc.getrawmempool() for txid in wait_for_mempool))
             else:
                 wait_for(lambda: len(self.rpc.getrawmempool()) >= wait_for_mempool)
+
+        mempool = self.rpc.getrawmempool()
+        logging.debug("Generating {numblocks}, confirming {lenmempool} transactions: {mempool}".format(
+            numblocks=numblocks,
+            mempool=mempool,
+            lenmempool=len(mempool),
+        ))
+
         # As of 0.16, generate() is removed; use generatetoaddress.
         return self.rpc.generatetoaddress(numblocks, self.rpc.getnewaddress())

From 594b344eaf641c2d0c708f0bf22da4725871a871 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 18 Dec 2020 12:37:35 +0100
Subject: [PATCH 2/8] pyln: Add logging to bitcoind RPC calls

Useful when we need to debug interactions with bitcoind in more detail.
---
 contrib/pyln-testing/pyln/testing/utils.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index 2435df73dde0..d60c0eba10b0 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -321,7 +321,16 @@ def __getattr__(self, name):
         proxy = BitcoinProxy(btc_conf_file=self.__btc_conf_file__)
 
         def f(*args):
-            return proxy._call(name, *args)
+            logging.debug("Calling {name} with arguments {args}".format(
+                name=name,
+                args=args
+            ))
+            res = proxy._call(name, *args)
+            logging.debug("Result for {name} call: {res}".format(
+                name=name,
+                res=res,
+            ))
+            return res
 
         # Make debuggers show <function bitcoind.rpc.name> rather than <function
         # bitcoind.rpc.<locals>.f>

From 7e9f9cc621c3b976153005dedb2434efa7d537e7 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 11 Dec 2020 12:51:45 +0100
Subject: [PATCH 3/8] pytest: Add throttler to limit load on the test system

Both my machine and apparently the CI tester machines regularly run
into issues with load on the system, causing timeouts (and general
unresponsiveness). The throttler limits the speed with which new
instances of c-lightning get started, to avoid overloading the system.
Since the plugin used for test parallelism spawns multiple processes,
we need to lock on the filesystem. Since we have that file open
already, we'll also write a couple of performance metrics to it.
---
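
Not part of the commit itself: each call to `wait()` appends one line to
/tmp/ltest-throttler.csv containing the timestamp, the smoothed load, the
target load and the delay that was imposed, in that order. A throwaway
snippet along these lines (hypothetical helper, assuming the CSV already
exists) can be used to eyeball how much throttling actually happened:

```
import csv

# Columns written by Throttler.wait(): timestamp, smoothed load,
# target load, and the delay (in seconds) the caller was held back.
with open("/tmp/ltest-throttler.csv") as f:
    rows = [[float(field) for field in row] for row in csv.reader(f)]

delays = [row[3] for row in rows]
if delays:
    print("throttled {} times, total delay {:.1f}s, max {:.1f}s".format(
        len(delays), sum(delays), max(delays)))
```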
 contrib/pyln-testing/pyln/testing/fixtures.py | 12 +++-
 contrib/pyln-testing/pyln/testing/utils.py    | 56 ++++++++++++++++++-
 contrib/pyln-testing/requirements.txt         |  2 +
 tests/fixtures.py                             |  2 +-
 4 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/contrib/pyln-testing/pyln/testing/fixtures.py b/contrib/pyln-testing/pyln/testing/fixtures.py
index ea6ccdc5f1b6..ae5fcd51f50e 100644
--- a/contrib/pyln-testing/pyln/testing/fixtures.py
+++ b/contrib/pyln-testing/pyln/testing/fixtures.py
@@ -1,6 +1,6 @@
 from concurrent import futures
 from pyln.testing.db import SqliteDbProvider, PostgresDbProvider
-from pyln.testing.utils import NodeFactory, BitcoinD, ElementsD, env, DEVELOPER, LightningNode, TEST_DEBUG
+from pyln.testing.utils import NodeFactory, BitcoinD, ElementsD, env, DEVELOPER, LightningNode, TEST_DEBUG, Throttler
 from typing import Dict
 
 import logging
@@ -198,7 +198,12 @@ def teardown_checks(request):
 
 
 @pytest.fixture
-def node_factory(request, directory, test_name, bitcoind, executor, db_provider, teardown_checks, node_cls):
+def throttler():
+    yield Throttler()
+
+
+@pytest.fixture
+def node_factory(request, directory, test_name, bitcoind, executor, db_provider, teardown_checks, node_cls, throttler):
     nf = NodeFactory(
         request,
         test_name,
@@ -206,7 +211,8 @@ def node_factory(request, directory, test_name, bitcoind, executor, db_provider
         executor,
         directory=directory,
         db_provider=db_provider,
-        node_cls=node_cls
+        node_cls=node_cls,
+        throttler=throttler,
     )
     yield nf

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index d60c0eba10b0..66fe4c47dee7 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -6,6 +6,7 @@
 from collections import OrderedDict
 from decimal import Decimal
 from ephemeral_port_reserve import reserve  # type: ignore
+from filelock import FileLock
 from pyln.client import LightningRpc
 from pyln.client import Millisatoshi
 
@@ -14,6 +15,7 @@
 import lzma
 import math
 import os
+import psutil  # type: ignore
 import random
 import re
 import shutil
@@ -1038,10 +1040,59 @@ def passes_filters(hmsg, filters):
     return msgs
 
 
+class Throttler(object):
+    """Throttles the creation of system-processes to avoid overload.
+
+    There is no reason to overload the system with too many processes
+    being spawned or run at the same time. It causes timeouts by
+    aggressively preempting processes and swapping if the memory limit is
+    reached. In order to reduce this loss of performance we provide a
+    `wait()` method which will serialize the creation of processes, but
+    also delay if the system load is too high.
+
+    Notice that technically we are throttling too late, i.e., we react
+    to an overload, but chances are pretty good that some other
+    already running process is about to terminate, and so the overload
+    is short-lived. We throttle when the process object is first
+    created, not when restarted, in order to avoid delaying running
+    tests, which could cause more timeouts.
+
+    """
+    def __init__(self, target: float = 75):
+        """If specified we try to stick to a load of target (in percent).
+        """
+        self.target = target
+        self.lock = FileLock("/tmp/ltest.lock")
+        self.current_load = self.target  # Start slow
+        psutil.cpu_percent()  # Prime the internal load metric
+
+    def wait(self):
+        start_time = time.time()
+        with self.lock.acquire(poll_intervall=0.250):
+            # We just got the lock, assume someone else just released it
+            self.current_load = 100
+            while self.load() >= self.target:
+                time.sleep(1)
+
+            delay = time.time() - start_time
+            with open("/tmp/ltest-throttler.csv", "a") as f:
+                f.write("{}, {}, {}, {}\n".format(time.time(), self.load(), self.target, delay))
+            self.current_load = 100  # Back off slightly to avoid triggering right away
+
+    def load(self):
+        """An exponential moving average of the load
+        """
+        decay = 0.5
+        load = psutil.cpu_percent()
+        self.current_load = decay * load + (1 - decay) * self.current_load
+        return self.current_load
+
+
 class NodeFactory(object):
     """A factory to setup and start `lightningd` daemons.
     """
-    def __init__(self, request, testname, bitcoind, executor, directory, db_provider, node_cls):
+    def __init__(self, request, testname, bitcoind, executor, directory,
+                 db_provider, node_cls, throttler):
         if request.node.get_closest_marker("slow_test") and SLOW_MACHINE:
             self.valgrind = False
         else:
@@ -1055,6 +1106,7 @@ def __init__(self, request, testname, bitcoind, executor, directory, db_provider
         self.lock = threading.Lock()
         self.db_provider = db_provider
         self.node_cls = node_cls
+        self.throttler = throttler
 
     def split_options(self, opts):
         """Split node options from cli options
@@ -1115,7 +1167,7 @@ def get_node(self, node_id=None, options=None, dbfile=None,
                  feerates=(15000, 11000, 7500, 3750), start=True,
                  wait_for_bitcoind_sync=True, may_fail=False,
                  expect_fail=False, cleandir=True, **kwargs):
-
+        self.throttler.wait()
         node_id = self.get_node_id() if not node_id else node_id
         port = self.get_next_port()

diff --git a/contrib/pyln-testing/requirements.txt b/contrib/pyln-testing/requirements.txt
index b28809f2dbdf..a4c3c932491c 100644
--- a/contrib/pyln-testing/requirements.txt
+++ b/contrib/pyln-testing/requirements.txt
@@ -5,3 +5,5 @@ cheroot==8.2.1
 ephemeral-port-reserve==1.1.1
 python-bitcoinlib==0.10.2
 psycopg2-binary==2.8.4
+filelock==3.0.*
+psutil==5.7.*

diff --git a/tests/fixtures.py b/tests/fixtures.py
index 5fb6ad39d993..8a7e9c249b5a 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -1,5 +1,5 @@
 from utils import DEVELOPER, TEST_NETWORK  # noqa: F401,F403
-from pyln.testing.fixtures import directory, test_base_dir, test_name, chainparams, node_factory, bitcoind, teardown_checks, db_provider, executor, setup_logging  # noqa: F401,F403
+from pyln.testing.fixtures import directory, test_base_dir, test_name, chainparams, node_factory, bitcoind, teardown_checks, throttler, db_provider, executor, setup_logging  # noqa: F401,F403
 from pyln.testing import utils
 from utils import COMPAT

From e80310ace6ab86e8626e5e85109d8aaa0b875584 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Tue, 15 Dec 2020 11:03:16 +0100
Subject: [PATCH 4/8] pyln: Check that the stderr line is not None in pyln-testing tail

I stumbled over this in a test run and it seems benign.
---
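
For context, not part of the commit: `tail()` iterates with
`iter(self.proc.stderr.readline, '')`, and since stderr is read as bytes
here the `''` (str) sentinel never matches, so the explicit emptiness
check is what actually ends the loop. A minimal standalone sketch of the
pattern, using `io.BytesIO` as a stand-in for the process pipe:

```
import io

stream = io.BytesIO(b"first line\nsecond line\n")

# readline() returns b'' at EOF, which is not equal to the str sentinel '',
# so the break below is what actually terminates the loop.
for line in iter(stream.readline, ''):
    if line is None or len(line) == 0:
        break
    print(line.rstrip().decode('UTF-8', 'replace'))
```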
 contrib/pyln-testing/pyln/testing/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index 66fe4c47dee7..623fb5401148 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -227,7 +227,7 @@ def tail(self):
 
         if self.proc.stderr:
             for line in iter(self.proc.stderr.readline, ''):
-                if len(line) == 0:
+                if line is None or len(line) == 0:
                     break
                 self.err_logs.append(line.rstrip().decode('UTF-8', 'replace')).rstrip()
             self.proc.stderr.close()

From 78fc5b705d728f06a3e30ac992ce95d4a2e22a76 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 18 Dec 2020 10:42:00 +0100
Subject: [PATCH 5/8] pytest: Stabilize test_routing_gossip

We were getting "bad gossip" because some nodes discarded the channel
announcement for being in the future: at that point they were still
below the confirmation height. They would then also discard the
follow-up messages, since those were not preceded by a valid
announcement, and complain about it.
---
 tests/test_gossip.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_gossip.py b/tests/test_gossip.py
index b598a6d25612..121f64198102 100644
--- a/tests/test_gossip.py
+++ b/tests/test_gossip.py
@@ -510,6 +510,11 @@ def test_routing_gossip(node_factory, bitcoind):
         src.rpc.connect(dst.info['id'], 'localhost', dst.port)
         src.openchannel(dst, 25000)
 
+    # Avoid "bad gossip" caused by future announcements (a node below
+    # confirmation height receiving and ignoring the announcement,
+    # thus marking followup messages as bad).
+    sync_blockheight(bitcoind, nodes)
+
     # Allow announce messages.
     bitcoind.generate_block(5)

From df34ba779b04e23d0836376b339828cb9610b154 Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Fri, 18 Dec 2020 10:50:49 +0100
Subject: [PATCH 6/8] pyln: Do not rstrip() the return value of .append()

This was causing the following error:

```
Exception in thread Thread-553:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/cirrus-ci-build/contrib/pyln-testing/pyln/testing/utils.py", line 232, in tail
    self.err_logs.append(line.rstrip().decode('UTF-8', 'replace')).rstrip()
AttributeError: 'NoneType' object has no attribute 'rstrip'
[gw5] [ 33%] FAILED tests/test_misc.py::test_bitcoin_failure
```

Notice the second call to `.rstrip()`, applied to the return value of
`.append()`.
---
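
Not part of the commit, just to spell the failure mode out: `list.append()`
mutates the list in place and always returns `None`, so any method chained
onto its return value blows up:

```
err_logs = []
result = err_logs.append("some stderr line")
print(result)  # None: append() mutates the list in place and returns nothing
# Chaining on it fails:
# result.rstrip()  ->  AttributeError: 'NoneType' object has no attribute 'rstrip'
```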
 contrib/pyln-testing/pyln/testing/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py
index 623fb5401148..310ebde5e221 100644
--- a/contrib/pyln-testing/pyln/testing/utils.py
+++ b/contrib/pyln-testing/pyln/testing/utils.py
@@ -227,9 +227,13 @@ def tail(self):
 
         if self.proc.stderr:
             for line in iter(self.proc.stderr.readline, ''):
+
                 if line is None or len(line) == 0:
                     break
-                self.err_logs.append(line.rstrip().decode('UTF-8', 'replace')).rstrip()
+
+                line = line.rstrip().decode('UTF-8', 'replace')
+                self.err_logs.append(line)
+
             self.proc.stderr.close()

From cc3a8e93a7d6d870bbd3abcc5c62b4fd2d1eb22f Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Mon, 21 Dec 2020 13:09:49 +0100
Subject: [PATCH 7/8] pytest: Disable test_funding_cancel_race on slow machines

The CI regularly trips up on this very taxing test (100 nodes), so only
run it if we have the required horsepower.
---
 tests/test_connection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_connection.py b/tests/test_connection.py
index 6988cf44c727..3f736eb2b729 100644
--- a/tests/test_connection.py
+++ b/tests/test_connection.py
@@ -11,6 +11,7 @@
     check_coin_moves, first_channel_id, account_balance, basic_fee,
     EXPERIMENTAL_FEATURES
 )
+from pyln.testing.utils import SLOW_MACHINE, VALGRIND
 from bitcoin.core import CMutableTransaction, CMutableTxOut
 
 import binascii
@@ -1042,6 +1043,7 @@ def test_funding_external_wallet_corners(node_factory, bitcoind):
     l1.rpc.close(l2.info['id'])
 
 
+@unittest.skipIf(SLOW_MACHINE and not VALGRIND, "Way too taxing on CI machines")
 def test_funding_cancel_race(node_factory, bitcoind, executor):
     l1 = node_factory.get_node()

From 336e972bf80028d2561f60b9685f888e73f0ae7f Mon Sep 17 00:00:00 2001
From: Christian Decker
Date: Mon, 21 Dec 2020 15:06:07 +0100
Subject: [PATCH 8/8] pytest: Really only check DB statements if the env-var is "1"

---
 tests/fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fixtures.py b/tests/fixtures.py
index 8a7e9c249b5a..c5a9aefadf17 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -19,7 +19,7 @@ def __init__(self, *args, **kwargs):
 
         # If we opted into checking the DB statements we will attach the dblog
         # plugin before starting the node
-        check_dblog = os.environ.get("TEST_CHECK_DBSTMTS", None) is not None
+        check_dblog = os.environ.get("TEST_CHECK_DBSTMTS", None) == "1"
         db = os.environ.get("TEST_DB_PROVIDER", "sqlite3")
         if db == 'sqlite3' and check_dblog:
             dblog = os.path.join(os.path.dirname(__file__), 'plugins', 'dblog.py')
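
To illustrate the behavioural change of this last patch (a standalone
sketch, not part of the diff): the old `is not None` check enabled the
dblog plugin as soon as the variable was set at all, even to `0` or an
empty string, whereas the new comparison only accepts the literal value `1`:

```
import os

os.environ["TEST_CHECK_DBSTMTS"] = "0"

old_check = os.environ.get("TEST_CHECK_DBSTMTS", None) is not None
new_check = os.environ.get("TEST_CHECK_DBSTMTS", None) == "1"

print(old_check)  # True  -> dblog plugin would have been attached anyway
print(new_check)  # False -> only TEST_CHECK_DBSTMTS=1 enables it now
```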