aptos-labs · Markuze · Sep 30, 2022 · Sep 30, 2022
@@ -1,51 +1,165 @@
 global
     log stdout len 10240 format raw local0
-    maxconn 500000
-    nbthread 16
+
+    # Config manual: https://cbonte.github.io/haproxy-dconv/2.5/configuration.html
+    # magic values : terraform/helm/aptos-node/values.yaml
+
+    # Sizing: 256 connections/sec * upgrade_to (30 sec) = 7680, rounded up to 8192
+    maxconn 8192
+    # maxconnrate limits the whole HAProxy process, impacting both the validator frontends and all other frontends, so it is left disabled
+    # maxconnrate 128
+    nbthread 4
+
+    # 4MB client-facing buffer (only rcvbuf is set here, not sndbuf) -- sized for 100Mb/s at 300 ms latency (e.g., US-Asia)
+    tune.rcvbuf.client {{ $.Values.haproxy.limits.validator.tcpBufSize }}
+
     user nobody
 
+## TCP port defaults
 defaults
     log global
-    option tcplog
-    maxconn 500000
-    timeout queue 1s
-    timeout connect 10s
-    timeout server 60s
-    timeout client 60s
-    timeout client-fin 5s
-
-frontend validator
+    mode tcp
+    #option tcplog
+    option dontlog-normal
+    log-format "%ci:%cp - %sp[%rt] [%t] %ft %Tw/%Tc/%Tt %B [%ts] %ac/%fc/%bc/%sc/%rc %sq/%bq"
+    maxconn 8192		#Validator network mesh + FN x2
+    retries 3
+    timeout queue 5s  #limits num of concurrent connections. Not clear if t/o connect is needed. #https://www.papertrail.com/solution/tips/haproxy-logging-how-to-tune-timeouts-for-performance/
+    timeout connect 5s
+    # enough for 1 successful + 5 unsuccessful heartbeats (10 sec interval) + 20 sec timeout
+    timeout server 80s
+    timeout client 80s
+
+    timeout client-fin 3s #How long to hold an interrupted client connection.
+    timeout server-fin 1s
+
+frontend fe-{{ include "aptos-validator.fullname" $ }}-validator
     bind :6180
-    default_backend validator
+    default_backend {{ include "aptos-validator.fullname" $ }}-validator
 
     # Deny requests from blocked IPs
-    tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    tcp-request connection silent-drop if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+
+    acl ip_high_conn_rate sc0_conn_rate gt {{ $.Values.haproxy.limits.validator.connectionsPerIPPerMin }}
+
+    stick-table type ip size 10m expire 30m store gpc0,gpc1,conn_rate(1m),bytes_out_rate(10s),bytes_out_cnt	##about 500MB of memory
+    tcp-request connection track-sc0 src 						   #update table with src ip as key, store in sc0
+
+    # We count the rate limit manually -- more CPU intensive, but it lets whitelisted IPs in and admits up to rateLimitSession connections from non-blacklisted IPs.
+    tcp-request connection track-sc1 int(1) table CONN_RATE
+
+    #tcp-request connection sc-set-gpt0(0) int(...) if ip_high_conn_rate is better but dies with:
+    #parsing [/usr/local/etc/haproxy/haproxy.cfg:53] : internal error, unexpected rule->from=0, please report this bug!
+    #<1> Mark Blacklist
+    tcp-request connection sc-inc-gpc0(0) if ip_high_conn_rate
+
+    # A blacklisted connection is silently dropped, so there is no reason to count it toward rateLimitSession
+    tcp-request connection sc-inc-gpc1(1) unless { sc0_get_gpc0() ge 1 }
+
+    # An IP that was blacklisted due to too many unsuccessful TCP attempts
+    #-1- Enforce Blacklist
+    tcp-request connection silent-drop if { sc0_get_gpc0() ge 1 }
 
-    # Limit to N TCP connections per minute per source IP
-    stick-table type ip size 500k expire 1m store gpc0_rate(1m)
-    tcp-request connection track-sc0 src
-    # TODO: Reject at content phase for now so we get logs, but this should be
-    # done at connection phase for higher efficiency
-    tcp-request content reject if { sc_gpc0_rate(0) ge {{ $.Values.haproxy.limits.validator.connectionsPerIPPerMin }} }
-    tcp-request content sc-inc-gpc0(0) unless { nbsrv(validator) eq 0 }
+    # An IP that had a successful connection.
+    #-2- Allow Whitelist
+    tcp-request connection accept if { sc0_get_gpc1() ge 1 }
 
-backend validator
-    default-server maxconn 1024 {{ if $.Values.haproxy.config.send_proxy_protocol }}send-proxy-v2{{ end }}
+    #-3- Enforce RateLimit
+    tcp-request connection reject if { sc1_gpc1_rate(CONN_RATE) gt  {{ $.Values.haproxy.limits.validator.rateLimitSession }} }
+
+    # This is a successful connection, i.e., more than 4 KB was sent to this IP (sc0_kbytes_out gt 4) before its table entry expired (30 min)
+    #tcp-request session sc-set-gpt0(0) int(...)  if { sc0_kbytes_out gt 16 }
+    #<2> Mark Whitelist
+    tcp-request session sc-inc-gpc1(0) if { sc0_kbytes_out gt 4 }
+
+    # -4- Break a long high rate connection
+    tcp-request session reject if { sc0_bytes_out_rate gt  {{ $.Values.haproxy.limits.validator.maxBytesOutRate10sec }} }
+
+backend {{ include "aptos-validator.fullname" $ }}-validator
+    default-server maxconn 8192
     server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator:6180
 
+frontend fe-{{ include "aptos-validator.fullname" $ }}-validator-fn
+    bind :6181
+    default_backend {{ include "aptos-validator.fullname" $ }}-validator-fn
+
+    # Deny requests from blocked IPs
+    tcp-request connection silent-drop if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+
+    acl ip_high_conn_rate sc0_conn_rate gt {{ $.Values.haproxy.limits.validator.connectionsPerIPPerMin }}
+
+    stick-table type ip size 10m expire 30m store gpc0,gpc1,conn_rate(1m),bytes_out_rate(10s),bytes_out_cnt	##about 500MB of memory
+    tcp-request connection track-sc0 src 						   #update table with src ip as key, store in sc0
+
+    # We count the rate limit manually -- more CPU intensive, but it lets whitelisted IPs in and admits up to rateLimitSession connections from non-blacklisted IPs.
+    tcp-request connection track-sc1 int(1) table CONN_RATE
+
+    #tcp-request connection sc-set-gpt0(0) int(...) if ip_high_conn_rate is better but dies with:
+    #parsing [/usr/local/etc/haproxy/haproxy.cfg:53] : internal error, unexpected rule->from=0, please report this bug!
+    #<1> Mark Blacklist
+    tcp-request connection sc-inc-gpc0(0) if ip_high_conn_rate
+
+    # A blacklisted connection is silently dropped, so there is no reason to count it toward rateLimitSession
+    tcp-request connection sc-inc-gpc1(1) unless { sc0_get_gpc0() ge 1 }
+
+    # An IP that was blacklisted due to too many unsuccessful TCP attempts
+    #-1- Enforce Blacklist
+    tcp-request connection silent-drop if { sc0_get_gpc0() ge 1 }
+
+    # An IP that had a successful connection.
+    #-2- Allow Whitelist
+    tcp-request connection accept if { sc0_get_gpc1() ge 1 }
+
+    #-3- Enforce RateLimit
+    tcp-request connection reject if { sc1_gpc1_rate(CONN_RATE) gt  {{ $.Values.haproxy.limits.validator.rateLimitSession }} }
+
+    # This is a successful connection, i.e., more than 4 KB was sent to this IP (sc0_kbytes_out gt 4) before its table entry expired (30 min)
+    #tcp-request session sc-set-gpt0(0) int(...)  if { sc0_kbytes_out gt 16 }
+    #<2> Mark Whitelist
+    tcp-request session sc-inc-gpc1(0) if { sc0_kbytes_out gt 4 }
+
+    # -4- Break a long high rate connection
+    tcp-request session reject if { sc0_bytes_out_rate gt  {{ $.Values.haproxy.limits.validator.maxBytesOutRate10sec }} }
+
+backend {{ include "aptos-validator.fullname" $ }}-validator-fn
+    default-server maxconn 8192
+    server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator:6181
+
+
+# CONN_RATE holds a single entry with key 1; it is used for determining the global connection rate
+backend CONN_RATE
+    stick-table type integer size 1 expire 10m store gpc1,gpc1_rate(1s)
+
+##################  HTTP: metrics & API
+defaults
+	mode http
+        retries 3
+        timeout queue 5s  #limits num of concurrent connections. Not clear if t/o connect is needed. #https://www.papertrail.com/solution/tips/haproxy-logging-how-to-tune-timeouts-for-performance/
+        timeout connect 5s
+        timeout server 60s #what makes sense? for silence between nodes?
+        timeout client 60s
+
+        timeout client-fin 3s #How long to hold an interrupted client connection.
+        timeout server-fin 1s
+
+	timeout http-request 60s #len of http request
+	timeout http-keep-alive 2s
+
+        rate-limit sessions 256
+
 frontend validator-metrics
     mode http
     option httplog
     bind :9102
     default_backend validator-metrics
-    http-request add-header Forwarded "for=%ci"
 
     # Deny requests from blocked IPs
     tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    http-request add-header Forwarded "for=%ci"
 
 backend validator-metrics
     mode http
-    default-server maxconn 1024
+    default-server maxconn 16
     server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator:9101
 
 # Exposes the validator's own REST API
@@ -55,14 +169,14 @@ frontend validator-api
     option httplog
     bind :8180
     default_backend validator-api
-    http-request add-header Forwarded "for=%ci"
 
     # Deny requests from blocked IPs
     tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    http-request add-header Forwarded "for=%ci"
 
 backend validator-api
     mode http
-    default-server maxconn 1024
+    default-server maxconn 16
     server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator:8080
 {{- end }}
 
@@ -87,29 +201,29 @@ frontend {{ $config.name }}-api
     default_backend {{ $config.name }}-api
     # add Forwarded header, which behaves differently than X-Forwarded-For
     # see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Forwarded
-    http-request add-header Forwarded "for=%ci"
 
     # Deny requests from blocked IPs
     tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    http-request add-header Forwarded "for=%ci"
 
 backend {{ $config.name }}-api
     mode http
-    default-server maxconn 1024
+    default-server maxconn 16
     server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-{{ $config.name }} {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-{{ $config.name }}:8080
 
 frontend {{ $config.name }}-metrics
     mode http
     option httplog
     bind :{{ add 9103 $index }}
     default_backend {{ $config.name }}-metrics
-    http-request add-header Forwarded "for=%ci"
 
     # Deny requests from blocked IPs
     tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips }
+    http-request add-header Forwarded "for=%ci"
 
 backend {{ $config.name }}-metrics
     mode http
-    default-server maxconn 1024
+    default-server maxconn 16
     server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-{{ $config.name }} {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-{{ $config.name }}:9101
 
 {{- end }}

diff --git a/testsuite/forge.py b/testsuite/forge.py
@@ -1186,8 +1186,7 @@ def create_forge_command(
             "--duration-secs", forge_runner_duration_secs
         ])
 
-    if forge_num_validators:
-        forge_args.extend(["--num-validators", forge_num_validators])
+    forge_args.extend(["--num-validators", "100"])
     if forge_num_validator_fullnodes:
         forge_args.extend([
             "--num-validator-fullnodes",
@@ -1213,8 +1212,7 @@ def create_forge_command(
         forge_args.append("--reuse")
     if forge_namespace_keep == "true":
         forge_args.append("--keep")
-    if forge_enable_haproxy == "true":
-        forge_args.append("--enable-haproxy")
+    forge_args.append("--enable-haproxy")
 
     if test_args:
         forge_args.extend(test_args)
@@ -1535,7 +1533,7 @@ def test(
         forge_cli_args=forge_cli_args,
         test_args=test_args,
     )
-    
+
     print(f"Using cluster: {forge_cluster_name}")
     temp = context.filesystem.mkstemp()
     forge_cluster = ForgeCluster(forge_cluster_name, temp)