diff --git a/ansible/config_sonic_basedon_testbed.yml b/ansible/config_sonic_basedon_testbed.yml index 0c41728bd9..aafb30ed5a 100644 --- a/ansible/config_sonic_basedon_testbed.yml +++ b/ansible/config_sonic_basedon_testbed.yml @@ -144,6 +144,49 @@ become: true when: stat_result.stat.exists is defined and stat_result.stat.exists + - name: Init account key and proxy + set_fact: + core_key: "" + core_proxy: "" + + - name: read account key + set_fact: + core_key: "{{ corefile_uploader['azure_sonic_core_storage']['account_key'] }}" + when: corefile_uploader['azure_sonic_core_storage']['account_key'] is defined + + - name: read https proxy + set_fact: + core_proxy: "{{ corefile_uploader['env']['https_proxy'] }}" + when: corefile_uploader['env']['https_proxy'] is defined + + - name: Put secret in core_analyzer.rc.json + lineinfile: + name: /etc/sonic/core_analyzer.rc.json + regexp: '(^.*)account_key' + line: '\1account_key": "{{ core_key }}",' + backrefs: yes + become: true + when: core_key != "" + + - name: Put https-proxy in core_analyzer.rc.json + lineinfile: + name: /etc/sonic/core_analyzer.rc.json + regexp: '(^.*)https_proxy' + line: '\1https_proxy": "{{ core_proxy }}"' + backrefs: yes + become: true + when: core_proxy != "" + + - name: enable core uploader service + become: true + command: systemctl enable core_uploader.service + when: core_key != "" + + - name: start core uploader service + become: true + command: systemctl start core_uploader.service + when: core_key != "" + - name: Replace snmp community string lineinfile: name: /etc/sonic/snmp.yml diff --git a/ansible/doc/README.testbed.cEOS.md b/ansible/doc/README.testbed.cEOS.md new file mode 100644 index 0000000000..07fb01a6de --- /dev/null +++ b/ansible/doc/README.testbed.cEOS.md @@ -0,0 +1,131 @@ +# cEOS + +This document discusses how to use cEOS as a DUT neighbor device. + +cEOS is the container-based EOS. All of its software runs inside +the container. 
Compared with vEOS, cEOS has a much smaller memory +footprint. + +Follow the [instructions](README.testbed.VsSetup.md) to set up a cEOS testbed. + +In the example below, there are four cEOS containers. + +``` +lgh@jenkins-worker-15:~$ docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +fe48c207a51c ceosimage:4.23.2F-1 "/sbin/init systemd.…" 8 days ago Up 8 days ceos_vms6-1_VM0103 +52297010e66a ceosimage:4.23.2F-1 "/sbin/init systemd.…" 8 days ago Up 8 days ceos_vms6-1_VM0102 +8dd95269b312 ceosimage:4.23.2F-1 "/sbin/init systemd.…" 8 days ago Up 8 days ceos_vms6-1_VM0101 +3a50dd481bfb ceosimage:4.23.2F-1 "/sbin/init systemd.…" 8 days ago Up 8 days ceos_vms6-1_VM0100 +b91b48145def debian:jessie "bash" 8 days ago Up 8 days net_vms6-1_VM0103 +d1ff26d84249 debian:jessie "bash" 8 days ago Up 8 days net_vms6-1_VM0102 +1489f52b9617 debian:jessie "bash" 8 days ago Up 8 days net_vms6-1_VM0101 +ce1214a008ed debian:jessie "bash" 8 days ago Up 8 days net_vms6-1_VM0100 +``` + +## Resource consumption + +A cEOS container consumes around 1 GB of memory. + +``` +lgh@jenkins-worker-15:~$ docker stats --no-stream +CONTAINER ID NAME CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O PIDS 6 +fe48c207a51c ceos_vms6-1_VM0103 2.04% 970.9MiB / 125.9GiB 0.75% 0B / 0B 365MB / 55.8GB 138 +52297010e66a ceos_vms6-1_VM0102 2.19% 965.4MiB / 125.9GiB 0.75% 0B / 0B 237MB / 55.6GB 139 +8dd95269b312 ceos_vms6-1_VM0101 1.93% 980.9MiB / 125.9GiB 0.76% 0B / 0B 300MB / 55.9GB 138 +3a50dd481bfb ceos_vms6-1_VM0100 2.05% 970.2MiB / 125.9GiB 0.75% 0B / 0B 365MB / 56.1GB 138 +``` + +## Network Setup + +We first create a base container `net_${testbed_name}_${vm_name}`, inject six ethernet ports into the base container, +and then start the cEOS `ceos_${testbed_name}_${vm_name}` container on top of the base container. 
The six ethernet ports +are used for +- 1 management port +- 4 front panel ports to DUT +- 1 backplane port to PTF docker + +``` + +------------+ +----+ + | cEOS Ma0 +--------- VM0100-m ---+ br | + | | +----+ + | | + | | +--------------+ + | Et1 +----------VM0100-t0---+ br-VM0100-0 | + | | +--------------+ + | | + | | +--------------+ + | Et2 +----------VM0100-t1---+ br-VM0100-1 | + | | +--------------+ + | | + | | +--------------+ + | Et3 +----------VM0100-t2---+ br-VM0100-2 | + | | +--------------+ + | | + | | +--------------+ + | Et4 +----------VM0100-t3---+ br-VM0100-3 | + | | +--------------+ + | | + | | +--------------+ + | Et5 +----------VM0100-back--+ br-b-vms6-1 | + | | +--------------+ + +------------+ +``` + +## Configuration + +The `/mnt/flash` in the cEOS container is mounted to `/data/ceos/ceos_${testbed_name}_${vm_name}` on the host. The `/mnt/flash` +contains the configuration file and logs. + +``` +lgh@jenkins-worker-15:~$ ls -l /data/ceos/ceos_vms6-1_VM0100/ +total 40 +-rw-rw-r--+ 1 root root 924 Mar 31 07:35 AsuFastPktTransmit.log +drwxrwxr-x+ 2 root root 4096 Mar 31 03:31 Fossil +-rw-rw-r--+ 1 root root 568 Mar 31 07:35 SsuRestore.log +-rw-rw-r--+ 1 root root 568 Mar 31 07:35 SsuRestoreLegacy.log +drwxr-xr-x+ 4 897 88 4096 Mar 31 07:35 archive +drwxrwx---+ 3 root root 4096 Mar 18 06:12 debug +drwxrwxr-x+ 2 root root 4096 Mar 18 06:12 fastpkttx.backup +-rw-rw-r--+ 1 root root 180 Mar 31 07:35 kickstart-config +drwxrwxr-x+ 3 root root 4096 Apr 8 09:11 persist +-rw-rwxr--+ 1 root root 1915 Mar 18 06:12 startup-config +``` + +## Login + +There are two ways to get into the cEOS container: + +1. 
docker exec +``` +lgh@jenkins-worker-15:~$ docker exec -it ceos_vms6-1_VM0100 Cli +ARISTA01T1>show int status +Port Name Status Vlan Duplex Speed Type Flags Encapsulation +Et1 connected in Po1 full unconf EbraTestPhyPort +Et2 connected 1 full unconf EbraTestPhyPort +Et3 connected 1 full unconf EbraTestPhyPort +Et4 connected 1 full unconf EbraTestPhyPort +Et5 backplane connected routed full unconf EbraTestPhyPort +Ma0 connected routed full 10G 10/100/1000 +Po1 connected routed full unconf N/A + +ARISTA01T1> +``` + +2. ssh +``` +lgh@jenkins-worker-15:~$ ssh admin@10.250.0.51 +Password: +ARISTA01T1>show int status +Port Name Status Vlan Duplex Speed Type Flags Encapsulation +Et1 connected in Po1 full unconf EbraTestPhyPort +Et2 connected 1 full unconf EbraTestPhyPort +Et3 connected 1 full unconf EbraTestPhyPort +Et4 connected 1 full unconf EbraTestPhyPort +Et5 backplane connected routed full unconf EbraTestPhyPort +Ma0 connected routed full 10G 10/100/1000 +Po1 connected routed full unconf N/A + +ARISTA01T1> +``` + diff --git a/ansible/group_vars/all/corefile_uploader.yml b/ansible/group_vars/all/corefile_uploader.yml new file mode 100644 index 0000000000..c2c57b86d5 --- /dev/null +++ b/ansible/group_vars/all/corefile_uploader.yml @@ -0,0 +1,7 @@ +# Configure core file storage secret key and https-proxy as required +# +#corefile_uploader: +# azure_sonic_core_storage: +# account_key: "Your Secret" +# env: +# https_proxy: "http://10.10.10.10:8000" diff --git a/ansible/roles/test/files/ptftests/advanced-reboot.py b/ansible/roles/test/files/ptftests/advanced-reboot.py index 2c6c2e1af9..c63a117d35 100644 --- a/ansible/roles/test/files/ptftests/advanced-reboot.py +++ b/ansible/roles/test/files/ptftests/advanced-reboot.py @@ -57,12 +57,12 @@ import re from collections import defaultdict import json -import paramiko import Queue import pickle from operator import itemgetter import scapy.all as scapyall import itertools +from device_connection import DeviceConnection from 
arista import Arista import sad_path as sp @@ -125,6 +125,7 @@ def __init__(self): self.test_params = testutils.test_params_get() self.check_param('verbose', False, required=False) self.check_param('dut_username', '', required=True) + self.check_param('dut_password', '', required=True) self.check_param('dut_hostname', '', required=True) self.check_param('reboot_limit_in_seconds', 30, required=False) self.check_param('reboot_type', 'fast-reboot', required=False) @@ -217,6 +218,12 @@ def __init__(self): self.allow_vlan_flooding = bool(self.test_params['allow_vlan_flooding']) + self.dut_connection = DeviceConnection( + self.test_params['dut_hostname'], + self.test_params['dut_username'], + password=self.test_params['dut_password'] + ) + return def read_json(self, name): @@ -411,7 +418,7 @@ def get_sad_info(self): def init_sad_oper(self): if self.sad_oper: self.log("Preboot/Inboot Operations:") - self.sad_handle = sp.SadTest(self.sad_oper, self.ssh_targets, self.portchannel_ports, self.vm_dut_map, self.test_params, self.dut_ssh, self.vlan_ports) + self.sad_handle = sp.SadTest(self.sad_oper, self.ssh_targets, self.portchannel_ports, self.vm_dut_map, self.test_params, self.vlan_ports) (self.ssh_targets, self.portchannel_ports, self.neigh_vm, self.vlan_ports), (log_info, fails) = self.sad_handle.setup() self.populate_fail_info(fails) for log in log_info: @@ -480,7 +487,6 @@ def setUp(self): self.reboot_type = self.test_params['reboot_type'] if self.reboot_type not in ['fast-reboot', 'warm-reboot']: raise ValueError('Not supported reboot_type %s' % self.reboot_type) - self.dut_ssh = self.test_params['dut_username'] + '@' + self.test_params['dut_hostname'] self.dut_mac = self.test_params['dut_mac'] # get VM info @@ -509,7 +515,7 @@ def setUp(self): self.from_server_dst_ports = self.portchannel_ports self.log("Test params:") - self.log("DUT ssh: %s" % self.dut_ssh) + self.log("DUT ssh: %s@%s" % (self.test_params['dut_username'], self.test_params['dut_hostname'])) 
self.log("DUT reboot limit in seconds: %s" % self.limit) self.log("DUT mac address: %s" % self.dut_mac) @@ -1004,7 +1010,7 @@ def reboot_dut(self): time.sleep(self.reboot_delay) self.log("Rebooting remote side") - stdout, stderr, return_code = self.cmd(["ssh", "-oStrictHostKeyChecking=no", self.dut_ssh, "sudo " + self.reboot_type]) + stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type) if stdout != []: self.log("stdout from %s: %s" % (self.reboot_type, str(stdout))) if stderr != []: diff --git a/ansible/roles/test/files/ptftests/device_connection.py b/ansible/roles/test/files/ptftests/device_connection.py new file mode 100644 index 0000000000..a29ea493b0 --- /dev/null +++ b/ansible/roles/test/files/ptftests/device_connection.py @@ -0,0 +1,63 @@ +import paramiko +import logging +from paramiko.ssh_exception import BadHostKeyException, AuthenticationException, SSHException + +logger = logging.getLogger(__name__) + +DEFAULT_CMD_EXECUTION_TIMEOUT_SEC = 10 + +class DeviceConnection: + ''' + DeviceConnection uses Paramiko module to connect to devices + + Paramiko module uses fallback mechanism where it would first try to use + ssh key and that fails, it will attempt username/password combination + ''' + def __init__(self, hostname, username, password=None): + ''' + Class constructor + + @param hostname: hostname of device to connect to + @param username: username for device connection + @param password: password for device connection + ''' + self.hostname = hostname + self.username = username + self.password = password + + def execCommand(self, cmd, timeout=DEFAULT_CMD_EXECUTION_TIMEOUT_SEC): + ''' + Executes command on remote device + + @param cmd: command to be run on remote device + @param timeout: timeout for command run session + @return: stdout, stderr, value + stdout is a list of lines of the remote stdout gathered during command execution + stderr is a list of lines of the remote stderr gathered during command execution + 
value: 0 if command execution raised no exception + nonzero if exception is raised + ''' + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + + if isinstance(cmd, list): + cmd = ' '.join(cmd) + + stdOut = stdErr = [] + retValue = 1 + try: + client.connect(self.hostname, username=self.username, password=self.password, allow_agent=False) + si, so, se = client.exec_command(cmd, timeout=timeout) + stdOut = so.readlines() + stdErr = se.readlines() + retValue = 0 + except SSHException as sshException: + logger.error('SSH Command failed with message: %s' % sshException) + except AuthenticationException as authenticationException: + logger.error('SSH Authentiaction failure with message: %s' % authenticationException) + except BadHostKeyException as badHostKeyException: + logger.error('SSH Authentiaction failure with message: %s' % badHostKeyException) + finally: + client.close() + + return stdOut, stdErr, retValue diff --git a/ansible/roles/test/files/ptftests/sad_path.py b/ansible/roles/test/files/ptftests/sad_path.py index 8fcb5b7db5..85e61d20e5 100644 --- a/ansible/roles/test/files/ptftests/sad_path.py +++ b/ansible/roles/test/files/ptftests/sad_path.py @@ -1,25 +1,24 @@ import datetime import ipaddress import re -import subprocess import time from arista import Arista +from device_connection import DeviceConnection class SadTest(object): - def __init__(self, oper_type, vm_list, portchannel_ports, vm_dut_map, test_args, dut_ssh, vlan_ports): + def __init__(self, oper_type, vm_list, portchannel_ports, vm_dut_map, test_args, vlan_ports): self.oper_type = oper_type self.vm_list = vm_list self.portchannel_ports = portchannel_ports self.vm_dut_map = vm_dut_map self.test_args = test_args - self.dut_ssh = dut_ssh self.vlan_ports = vlan_ports self.fails_vm = set() self.fails_dut = set() self.log = [] - self.shandle = SadOper(self.oper_type, self.vm_list, self.portchannel_ports, self.vm_dut_map, self.test_args, self.dut_ssh, 
self.vlan_ports) + self.shandle = SadOper(self.oper_type, self.vm_list, self.portchannel_ports, self.vm_dut_map, self.test_args, self.vlan_ports) def setup(self): self.shandle.sad_setup(is_up=False) @@ -55,6 +54,7 @@ def __init__(self, oper_type, vm_list, portchannel_ports, vm_dut_map, test_args, self.portchannel_ports = portchannel_ports self.vm_dut_map = vm_dut_map self.test_args = test_args + self.dut_connection = DeviceConnection(test_args['dut_hostname'], test_args['dut_username'], password=test_args['dut_password']) self.vlan_ports = vlan_ports self.vlan_if_port = self.test_args['vlan_if_port'] self.neigh_vms = [] @@ -97,16 +97,6 @@ def extract_oper_info(self, oper_type): else: self.oper_type = oper_type - def cmd(self, cmds): - process = subprocess.Popen(cmds, - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = process.communicate() - return_code = process.returncode - - return stdout, stderr, return_code - def select_vm(self): self.vm_list.sort() vm_len = len(self.vm_list) @@ -203,9 +193,8 @@ def retreive_logs(self): class SadOper(SadPath): - def __init__(self, oper_type, vm_list, portchannel_ports, vm_dut_map, test_args, dut_ssh, vlan_ports): + def __init__(self, oper_type, vm_list, portchannel_ports, vm_dut_map, test_args, vlan_ports): super(SadOper, self).__init__(oper_type, vm_list, portchannel_ports, vm_dut_map, test_args, vlan_ports) - self.dut_ssh = dut_ssh self.dut_needed = dict() self.lag_members_down = dict() self.neigh_lag_members_down = dict() @@ -335,7 +324,7 @@ def get_bgp_route_cnt(self, is_up=True, v4=True): else: cmd = 'show ipv6 bgp summary | sed \'1,/Neighbor/d;/^$/,$d\' | sed \'s/\s\s*/ /g\' | cut -d\' \' -f 1,10' - stdout, stderr, return_code = self.cmd(['ssh', '-oStrictHostKeyChecking=no', self.dut_ssh, cmd]) + stdout, stderr, return_code = self.dut_connection.execCommand(cmd) if return_code != 0: self.fails['dut'].add('%s: Failed to retreive BGP route info from DUT' % self.msg_prefix[1 - is_up]) 
self.fails['dut'].add('%s: Return code: %d' % (self.msg_prefix[1 - is_up], return_code)) @@ -345,15 +334,15 @@ def get_bgp_route_cnt(self, is_up=True, v4=True): def build_neigh_rt_map(self, neigh_rt_info): # construct neigh to route cnt map self.neigh_rt_map = dict() - for line in neigh_rt_info.strip().split('\n'): - key, value = line.split(' ') + for line in neigh_rt_info: + key, value = line.strip().split(' ') self.neigh_rt_map.update({key:value}) def verify_route_cnt(self, rt_incr, is_up=True, v4=True): neigh_rt_info, ret = self.get_bgp_route_cnt(is_up=is_up, v4=v4) if not ret: - for line in neigh_rt_info.strip().split('\n'): - neigh_ip, rt_cnt = line.split(' ') + for line in neigh_rt_info: + neigh_ip, rt_cnt = line.strip().split(' ') exp_cnt = int(self.neigh_rt_map[neigh_ip]) + rt_incr if int(rt_cnt) != exp_cnt: self.fails['dut'].add('%s: Route cnt incorrect for neighbor %s Expected: %d Obtained: %d' % (self.msg_prefix[is_up], neigh_ip, exp_cnt, int(rt_cnt))) @@ -386,7 +375,7 @@ def change_vlan_port_state(self, is_up=True): for intf, port in self.down_vlan_info: if not re.match('Ethernet\d+', intf): continue self.log.append('Changing state of %s from DUT side to %s' % (intf, state[is_up])) - stdout, stderr, return_code = self.cmd(['ssh', '-oStrictHostKeyChecking=no', self.dut_ssh, 'sudo config interface %s %s' % (state[is_up], intf)]) + stdout, stderr, return_code = self.dut_connection.execCommand('sudo config interface %s %s' % (state[is_up], intf)) if return_code != 0: self.fails['dut'].add('%s: State change not successful from DUT side for %s' % (self.msg_prefix[1 - is_up], intf)) self.fails['dut'].add('%s: Return code: %d' % (self.msg_prefix[1 - is_up], return_code)) @@ -400,9 +389,9 @@ def verify_vlan_port_state(self, state='down', pre_check=True): # extract the admin status pat = re.compile('(\S+\s+){7}%s' % state) for intf, port in self.down_vlan_info: - stdout, stderr, return_code = self.cmd(['ssh', '-oStrictHostKeyChecking=no', self.dut_ssh, 'show 
interfaces status %s' % intf]) + stdout, stderr, return_code = self.dut_connection.execCommand('show interfaces status %s' % intf) if return_code == 0: - for line in stdout.split('\n'): + for line in stdout: if intf in line: is_match = pat.match(line.strip()) if is_match: @@ -426,7 +415,7 @@ def change_bgp_dut_state(self, is_up=True): continue self.log.append('Changing state of BGP peer %s from DUT side to %s' % (self.neigh_bgps[vm][key], state[is_up])) - stdout, stderr, return_code = self.cmd(['ssh', '-oStrictHostKeyChecking=no', self.dut_ssh, 'sudo config bgp %s neighbor %s' % (state[is_up], self.neigh_bgps[vm][key])]) + stdout, stderr, return_code = self.dut_connection.execCommand('sudo config bgp %s neighbor %s' % (state[is_up], self.neigh_bgps[vm][key])) if return_code != 0: self.fails['dut'].add('State change not successful from DUT side for peer %s' % self.neigh_bgps[vm][key]) self.fails['dut'].add('Return code: %d' % return_code) @@ -442,9 +431,9 @@ def verify_bgp_dut_state(self, state='Idle'): if key not in ['v4', 'v6']: continue self.log.append('Verifying if the DUT side BGP peer %s is %s' % (self.neigh_bgps[vm][key], states)) - stdout, stderr, return_code = self.cmd(['ssh', '-oStrictHostKeyChecking=no', self.dut_ssh, 'show ip bgp neighbor %s' % self.neigh_bgps[vm][key]]) + stdout, stderr, return_code = self.dut_connection.execCommand('show ip bgp neighbor %s' % self.neigh_bgps[vm][key]) if return_code == 0: - for line in stdout.split('\n'): + for line in stdout: if 'BGP state' in line: curr_state = re.findall('BGP state = (\w+)', line)[0] bgp_state[vm][key] = (curr_state in states) @@ -507,7 +496,7 @@ def change_dut_lag_state(self, is_up=True): for intf in down_intfs: if not re.match('(PortChannel|Ethernet)\d+', intf): continue self.log.append('Changing state of %s from DUT side to %s' % (intf, state[is_up])) - stdout, stderr, return_code = self.cmd(['ssh', '-oStrictHostKeyChecking=no', self.dut_ssh, 'sudo config interface %s %s' % (state[is_up], intf)]) 
+ stdout, stderr, return_code = self.dut_connection.execCommand('sudo config interface %s %s' % (state[is_up], intf)) if return_code != 0: self.fails['dut'].add('%s: State change not successful from DUT side for %s' % (self.msg_prefix[1 - is_up], intf)) self.fails['dut'].add('%s: Return code: %d' % (self.msg_prefix[1 - is_up], return_code)) @@ -549,9 +538,9 @@ def verify_dut_lag_state(self, pre_check=True): po_list.append(po_name) self.po_neigh_map[po_name] = self.neigh_names[vm] - stdout, stderr, return_code = self.cmd(['ssh', '-oStrictHostKeyChecking=no', self.dut_ssh, 'show interfaces portchannel']) + stdout, stderr, return_code = self.dut_connection.execCommand('show interfaces portchannel') if return_code == 0: - for line in stdout.split('\n'): + for line in stdout: for po_name in po_list: if po_name in line: is_match = pat.match(line) diff --git a/ansible/roles/test/tasks/crm/crm_test_fdb_entry.yml b/ansible/roles/test/tasks/crm/crm_test_fdb_entry.yml index ff3c91b4df..cb0e394b6e 100644 --- a/ansible/roles/test/tasks/crm/crm_test_fdb_entry.yml +++ b/ansible/roles/test/tasks/crm/crm_test_fdb_entry.yml @@ -1,5 +1,11 @@ - block: + - name: Stop arp_update + command: docker exec -i swss supervisorctl stop arp_update + + - name: Remove FDB entry + command: fdbclear + - name: Get "crm_stats_fdb_entry" used and available counter value command: redis-cli --raw -n 2 HMGET CRM:STATS crm_stats_fdb_entry_used crm_stats_fdb_entry_available register: out @@ -84,3 +90,6 @@ - name: Remove FDB JSON config from SWSS container command: docker exec -i swss rm /fdb.json + + - name: Restart arp_update + command: docker exec -i swss supervisorctl start arp_update diff --git a/ansible/roles/test/tasks/ptf_runner_reboot.yml b/ansible/roles/test/tasks/ptf_runner_reboot.yml index 3111e8cbed..ba809ad1ee 100644 --- a/ansible/roles/test/tasks/ptf_runner_reboot.yml +++ b/ansible/roles/test/tasks/ptf_runner_reboot.yml @@ -51,7 +51,8 @@ ptf_qlen: 1000 ptf_test_params: - verbose=False - - 
dut_username=\"{{ ansible_ssh_user }}\" + - dut_username=\"{{ sonicadmin_user }}\" + - dut_password=\"{{ sonicadmin_password }}\" - dut_hostname=\"{{ ansible_host }}\" - reboot_limit_in_seconds={{ reboot_limit }} - reboot_type=\"{{ reboot_type }}\" diff --git a/ansible/roles/test/tasks/qos_sai.yml b/ansible/roles/test/tasks/qos_sai.yml index 7b6d6ef550..4c73d4a24e 100644 --- a/ansible/roles/test/tasks/qos_sai.yml +++ b/ansible/roles/test/tasks/qos_sai.yml @@ -37,14 +37,18 @@ - lldpd - lldp-syncd - - name: Disable bgpd + - name: Ensure BGP Daemon stopped become: yes - lineinfile: dest=/etc/quagga/daemons - regexp=^bgpd=.*$ - line='bgpd=no' - notify: - - Restart Quagga Daemon + supervisorctl: state=stopped name=bgpd delegate_to: "{{ ansible_host }}_bgp" + + - name: Add iptables rule to drop BGP SYN Packet from peer so that we do not ACK back + shell: "iptables -A INPUT -j DROP -p tcp --destination-port bgp" + become: true + + - name: Add ip6tables rule to drop BGP SYN Packet from peer so that we do not ACK back + shell: "ip6tables -A INPUT -j DROP -p tcp --destination-port bgp" + become: true - meta: flush_handlers @@ -467,14 +471,20 @@ - lldpd - lldp-syncd + - name: Remove iptables rule to drop BGP SYN Packet from Peer + shell: "iptables -D INPUT -j DROP -p tcp --destination-port bgp" + become: true + + - name: Remove ip6tables rule to drop BGP SYN Packet from Peer + shell: "ip6tables -D INPUT -j DROP -p tcp --destination-port bgp" + become: true + - name: Enable bgpd become: yes - lineinfile: dest=/etc/quagga/daemons - regexp=^bgpd=.*$ - line='bgpd=yes' + supervisorctl: state=started name=bgpd + delegate_to: "{{ ansible_host }}_bgp" notify: - Restart Quagga Daemon - delegate_to: "{{ ansible_host }}_bgp" - name: Restore original watermark polling status shell: counterpoll watermark {{watermark_status.stdout}} diff --git a/ansible/roles/test/tasks/vxlan-decap.yml b/ansible/roles/test/tasks/vxlan-decap.yml index 937182aa51..532dd23cab 100644 --- 
a/ansible/roles/test/tasks/vxlan-decap.yml +++ b/ansible/roles/test/tasks/vxlan-decap.yml @@ -1,115 +1,4 @@ -# example - -- block: - - fail: msg="Please set ptf_host variable" - when: ptf_host is not defined - - - name: Remove existing ip from ptf host - script: roles/test/files/helpers/remove_ip.sh - delegate_to: "{{ ptf_host }}" - - - name: Make all mac addresses in ptf unique - should be done in vm_set - script: roles/test/files/helpers/change_mac.sh - delegate_to: "{{ ptf_host }}" - - - name: Copy tests to the PTF container - copy: src=roles/test/files/ptftests dest=/root - delegate_to: "{{ ptf_host }}" - - - name: Copy arp responder to the PTF container - copy: src=roles/test/files/helpers/arp_responder.py dest=/opt - delegate_to: "{{ ptf_host }}" - - - name: Copy arp responder supervisor configuration to the PTF container - template: src=arp_responder.conf.j2 dest=/etc/supervisor/conf.d/arp_responder.conf - vars: - - arp_responder_args: '--conf /tmp/vxlan_arpresponder.conf' - delegate_to: "{{ ptf_host }}" - - - name: Update supervisor configuration - include_tasks: "roles/test/tasks/common_tasks/update_supervisor.yml" - vars: - supervisor_host: "{{ ptf_host }}" - - - name: Restart DUT. Wait 240 seconds after SONiC started ssh - include_tasks: reboot.yml - vars: - ready_timeout: 240 - - - name: Render DUT parameters to json file for the test - template: src=vxlan_decap.json.j2 dest=/tmp/vxlan_decap.json - delegate_to: "{{ ptf_host }}" - - - name: Render DUT vxlan configuration. Tunnel - template: src=vxlan_db.tunnel.json.j2 dest=/tmp/vxlan_db.tunnel.json - - - name: Render DUT vxlan configuration. 
Tunnel Maps - template: src=vxlan_db.maps.json.j2 dest=/tmp/vxlan_db.maps.{{ item }}.json - with_items: "{{ minigraph_vlans }}" - - - set_fact: - send_packet_count: 10 - when: send_packet_count is not defined - - - include_tasks: ptf_runner.yml - vars: - ptf_test_name: Vxlan decap test - No vxlan configuration - ptf_test_dir: ptftests - ptf_test_path: vxlan-decap.Vxlan - ptf_platform: remote - ptf_platform_dir: ptftests - ptf_qlen: 1000 - ptf_test_params: - - vxlan_enabled=False - - config_file='/tmp/vxlan_decap.json' - - count={{ send_packet_count }} - - - name: Configure vxlan decap tunnel - shell: sonic-cfggen -j /tmp/vxlan_db.tunnel.json --write-to-db - - - name: Configure vxlan decap tunnel maps - shell: sonic-cfggen -j /tmp/vxlan_db.maps.{{ item }}.json --write-to-db - with_items: "{{ minigraph_vlans }}" - - - include_tasks: ptf_runner.yml - vars: - ptf_test_name: Vxlan decap test - vxlan configuration applied - ptf_test_dir: ptftests - ptf_test_path: vxlan-decap.Vxlan - ptf_platform: remote - ptf_platform_dir: ptftests - ptf_qlen: 1000 - ptf_test_params: - - vxlan_enabled=True - - config_file='/tmp/vxlan_decap.json' - - count={{ send_packet_count }} - - - name: Remove vxlan tunnel maps configuration - shell: docker exec -i database redis-cli -n 4 -c DEL "VXLAN_TUNNEL_MAP|tunnelVxlan|map{{ item }}" - with_items: "{{ minigraph_vlans }}" - - - name: Remove vxlan tunnel configuration - shell: docker exec -i database redis-cli -n 4 -c DEL "VXLAN_TUNNEL|tunnelVxlan" - - - include_tasks: ptf_runner.yml - vars: - ptf_test_name: Vxlan decap test - vxlan configuration removed - ptf_test_dir: ptftests - ptf_test_path: vxlan-decap.Vxlan - ptf_platform: remote - ptf_platform_dir: ptftests - ptf_qlen: 1000 - ptf_test_params: - - vxlan_enabled=False - - config_file='/tmp/vxlan_decap.json' - - count={{ send_packet_count }} - -- block: - - name: Remove vxlan tunnel maps configuration - shell: docker exec -i database redis-cli -n 4 -c DEL "VXLAN_TUNNEL_MAP|tunnelVxlan|map{{ 
item }}" - with_items: "{{ minigraph_vlans }}" - - - name: Remove vxlan tunnel configuration - shell: docker exec -i database redis-cli -n 4 -c DEL "VXLAN_TUNNEL|tunnelVxlan" - tags: - - always +- name: run test + include_tasks: roles/test/tasks/pytest_runner.yml + vars: + test_node: test_vxlan_decap.py diff --git a/ansible/roles/test/templates/vxlan_db.maps.json.j2 b/ansible/roles/test/templates/vxlan_db.maps.json.j2 deleted file mode 100644 index 1be0cf7c6e..0000000000 --- a/ansible/roles/test/templates/vxlan_db.maps.json.j2 +++ /dev/null @@ -1,9 +0,0 @@ -{ - "VXLAN_TUNNEL_MAP": { - "tunnelVxlan|map{{ item }}": { - "vni": "{{ item | replace("Vlan", "") | int + 336 }}", - "vlan": "{{ item }}" - } - } -} - diff --git a/ansible/roles/test/templates/vxlan_db.tunnel.json.j2 b/ansible/roles/test/templates/vxlan_db.tunnel.json.j2 deleted file mode 100644 index f4671fe6e2..0000000000 --- a/ansible/roles/test/templates/vxlan_db.tunnel.json.j2 +++ /dev/null @@ -1,8 +0,0 @@ -{ - "VXLAN_TUNNEL": { - "tunnelVxlan": { - "src_ip": "{{ minigraph_lo_interfaces[0]['addr'] }}", - "dst_ip": "8.8.8.8" - } - } -} diff --git a/ansible/roles/test/templates/vxlan_decap.json.j2 b/ansible/roles/test/templates/vxlan_decap.json.j2 deleted file mode 100644 index ab68c860c3..0000000000 --- a/ansible/roles/test/templates/vxlan_decap.json.j2 +++ /dev/null @@ -1,9 +0,0 @@ -{ - "minigraph_port_indices": {{ minigraph_port_indices | to_nice_json }}, - "minigraph_portchannel_interfaces": {{ minigraph_portchannel_interfaces | to_nice_json }}, - "minigraph_portchannels": {{ minigraph_portchannels | to_nice_json }}, - "minigraph_lo_interfaces": {{ minigraph_lo_interfaces | to_nice_json }}, - "minigraph_vlans": {{ minigraph_vlans | to_nice_json }}, - "minigraph_vlan_interfaces": {{ minigraph_vlan_interfaces | to_nice_json }}, - "dut_mac": {{ ansible_Ethernet0['macaddress'] | to_nice_json }} -} diff --git a/tests/common/devices.py b/tests/common/devices.py index 44ccdf0e55..13066fb4be 100644 --- 
a/tests/common/devices.py +++ b/tests/common/devices.py @@ -323,7 +323,7 @@ def get_networking_uptime(self): return self.get_now_time() - datetime.strptime(start_time["ExecMainStartTimestamp"], "%a %Y-%m-%d %H:%M:%S UTC") except Exception as e: - self.logger.error("Exception raised while getting networking restart time: %s" % repr(e)) + logging.error("Exception raised while getting networking restart time: %s" % repr(e)) return None def get_image_info(self): @@ -348,7 +348,7 @@ def get_image_info(self): return ret def get_asic_type(self): - return dut.facts["asic_type"] + return self.facts["asic_type"] class EosHost(AnsibleHostBase): @@ -364,5 +364,86 @@ def __init__(self, ansible_adhoc, hostname, user, passwd, gather_facts=False): 'ansible_network_os':'eos', \ 'ansible_user': user, \ 'ansible_password': passwd, \ + 'ansible_ssh_user': user, \ + 'ansible_ssh_pass': passwd, \ 'ansible_become_method': 'enable' } self.host.options['variable_manager'].extra_vars.update(evars) + + def shutdown(self, interface_name): + out = self.host.eos_config( + lines=['shutdown'], + parents='interface %s' % interface_name) + if not self.check_intf_link_state(interface_name): + logging.info('Shut interface [%s]' % interface_name) + return out + raise RunAnsibleModuleFail("The interface state is Up but expect Down, detail output: %s" % out[self.hostname]) + + def no_shutdown(self, interface_name): + out = self.host.eos_config( + lines=['no shutdown'], + parents='interface %s' % interface_name) + if self.check_intf_link_state(interface_name): + logging.info('No shut interface [%s]' % interface_name) + return out + raise RunAnsibleModuleFail("The interface state is Down but expect Up, detail output: %s" % out[self.hostname]) + + def check_intf_link_state(self, interface_name): + show_int_result = self.host.eos_command( + commands=['show interface %s' % interface_name])[self.hostname] + return 'Up' in show_int_result['stdout_lines'][0] + + def command(self, cmd): + out = 
self.host.eos_command(commands=[cmd]) + return out + +class FanoutHost(): + """ + @summary: Class for Fanout switch + + For running ansible module on the Fanout switch + """ + + def __init__(self, ansible_adhoc, os, hostname, device_type, user, passwd): + self.hostname = hostname + self.type = device_type + self.host_to_fanout_port_map = {} + self.fanout_to_host_port_map = {} + if os == 'sonic': + self.os = os + self.host = SonicHost(ansible_adhoc, hostname) + else: + # Use eos host if the os type is unknown + self.os = 'eos' + self.host = EosHost(ansible_adhoc, hostname, user, passwd) + + def get_fanout_os(self): + return self.os + + def get_fanout_type(self): + return self.type + + def shutdown(self, interface_name): + return self.host.shutdown(interface_name)[self.hostname] + + def no_shutdown(self, interface_name): + return self.host.no_shutdown(interface_name)[self.hostname] + + def command(self, cmd): + return self.host.command(cmd)[self.hostname] + + def __str__(self): + return "{ os: '%s', hostname: '%s', device_type: '%s' }" % (self.os, self.hostname, self.type) + + def __repr__(self): + return self.__str__() + + def add_port_map(self, host_port, fanout_port): + """ + Fanout switch is build from the connection graph of the + DUT. So each fanout switch instance is relevant to the + DUT instance in the test. As result the port mapping is + unique from the DUT perspective. 
However, this function + need update when supporting multiple DUT + """ + self.host_to_fanout_port_map[host_port] = fanout_port + self.fanout_to_host_port_map[fanout_port] = host_port diff --git a/tests/common/fixtures/advanced_reboot.py b/tests/common/fixtures/advanced_reboot.py index 874bb8499b..af78bfd481 100644 --- a/tests/common/fixtures/advanced_reboot.py +++ b/tests/common/fixtures/advanced_reboot.py @@ -65,6 +65,7 @@ def __extractTestParam(self): self.newSonicImage = self.request.config.getoption("--new_sonic_image") self.cleanupOldSonicImages = self.request.config.getoption("--cleanup_old_sonic_images") self.readyTimeout = self.request.config.getoption("--ready_timeout") + self.replaceFastRebootScript = self.request.config.getoption("--replace_fast_reboot_script") def getHostMaxLen(self): ''' @@ -115,9 +116,14 @@ def __buildTestbedData(self): self.rebootData['dut_hostname'] = self.mgFacts['minigraph_mgmt_interface']['addr'] self.rebootData['dut_mac'] = hostFacts['ansible_Ethernet0']['macaddress'] - self.rebootData['dut_username'] = hostFacts['ansible_env']['SUDO_USER'] self.rebootData['vlan_ip_range'] = self.mgFacts['minigraph_vlan_interfaces'][0]['subnet'] self.rebootData['dut_vlan_ip'] = self.mgFacts['minigraph_vlan_interfaces'][0]['addr'] + + invetory = self.duthost.host.options['inventory'].split('/')[-1] + secrets = self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname]['secret_group_vars'] + self.rebootData['dut_username'] = secrets[invetory]['sonicadmin_user'] + self.rebootData['dut_password'] = secrets[invetory]['sonicadmin_password'] + self.rebootData['default_ip_range'] = str( ipaddress.ip_interface(self.mgFacts['minigraph_vlan_interfaces'][0]['addr'] + '/16').network ) @@ -223,13 +229,24 @@ def __prepareTestbedSshKeys(self, dutUsername, dutIp): self.ptfhost.shell('ssh-keygen -f /root/.ssh/known_hosts -R ' + dutIp) logger.info('Generate public key for ptf host') - self.ptfhost.shell('ssh-keygen -b 2048 -t rsa -f 
/root/.ssh/id_rsa -q -N ""') - result = self.ptfhost.shell('cat /root/.ssh/id_rsa.pub') + self.ptfhost.file(path='/root/.ssh/', mode='u+rwx,g-rwx,o-rwx', state='directory') + result = self.ptfhost.openssh_keypair( + path='/root/.ssh/id_rsa', + size=2048, + force=True, + type='rsa', + mode='u=rw,g=,o=' + ) + # There is an error with id_rsa.pub access permissions documented in: + # https://github.com/ansible/ansible/issues/61411 + # @TODO: remove the following line when upgrading to Ansible 2.9x + self.ptfhost.file(path='/root/.ssh/id_rsa.pub', mode='u=rw,g=,o=') + cmd = ''' mkdir -p /home/{0}/.ssh && echo "{1}" >> /home/{0}/.ssh/authorized_keys && chown -R {0}:{0} /home/{0}/.ssh/ - '''.format(dutUsername, result['stdout']) + '''.format(dutUsername, result['public_key']) self.duthost.shell(cmd) def __handleMellanoxDut(self): @@ -307,6 +324,11 @@ def __setupTestbed(self): logger.info('Copy ARP responder to the PTF container {}'.format(self.ptfhost.hostname)) self.ptfhost.copy(src='scripts/arp_responder.py', dest='/opt') + # Replace fast-reboot script + if self.replaceFastRebootScript: + logger.info('Replace fast-reboot script on DUT {}'.format(self.duthost.hostname)) + self.duthost.copy(src='scripts/fast-reboot', dest='/usr/bin/') + def __clearArpAndFdbTables(self): ''' Clears ARP and FDB entries @@ -408,8 +430,12 @@ def __runPtfRunner(self, rebootOper=None): ''' logger.info("Running PTF runner on PTF host: {0}".format(self.ptfhost)) - prebootOper = rebootOper if rebootOper is not None and 'routing' in rebootOper else None - inbootOper = rebootOper if rebootOper is not None and 'routing' not in rebootOper else None + # Non-routing neighbor/dut lag/bgp, vlan port up/down operation is performed before dut reboot process + # lack of routing indicates it is preboot operation + prebootOper = rebootOper if rebootOper is not None and 'routing' not in rebootOper else None + # Routing add/remove is performed during dut reboot process + # presence of routing in reboot operation 
indicates it is during reboot operation (inboot) + inbootOper = rebootOper if rebootOper is not None and 'routing' in rebootOper else None self.__updateAndRestartArpResponder(rebootOper) @@ -423,6 +449,7 @@ def __runPtfRunner(self, rebootOper=None): platform="remote", params={ "dut_username" : self.rebootData['dut_username'], + "dut_password" : self.rebootData['dut_password'], "dut_hostname" : self.rebootData['dut_hostname'], "reboot_limit_in_seconds" : self.rebootLimit, "reboot_type" :self.rebootType, @@ -452,15 +479,17 @@ def __restorePrevImage(self): ''' Resotre previous image and reboot DUT ''' - logger.info('Restore current image') - self.duthost.shell('sonic_installer set_default {0}'.format(self.currentImage)) - - rebootDut( - self.duthost, - self.localhost, - reboot_type=self.rebootType.replace('-reboot', ''), - wait = 180 + self.readyTimeout - ) + currentImage = self.duthost.shell('sonic_installer list | grep Current | cut -f2 -d " "')['stdout'] + if currentImage != self.currentImage: + logger.info('Restore current image') + self.duthost.shell('sonic_installer set_default {0}'.format(self.currentImage)) + + rebootDut( + self.duthost, + self.localhost, + reboot_type=self.rebootType.replace('-reboot', ''), + wait = self.readyTimeout + ) def tearDown(self): ''' diff --git a/tests/common/plugins/psu_controller/snmp_psu_controllers.py b/tests/common/plugins/psu_controller/snmp_psu_controllers.py index 30b7d427bd..fc926d380d 100644 --- a/tests/common/plugins/psu_controller/snmp_psu_controllers.py +++ b/tests/common/plugins/psu_controller/snmp_psu_controllers.py @@ -6,43 +6,83 @@ import logging from controller_base import PsuControllerBase -from controller_base import run_local_cmd +from pysnmp.proto import rfc1902 +from pysnmp.entity.rfc3413.oneliner import cmdgen -def get_psu_controller_type(psu_controller_host): +class snmpPsuController(PsuControllerBase): """ - @summary: Use SNMP to get the type of PSU controller host - @param psu_controller_host: IP address 
of PSU controller host - @return: Returns type string of the specified PSU controller host - """ - result = None - cmd = "snmpget -v 1 -c public -Ofenqv %s .1.3.6.1.2.1.1.1.0" % psu_controller_host - try: - stdout = run_local_cmd(cmd) - - lines = stdout.splitlines() - if len(lines) > 0: - result = lines[0].strip() - result = result.replace('"', '') - except Exception as e: - logging.debug("Failed to get psu controller type, exception: " + repr(e)) - - return result - + PSU Controller class for SNMP conrolled PSUs - 'Sentry Switched CDU' and 'APC Web/SNMP Management Card' -class SentrySwitchedCDU(PsuControllerBase): + This class implements the interface defined in PsuControllerBase class for SNMP conrtolled PDU type + 'Sentry Switched CDU' and 'APC Web/SNMP Management Card' """ - PSU Controller class for 'Sentry Switched CDU' - This class implements the interface defined in PsuControllerBase class for PDU type 'Sentry Switched CDU' - """ - PORT_NAME_BASE_OID = ".1.3.6.1.4.1.1718.3.2.3.1.3.1" - PORT_STATUS_BASE_OID = ".1.3.6.1.4.1.1718.3.2.3.1.5.1" - PORT_CONTROL_BASE_OID = ".1.3.6.1.4.1.1718.3.2.3.1.11.1" - STATUS_ON = "1" - STATUS_OFF = "0" - CONTROL_ON = "1" - CONTROL_OFF = "2" + def get_psu_controller_type(self): + """ + @summary: Use SNMP to get the type of PSU controller host + @param psu_controller_host: IP address of PSU controller host + @return: Returns type string of the specified PSU controller host + """ + pSYSDESCR = ".1.3.6.1.2.1.1.1.0" + SYSDESCR = "1.3.6.1.2.1.1.1.0" + psu = None + cmdGen = cmdgen.CommandGenerator() + snmp_auth = cmdgen.CommunityData('public') + errorIndication, errorStatus, errorIndex, varBinds = cmdGen.getCmd( + snmp_auth, + cmdgen.UdpTransportTarget((self.controller, 161), timeout=5.0), + cmdgen.MibVariable(pSYSDESCR,), + ) + if errorIndication: + logging.info("Failed to get psu controller type, exception: " + str(errorIndication)) + for oid, val in varBinds: + current_oid = oid.prettyPrint() + current_val = val.prettyPrint() + if 
current_oid == SYSDESCR: + psu = current_val + if psu is None: + self.psuType = None + return + if 'Sentry Switched CDU' in psu: + self.psuType = "SENTRY" + if 'APC Web/SNMP Management Card' in psu: + self.psuType = "APC" + return + + def psuCntrlOid(self): + """ + Define Oids based on the PSU Type + """ + # MIB OIDs for 'APC Web/SNMP Management PSU' + APC_PORT_NAME_BASE_OID = "1.3.6.1.4.1.318.1.1.4.4.2.1.4" + APC_PORT_STATUS_BASE_OID = "1.3.6.1.4.1.318.1.1.12.3.5.1.1.4" + APC_PORT_CONTROL_BASE_OID = "1.3.6.1.4.1.318.1.1.12.3.3.1.1.4" + # MIB OID for 'Sentry Switched CDU' + SENTRY_PORT_NAME_BASE_OID = "1.3.6.1.4.1.1718.3.2.3.1.3.1" + SENTRY_PORT_STATUS_BASE_OID = "1.3.6.1.4.1.1718.3.2.3.1.5.1" + SENTRY_PORT_CONTROL_BASE_OID = "1.3.6.1.4.1.1718.3.2.3.1.11.1" + self.STATUS_ON = "1" + self.STATUS_OFF = "0" + self.CONTROL_ON = "1" + self.CONTROL_OFF = "2" + if self.psuType == "APC": + self.pPORT_NAME_BASE_OID = '.'+APC_PORT_NAME_BASE_OID + self.pPORT_STATUS_BASE_OID = '.'+APC_PORT_STATUS_BASE_OID + self.pPORT_CONTROL_BASE_OID = '.'+APC_PORT_CONTROL_BASE_OID + self.PORT_NAME_BASE_OID = APC_PORT_NAME_BASE_OID + self.PORT_STATUS_BASE_OID = APC_PORT_STATUS_BASE_OID + self.PORT_CONTROL_BASE_OID = APC_PORT_CONTROL_BASE_OID + elif self.psuType == "SENTRY": + self.pPORT_NAME_BASE_OID = '.'+SENTRY_PORT_NAME_BASE_OID + self.pPORT_STATUS_BASE_OID = '.'+SENTRY_PORT_STATUS_BASE_OID + self.pPORT_CONTROL_BASE_OID = '.'+SENTRY_PORT_CONTROL_BASE_OID + self.PORT_NAME_BASE_OID = SENTRY_PORT_NAME_BASE_OID + self.PORT_STATUS_BASE_OID = SENTRY_PORT_STATUS_BASE_OID + self.PORT_CONTROL_BASE_OID = SENTRY_PORT_CONTROL_BASE_OID + else: + pass + def _get_pdu_ports(self): """ @@ -51,17 +91,22 @@ def _get_pdu_ports(self): The PDU ports connected to DUT must have hostname of DUT configured in port name/description. This method depends on this configuration to find out the PDU ports connected to PSUs of specific DUT. 
""" - try: - cmd = "snmpwalk -v 1 -c public -Ofenq %s %s " % (self.controller, self.PORT_NAME_BASE_OID) - stdout = run_local_cmd(cmd) - for line in stdout.splitlines(): - if self.hostname in line: # PDU port name/description should have DUT hostname - fields = line.split() - if len(fields) == 2: - # Remove the preceding PORT_NAME_BASE_OID, remaining string is the PDU port ID - self.pdu_ports.append(fields[0].replace(self.PORT_NAME_BASE_OID, '')) - except Exception as e: - logging.debug("Failed to get ports controlling PSUs of DUT, exception: " + repr(e)) + cmdGen = cmdgen.CommandGenerator() + snmp_auth = cmdgen.CommunityData('public') + errorIndication, errorStatus, errorIndex, varTable = cmdGen.nextCmd( + snmp_auth, + cmdgen.UdpTransportTarget((self.controller, 161)), + cmdgen.MibVariable(self.pPORT_NAME_BASE_OID,), + ) + if errorIndication: + logging.debug("Failed to get ports controlling PSUs of DUT, exception: " + str(errorIndication)) + for varBinds in varTable: + for oid, val in varBinds: + current_oid = oid.prettyPrint() + current_val = val.prettyPrint() + if self.hostname.lower() in current_val.lower(): + # Remove the preceding PORT_NAME_BASE_OID, remaining string is the PDU port ID + self.pdu_ports.append(current_oid.replace(self.PORT_NAME_BASE_OID, '')) def __init__(self, hostname, controller): logging.info("Initializing " + self.__class__.__name__) @@ -69,6 +114,9 @@ def __init__(self, hostname, controller): self.hostname = hostname self.controller = controller self.pdu_ports = [] + self.psuType = None + self.get_psu_controller_type() + self.psuCntrlOid() self._get_pdu_ports() logging.info("Initialized " + self.__class__.__name__) @@ -89,16 +137,17 @@ def turn_on_psu(self, psu_id): @param psu_id: ID of the PSU on SONiC DUT @return: Return true if successfully execute the command for turning on power. Otherwise return False. 
""" - try: - idx = int(psu_id) % len(self.pdu_ports) - port_oid = self.PORT_CONTROL_BASE_OID + self.pdu_ports[idx] - cmd = "snmpset -v1 -C q -c private %s %s i %s" % (self.controller, port_oid, self.CONTROL_ON) - run_local_cmd(cmd) - logging.info("Turned on PSU %s" % str(psu_id)) - return True - except Exception as e: - logging.debug("Failed to turn on PSU %s, exception: %s" % (str(psu_id), repr(e))) + port_oid = self.pPORT_CONTROL_BASE_OID + self.pdu_ports[rfc1902.Integer(psu_id)] + errorIndication, errorStatus, _, _ = \ + cmdgen.CommandGenerator().setCmd( + cmdgen.CommunityData('private'), + cmdgen.UdpTransportTarget((self.controller, 161)), + (port_oid, rfc1902.Integer(self.CONTROL_ON)), + ) + if errorIndication or errorStatus != 0: + logging.debug("Failed to turn on PSU %s, exception: %s" % (str(psu_id), str(errorStatus))) return False + return True def turn_off_psu(self, psu_id): """ @@ -117,16 +166,17 @@ def turn_off_psu(self, psu_id): @param psu_id: ID of the PSU on SONiC DUT @return: Return true if successfully execute the command for turning off power. Otherwise return False. 
""" - try: - idx = int(psu_id) % len(self.pdu_ports) - port_oid = self.PORT_CONTROL_BASE_OID + self.pdu_ports[idx] - cmd = "snmpset -v1 -C q -c private %s %s i %s" % (self.controller, port_oid, self.CONTROL_OFF) - run_local_cmd(cmd) - logging.info("Turned off PSU %s" % str(psu_id)) - return True - except Exception as e: - logging.debug("Failed to turn off PSU %s, exception: %s" % (str(psu_id), repr(e))) + port_oid = self.pPORT_CONTROL_BASE_OID + self.pdu_ports[rfc1902.Integer(psu_id)] + errorIndication, errorStatus, _, _ = \ + cmdgen.CommandGenerator().setCmd( + cmdgen.CommunityData('private'), + cmdgen.UdpTransportTarget((self.controller, 161)), + (port_oid, rfc1902.Integer(self.CONTROL_OFF)), + ) + if errorIndication or errorStatus != 0: + logging.debug("Failed to turn on PSU %s, exception: %s" % (str(psu_id), str(errorStatus))) return False + return True def get_psu_status(self, psu_id=None): """ @@ -149,22 +199,28 @@ def get_psu_status(self, psu_id=None): The psu_id in returned result is integer starts from 0. 
""" results = [] - try: - cmd = "snmpwalk -v 1 -c public -Ofenq %s %s " % (self.controller, self.PORT_STATUS_BASE_OID) - stdout = run_local_cmd(cmd) - for line in stdout.splitlines(): + cmdGen = cmdgen.CommandGenerator() + snmp_auth = cmdgen.CommunityData('public') + errorIndication, errorStatus, errorIndex, varTable = cmdGen.nextCmd( + snmp_auth, + cmdgen.UdpTransportTarget((self.controller, 161)), + cmdgen.MibVariable(self.pPORT_STATUS_BASE_OID,), + ) + if errorIndication: + logging.debug("Failed to get ports controlling PSUs of DUT, exception: " + str(errorIndication)) + for varBinds in varTable: + for oid, val in varBinds: + current_oid = oid.prettyPrint() + current_val = val.prettyPrint() for idx, port in enumerate(self.pdu_ports): port_oid = self.PORT_STATUS_BASE_OID + port - fields = line.strip().split() - if len(fields) == 2 and fields[0] == port_oid: - status = {"psu_id": idx, "psu_on": True if fields[1] == self.STATUS_ON else False} + if current_oid == port_oid: + status = {"psu_id": idx, "psu_on": True if current_val == self.STATUS_ON else False} results.append(status) - if psu_id is not None: - idx = int(psu_id) % len(self.pdu_ports) - results = results[idx:idx+1] - logging.info("Got PSU status: %s" % str(results)) - except Exception as e: - logging.debug("Failed to get psu status, exception: " + repr(e)) + if psu_id is not None: + idx = int(psu_id) % len(self.pdu_ports) + results = results[idx:idx+1] + logging.info("Got PSU status: %s" % str(results)) return results def close(self): @@ -176,13 +232,4 @@ def get_psu_controller(controller_ip, dut_hostname): @summary: Factory function to create the actual PSU controller object. @return: The actual PSU controller object. Returns None if something went wrong. 
""" - - psu_controller_type = get_psu_controller_type(controller_ip) - if not psu_controller_type: - return None - - if "Sentry Switched CDU" in psu_controller_type: - logging.info("Initializing PSU controller") - return SentrySwitchedCDU(dut_hostname, controller_ip) - - return None + return snmpPsuController(dut_hostname, controller_ip) diff --git a/tests/common/plugins/tacacs.py b/tests/common/plugins/tacacs.py index 322ac0bf7e..31eabf3300 100644 --- a/tests/common/plugins/tacacs.py +++ b/tests/common/plugins/tacacs.py @@ -6,11 +6,11 @@ def setup_tacacs(ptfhost, duthost, creds): """setup tacacs client and server""" # disable tacacs server - ptfhost.shell("service tacacs_plus stop") + ptfhost.service(name="tacacs_plus", state="stopped") # configure tacacs client duthost.shell("sudo config tacacs passkey %s" % creds['tacacs_passkey']) - + # get default tacacs servers config_facts = duthost.config_facts(host=duthost.hostname, source="running")['ansible_facts'] for tacacs_server in config_facts.get('TACPLUS_SERVER', {}): @@ -34,12 +34,12 @@ def setup_tacacs(ptfhost, duthost, creds): ptfhost.template(src="tacacs/tac_plus.conf.j2", dest="/etc/tacacs+/tac_plus.conf") # start tacacs server - ptfhost.shell("service tacacs_plus start") + ptfhost.service(name="tacacs_plus", state="started") yield # stop tacacs server - ptfhost.shell("service tacacs_plus stop") + ptfhost.service(name="tacacs_plus", state="stopped") # reset tacacs client configuration duthost.shell("sudo config tacacs delete %s" % ptfip) diff --git a/tests/conftest.py b/tests/conftest.py index eb9af0a335..7db45290d2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,7 +13,8 @@ from ansible_host import AnsibleHost from collections import defaultdict -from common.devices import SonicHost, Localhost, PTFHost, EosHost +from common.fixtures.conn_graph_facts import conn_graph_facts +from common.devices import SonicHost, Localhost, PTFHost, EosHost, FanoutHost logger = logging.getLogger(__name__) @@ 
-190,7 +191,7 @@ def ptfhost(testbed_devices): @pytest.fixture(scope="module") def nbrhosts(ansible_adhoc, testbed, creds): """ - Shortcut fixture for getting PTF host + Shortcut fixture for getting VM host """ vm_base = int(testbed['vm_base'][2:]) @@ -199,6 +200,29 @@ def nbrhosts(ansible_adhoc, testbed, creds): devices[k] = EosHost(ansible_adhoc, "VM%04d" % (vm_base + v['vm_offset']), creds['eos_login'], creds['eos_password']) return devices +@pytest.fixture(scope="module") +def fanouthosts(ansible_adhoc, conn_graph_facts, creds): + """ + Shortcut fixture for getting Fanout hosts + """ + + dev_conn = conn_graph_facts['device_conn'] + fanout_hosts = {} + for dut_port in dev_conn.keys(): + fanout_rec = dev_conn[dut_port] + fanout_host = fanout_rec['peerdevice'] + fanout_port = fanout_rec['peerport'] + if fanout_host in fanout_hosts.keys(): + fanout = fanout_hosts[fanout_host] + else: + # FIXME: assuming all fanout hosts are EOS for now. Needs to figure out the os type and + # create fanout switch with the right type. 
+ fanout = FanoutHost(ansible_adhoc, 'eos', fanout_host, 'FanoutLeaf', creds['fanout_admin_user'], creds['fanout_admin_password']) + fanout_hosts[fanout_host] = fanout + fanout.add_port_map(dut_port, fanout_port) + + return fanout_hosts + @pytest.fixture(scope='session') def eos(): """ read and yield eos configuration """ @@ -207,10 +231,11 @@ def eos(): return eos -@pytest.fixture(scope="session") -def creds(): - """ read and yield lab configuration """ - files = glob.glob("../ansible/group_vars/lab/*.yml") +@pytest.fixture(scope="module") +def creds(duthost): + """ read credential information according to the dut inventory """ + inv = duthost.host.options['inventory'].split('/')[-1] + files = glob.glob("../ansible/group_vars/{}/*.yml".format(inv)) files += glob.glob("../ansible/group_vars/all/*.yml") creds = {} for f in files: diff --git a/tests/fib/test_fib.py b/tests/fib/test_fib.py index 50e88ba8c7..26aee30364 100644 --- a/tests/fib/test_fib.py +++ b/tests/fib/test_fib.py @@ -61,6 +61,31 @@ def build_fib(duthost, config_facts, fibfile, t): else: ofp.write("{} []\n".format(prefix)) +def get_router_interface_ports(config_facts, testbed): + """ + get all physical ports associated with router interface (physical router interface, port channel router interface and vlan router interface) + """ + + ports = config_facts.get('INTERFACE', {}).keys() + portchannels_member_ports = [] + vlan_untag_ports = [] + portchannels_name = config_facts.get('PORTCHANNEL_INTERFACE', {}).keys() + if portchannels_name: + for po_name in portchannels_name: + for port_name in config_facts.get('PORTCHANNEL', {})[po_name]['members']: + portchannels_member_ports.append(port_name) + if 't0' in testbed['topo']['name']: + vlans = config_facts.get('VLAN_INTERFACE', {}).keys() + for vlan in vlans: + vlan_member_info = config_facts.get('VLAN_MEMBER', {}).get(vlan, {}) + if vlan_member_info: + for port_name, tag_mode in vlan_member_info.items(): + if tag_mode['tagging_mode'] == 'untagged': + 
vlan_untag_ports.append(port_name) + + router_interface_ports = ports + portchannels_member_ports + vlan_untag_ports + + return router_interface_ports @pytest.mark.parametrize("ipv4, ipv6, mtu", [pytest.param(True, True, 1514)]) def test_fib(testbed, duthost, ptfhost, ipv4, ipv6, mtu): @@ -135,25 +160,7 @@ def setup_hash(self, testbed, duthost, ptfhost): g_vars['testbed_type'] = testbed['topo']['name'] g_vars['router_mac'] = duthost.shell('sonic-cfggen -d -v \'DEVICE_METADATA.localhost.mac\'')["stdout_lines"][0].decode("utf-8") - # generate available send packet ports - ports = config_facts.get('INTERFACE', {}).keys() - portchannels_member_ports = [] - vlan_untag_ports = [] - portchannels_name = config_facts.get('PORTCHANNEL_INTERFACE', {}).keys() - if portchannels_name: - for po_name in portchannels_name: - for p in config_facts.get('PORTCHANNEL', {})[po_name]['members']: - portchannels_member_ports.append(p) - if 't0' in g_vars['testbed_type']: - vlans = config_facts.get('VLAN_INTERFACE', {}).keys() - for vlan in vlans: - vlan_member_info = config_facts.get('VLAN_MEMBER', {}).get(vlan, {}) - if vlan_member_info: - for port_name, tag_mode in vlan_member_info.items(): - if tag_mode['tagging_mode'] == 'untagged': - vlan_untag_ports.append(port_name) - - in_ports_name = ports + portchannels_member_ports + vlan_untag_ports + in_ports_name = get_router_interface_ports(config_facts, testbed) g_vars['in_ports'] = [config_facts.get('port_index_map', {})[p] for p in in_ports_name] # add some vlan for hash_key vlan-id test diff --git a/tests/platform/args/advanced_reboot_args.py b/tests/platform/args/advanced_reboot_args.py index 00886bcb76..df103078ff 100644 --- a/tests/platform/args/advanced_reboot_args.py +++ b/tests/platform/args/advanced_reboot_args.py @@ -76,3 +76,11 @@ def add_advanced_reboot_args(parser): default=180, help="DUT reboot ready timout", ) + + parser.addoption( + "--replace_fast_reboot_script", + action="store", + type=bool, + default=False, + 
help="Replace fast-reboot script on DUT", + ) diff --git a/tests/platform/files/invalid_value_policy.json b/tests/platform/files/invalid_value_policy.json index edf35114a5..95cee699f2 100644 --- a/tests/platform/files/invalid_value_policy.json +++ b/tests/platform/files/invalid_value_policy.json @@ -5,6 +5,9 @@ }, { "type": "psu_info" + }, + { + "type": "chassis_info" } ], "policies": [ diff --git a/tests/platform/mellanox/mellanox_thermal_control_test_helper.py b/tests/platform/mellanox/mellanox_thermal_control_test_helper.py index 65816a0af9..7e04a9c747 100644 --- a/tests/platform/mellanox/mellanox_thermal_control_test_helper.py +++ b/tests/platform/mellanox/mellanox_thermal_control_test_helper.py @@ -592,7 +592,8 @@ def mock_data(self): for index in range(1, psu_count + 1): try: fan_data = FanData(self.mock_helper, naming_rule, index) - speed = random.randint(0, RandomFanStatusMocker.PSU_FAN_MAX_SPEED) + # PSU fan speed display PWM not percentage, it should not be less than 100 + speed = random.randint(101, RandomFanStatusMocker.PSU_FAN_MAX_SPEED) fan_data.mock_speed(speed) self.expected_data[fan_data.name] = [ diff --git a/tests/platform/mellanox/test_check_sfp_presence.py b/tests/platform/mellanox/test_check_sfp_presence.py index d0dd52862b..93e8121ec1 100644 --- a/tests/platform/mellanox/test_check_sfp_presence.py +++ b/tests/platform/mellanox/test_check_sfp_presence.py @@ -5,7 +5,7 @@ import os import json -from platform_fixtures import conn_graph_facts +from common.fixtures.conn_graph_facts import conn_graph_facts def test_check_sfp_presence(testbed_devices, conn_graph_facts): """This test case is to check SFP presence status with CLI and sysfs. 
diff --git a/tests/platform/mellanox/test_check_sfp_using_ethtool.py b/tests/platform/mellanox/test_check_sfp_using_ethtool.py index 5f1e1dc268..e4f7a600aa 100644 --- a/tests/platform/mellanox/test_check_sfp_using_ethtool.py +++ b/tests/platform/mellanox/test_check_sfp_using_ethtool.py @@ -8,7 +8,7 @@ import os import json -from platform_fixtures import conn_graph_facts +from common.fixtures.conn_graph_facts import conn_graph_facts from check_hw_mgmt_service import check_hw_management_service diff --git a/tests/platform/test_advanced_reboot.py b/tests/platform/test_advanced_reboot.py index fbff39f8d5..8a1cc05c20 100644 --- a/tests/platform/test_advanced_reboot.py +++ b/tests/platform/test_advanced_reboot.py @@ -4,6 +4,9 @@ def test_fast_reboot(request, get_advanced_reboot): ''' Fast reboot test case is run using advacned reboot test fixture + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture ''' advancedReboot = get_advanced_reboot(rebootType='fast-reboot') advancedReboot.runRebootTestcase() @@ -12,6 +15,189 @@ def test_fast_reboot(request, get_advanced_reboot): def test_warm_reboot(request, get_advanced_reboot): ''' Warm reboot test case is run using advacned reboot test fixture + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture ''' advancedReboot = get_advanced_reboot(rebootType='warm-reboot') advancedReboot.runRebootTestcase() + +@pytest.mark.usefixtures('get_advanced_reboot') +def test_warm_reboot_sad(request, get_advanced_reboot): + ''' + Warm reboot with sad path + + prebootList format is 'preboot oper type:number of VMS down:number of lag members down'. 
+ For non lag member cases, this parameter will be skipped + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture + ''' + advancedReboot = get_advanced_reboot(rebootType='warm-reboot') + prebootList = [ + 'neigh_bgp_down', # Shutdown single BGP session on remote device (VM) before reboot DUT + 'dut_bgp_down', # Shutdown single BGP session on DUT brefore rebooting it + 'dut_lag_down', # Shutdown single LAG session on DUT brefore rebooting it + 'neigh_lag_down', # Shutdown single LAG session on remote device (VM) before reboot DUT + 'dut_lag_member_down:1:1', # Shutdown 1 LAG member corresponding to 1 remote device (VM) on DUT + 'neigh_lag_member_down:1:1', # Shutdown 1 LAG member on 1 remote device (VM) + 'vlan_port_down', # Shutdown 1 vlan port (interface) on DUT + ] + + advancedReboot.runRebootTestcase( + prebootList=prebootList, + prebootFiles='peer_dev_info,neigh_port_info' + ) + +@pytest.mark.usefixtures('get_advanced_reboot') +def test_warm_reboot_multi_sad(request, get_advanced_reboot): + ''' + Warm reboot with multi sad path + + prebootList format is 'preboot oper type:number of VMS down:number of lag members down'. 
+ For non lag member cases, this parameter will be skipped + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture + ''' + advancedReboot = get_advanced_reboot(rebootType='warm-reboot') + lagMemberCnt = advancedReboot.getlagMemberCnt() + prebootList = [ + 'neigh_bgp_down:2', # Shutdown single BGP session on 2 remote devices (VMs) before reboot DUT + 'dut_bgp_down:3', # Shutdown 3 BGP sessions on DUT brefore rebooting it + 'dut_lag_down:2', # Shutdown 2 LAG sessions on DUT brefore rebooting it + 'neigh_lag_down:3', # Shutdown 1 LAG session on 3 remote devices (VMs) before reboot DUT + 'dut_lag_member_down:3:1', # Shutdown 1 LAG member of 3 LAG sessions corresponding to 3 remote devices (VM) + # on DUT + 'neigh_lag_member_down:2:1', # Shutdown 1 LAG member of 2 LAG sessions on 2 remote devices (VM) (1 each) + 'vlan_port_down:4', + ] + ([ + 'dut_lag_member_down:2:{0}'.format(lagMemberCnt), + # Shutdown LAG member(s) of 2 LAG sessions corresponding to 2 remote + # devices (VM) on DUT + 'neigh_lag_member_down:3:{0}'.format(lagMemberCnt), + # Shutdown LAG member(s) of 3 LAG sessions on 3 remote devices (VM) + # (1 each) + ] if advancedReboot.getTestbedType() in ['t0-64', 't0-116', 't0-64-32'] else []) + + advancedReboot.runRebootTestcase( + prebootList=prebootList, + prebootFiles='peer_dev_info,neigh_port_info' + ) + +@pytest.mark.usefixtures('get_advanced_reboot') +def test_warm_reboot_multi_sad_inboot(request, get_advanced_reboot): + ''' + Warm reboot with multi sad path (during boot) + + inboot list format: 'inboot_oper:route_cnt' + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture + ''' + advancedReboot = get_advanced_reboot(rebootType='warm-reboot') + inbootList = [ + 'routing_del:50', # Delete 50 routes IPv4/IPv6 each (100 total) from each BGP session + 'routing_add:50', # Add 50 routes IPv4/IPv6 each (100 total) from each BGP session + ] + + 
advancedReboot.runRebootTestcase( + inbootList=inbootList, + prebootFiles='peer_dev_info,neigh_port_info' + ) + +@pytest.mark.usefixtures('get_advanced_reboot') +def test_warm_reboot_sad_bgp(request, get_advanced_reboot): + ''' + Warm reboot with sad (bgp) + + prebootList format is 'preboot oper type:number of VMS down:number of lag members down'. + For non lag member cases, this parameter will be skipped + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture + ''' + advancedReboot = get_advanced_reboot(rebootType='warm-reboot') + prebootList = [ + 'neigh_bgp_down:2', # Shutdown single BGP session on 2 remote devices (VMs) before reboot DUT + 'dut_bgp_down:3', # Shutdown 3 BGP sessions on DUT brefore rebooting it + ] + + advancedReboot.runRebootTestcase( + prebootList=prebootList, + prebootFiles='peer_dev_info,neigh_port_info' + ) + +@pytest.mark.usefixtures('get_advanced_reboot') +def test_warm_reboot_sad_lag_member(request, get_advanced_reboot): + ''' + Warm reboot with sad path (lag member) + + prebootList format is 'preboot oper type:number of VMS down:number of lag members down'. 
+ For non lag member cases, this parameter will be skipped + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture + ''' + advancedReboot = get_advanced_reboot(rebootType='warm-reboot') + lagMemberCnt = advancedReboot.getlagMemberCnt() + prebootList = [ + 'dut_lag_member_down:3:1', # Shutdown 1 LAG member of 3 LAG sessions corresponding to 3 remote devices (VM) + # on DUT + 'neigh_lag_member_down:2:1', # Shutdown 1 LAG member of 2 LAG sessions on 2 remote devices (VM) (1 each) + ] + ([ + 'dut_lag_member_down:2:{0}'.format(lagMemberCnt), + # Shutdown LAG member(s) of 2 LAG sessions corresponding to 2 remote + # devices (VM) on DUT + 'neigh_lag_member_down:3:{0}'.format(lagMemberCnt), + # Shutdown LAG member(s) of 3 LAG sessions on 3 remote devices (VM) + # (1 each) + ] if advancedReboot.getTestbedType() in ['t0-64', 't0-116', 't0-64-32'] else []) + + advancedReboot.runRebootTestcase( + prebootList=prebootList, + prebootFiles='peer_dev_info,neigh_port_info' + ) + +@pytest.mark.usefixtures('get_advanced_reboot') +def test_warm_reboot_sad_lag(request, get_advanced_reboot): + ''' + Warm reboot with sad path (lag) + + prebootList format is 'preboot oper type:number of VMS down:number of lag members down'. 
+ For non lag member cases, this parameter will be skipped + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture + ''' + advancedReboot = get_advanced_reboot(rebootType='warm-reboot') + prebootList = [ + 'dut_lag_down:2', # Shutdown 2 LAG sessions on DUT brefore rebooting it + 'neigh_lag_down:3', # Shutdown 1 LAG session on 3 remote devices (VMs) before reboot DUT + ] + + advancedReboot.runRebootTestcase( + prebootList=prebootList, + prebootFiles='peer_dev_info,neigh_port_info' + ) + +@pytest.mark.usefixtures('get_advanced_reboot') +def test_warm_reboot_sad_vlan_port(request, get_advanced_reboot): + ''' + Warm reboot with sad path (vlan port) + + prebootList format is 'preboot oper type:number of VMS down:number of lag members down'. + For non lag member cases, this parameter will be skipped + + @param request: Spytest commandline argument + @param get_advanced_reboot: advanced reboot test fixture + ''' + advancedReboot = get_advanced_reboot(rebootType='warm-reboot') + prebootList = [ + 'vlan_port_down:4', # Shutdown 4 vlan ports (interfaces) on DUT + ] + + advancedReboot.runRebootTestcase( + prebootList=prebootList, + prebootFiles='peer_dev_info,neigh_port_info' + ) diff --git a/tests/platform/test_platform_info.py b/tests/platform/test_platform_info.py index 589ac9c027..15c8c7fcb5 100644 --- a/tests/platform/test_platform_info.py +++ b/tests/platform/test_platform_info.py @@ -275,7 +275,6 @@ def test_show_platform_syseeprom(testbed_devices): logging.info("Check output of '%s'" % CMD_PLATFORM_SYSEEPROM) show_output = ans_host.command(CMD_PLATFORM_SYSEEPROM) - assert show_output["rc"] == 0, "Run command '%s' failed" % CMD_PLATFORM_SYSEEPROM if ans_host.facts["asic_type"] in ["mellanox"]: expected_fields = [ "Product Name", @@ -326,7 +325,6 @@ def test_show_platform_fanstatus(testbed_devices, mocker_factory): dut = testbed_devices["dut"] logging.info("Check output of '%s'" % CMD_PLATFORM_FANSTATUS) cli_fan_status 
= dut.command(CMD_PLATFORM_FANSTATUS) - assert cli_fan_status["rc"] == 0, "Run command '%s' failed" % CMD_PLATFORM_FANSTATUS lines = cli_fan_status["stdout_lines"] check_show_platform_fanstatus_output(lines) @@ -366,7 +364,6 @@ def test_show_platform_temperature(testbed_devices, mocker_factory): dut = testbed_devices["dut"] logging.info("Check output of '%s'" % CMD_PLATFORM_TEMPER) cli_thermal_status = dut.command(CMD_PLATFORM_TEMPER) - assert cli_thermal_status["rc"] == 0, "Run command '%s' failed" % CMD_PLATFORM_TEMPER # Mock data and check mocker = mocker_factory(dut, 'ThermalStatusMocker') @@ -388,7 +385,7 @@ def test_thermal_control_load_invalid_format_json(testbed_devices): control daemon is up and there is an error log printed """ logging.info('Loading invalid format policy file...') - check_thermal_control_load_invalid_file(testbed_devices, THERMAL_POLICY_INVALID_VALUE_FILE) + check_thermal_control_load_invalid_file(testbed_devices, THERMAL_POLICY_INVALID_FORMAT_FILE) @pytest.mark.disable_loganalyzer @@ -441,12 +438,14 @@ def test_thermal_control_psu_absence(testbed_devices, psu_controller, mocker_fac if fan_mocker is None: pytest.skip("No FanStatusMocker for %s, skip rest of the testing in this case" % dut.facts['asic_type']) - logging.info('Mock FAN status data...') - fan_mocker.mock_data() # make data random restart_thermal_control_daemon(dut) logging.info('Wait and check all FAN speed turn to 60%...') - wait_until(THERMAL_CONTROL_TEST_WAIT_TIME, THERMAL_CONTROL_TEST_CHECK_INTERVAL, fan_mocker.check_all_fan_speed, - 60) + wait_result = wait_until(THERMAL_CONTROL_TEST_WAIT_TIME, + THERMAL_CONTROL_TEST_CHECK_INTERVAL, + fan_mocker.check_all_fan_speed, + 60) + if not wait_result: + pytest.skip("FAN speed is not 60%, there might be abnormal in FAN/PSU, skip rest of the testing in this case") check_thermal_algorithm_status(dut, mocker_factory, False) @@ -466,8 +465,10 @@ def test_thermal_control_psu_absence(testbed_devices, psu_controller, mocker_fac 
pytest.skip("Some PSU are still down, skip rest of the testing in this case") logging.info('Wait and check all FAN speed turn to 65%...') - wait_until(THERMAL_CONTROL_TEST_WAIT_TIME, THERMAL_CONTROL_TEST_CHECK_INTERVAL, fan_mocker.check_all_fan_speed, - 65) + assert wait_until(THERMAL_CONTROL_TEST_WAIT_TIME, + THERMAL_CONTROL_TEST_CHECK_INTERVAL, + fan_mocker.check_all_fan_speed, + 65), 'FAN speed not change to 65% according to policy' def turn_off_psu_and_check_thermal_control(dut, psu_ctrl, psu, mocker): @@ -490,9 +491,13 @@ def turn_off_psu_and_check_thermal_control(dut, psu_ctrl, psu, mocker): assert psu_under_test is not None, "No PSU is turned off" logging.info('Wait and check all FAN speed turn to 100%...') - wait_until(THERMAL_CONTROL_TEST_WAIT_TIME, THERMAL_CONTROL_TEST_CHECK_INTERVAL, mocker.check_all_fan_speed, 100) + assert wait_until(THERMAL_CONTROL_TEST_WAIT_TIME, + THERMAL_CONTROL_TEST_CHECK_INTERVAL, + mocker.check_all_fan_speed, + 100), 'FAN speed not turn to 100% after PSU off' psu_ctrl.turn_on_psu(psu["psu_id"]) + time.sleep(5) @pytest.mark.disable_loganalyzer diff --git a/tests/platform/test_reload_config.py b/tests/platform/test_reload_config.py index 1cf025b46e..2787cf3790 100644 --- a/tests/platform/test_reload_config.py +++ b/tests/platform/test_reload_config.py @@ -10,7 +10,7 @@ import pytest -from platform_fixtures import conn_graph_facts +from common.fixtures.conn_graph_facts import conn_graph_facts from common.utilities import wait_until from check_critical_services import check_critical_services from check_transceiver_status import check_transceiver_basic diff --git a/tests/platform/test_sequential_restart.py b/tests/platform/test_sequential_restart.py index bcab5c8152..7de1782283 100644 --- a/tests/platform/test_sequential_restart.py +++ b/tests/platform/test_sequential_restart.py @@ -10,7 +10,7 @@ import pytest -from platform_fixtures import conn_graph_facts +from common.fixtures.conn_graph_facts import conn_graph_facts from 
common.utilities import wait_until from check_critical_services import check_critical_services from check_transceiver_status import check_transceiver_basic diff --git a/tests/platform/test_sfp.py b/tests/platform/test_sfp.py index bd6dcf2e85..510afaf5f0 100644 --- a/tests/platform/test_sfp.py +++ b/tests/platform/test_sfp.py @@ -13,7 +13,7 @@ import pytest -from platform_fixtures import conn_graph_facts +from common.fixtures.conn_graph_facts import conn_graph_facts from common.plugins.loganalyzer.loganalyzer import LogAnalyzer ans_host = None diff --git a/tests/platform/test_xcvr_info_in_db.py b/tests/platform/test_xcvr_info_in_db.py index 264a0e7885..087a27e139 100644 --- a/tests/platform/test_xcvr_info_in_db.py +++ b/tests/platform/test_xcvr_info_in_db.py @@ -9,7 +9,7 @@ import os from check_transceiver_status import check_transceiver_status -from platform_fixtures import conn_graph_facts +from common.fixtures.conn_graph_facts import conn_graph_facts def test_xcvr_info_in_db(testbed_devices, conn_graph_facts): diff --git a/tests/scripts/fast-reboot b/tests/scripts/fast-reboot new file mode 100755 index 0000000000..c4ea1805cb --- /dev/null +++ b/tests/scripts/fast-reboot @@ -0,0 +1,547 @@ +#!/bin/bash -e + +REBOOT_USER=$(logname) +REBOOT_TIME=$(date) +REBOOT_CAUSE_FILE="/host/reboot-cause/reboot-cause.txt" +WARM_DIR=/host/warmboot +REDIS_FILE=dump.rdb +REBOOT_SCRIPT_NAME=$(basename $0) +REBOOT_TYPE="${REBOOT_SCRIPT_NAME}" +VERBOSE=no +FORCE=no +STRICT=no +REBOOT_METHOD="/sbin/kexec -e" +ASSISTANT_IP_LIST="" +ASSISTANT_SCRIPT="/usr/bin/neighbor_advertiser" + +# Require 100M available on the hard drive for warm reboot temp files, +# Size is in 1K blocks: +MIN_HD_SPACE_NEEDED=100000 + +EXIT_SUCCESS=0 +EXIT_FAILURE=1 +EXIT_NOT_SUPPORTED=2 +EXIT_FILE_SYSTEM_FULL=3 +EXIT_NEXT_IMAGE_NOT_EXISTS=4 +EXIT_ORCHAGENT_SHUTDOWN=10 +EXIT_SYNCD_SHUTDOWN=11 +EXIT_FAST_REBOOT_DUMP_FAILURE=12 +EXIT_NO_CONTROL_PLANE_ASSISTANT=20 + +function error() +{ + echo $@ >&2 +} + +function 
debug() +{ + if [[ x"${VERBOSE}" == x"yes" ]]; then + echo `date` $@ + fi + logger "$@" +} + +function showHelpAndExit() +{ + echo "Usage: ${REBOOT_SCRIPT_NAME} [options]" + echo " -h,-? : get this help" + echo " -v : turn on verbose" + echo " -f : force execution" + echo " -r : reboot with /sbin/reboot" + echo " -k : reboot with /sbin/kexec -e [default]" + echo " -x : execute script with -x flag" + echo " -c : specify control plane assistant IP list" + echo " -s : strict mode: do not proceed without:" + echo " - control plane assistant IP list." + + exit "${EXIT_SUCCESS}" +} + +function parseOptions() +{ + while getopts "vfh?rkxc:s" opt; do + case ${opt} in + h|\? ) + showHelpAndExit + ;; + v ) + VERBOSE=yes + ;; + f ) + FORCE=yes + ;; + r ) + REBOOT_METHOD="/sbin/reboot" + ;; + k ) + REBOOT_METHOD="/sbin/kexec -e" + ;; + x ) + set -x + ;; + c ) + ASSISTANT_IP_LIST=${OPTARG} + ;; + s ) + STRICT=yes + ;; + esac + done +} + +function clear_fast_boot() +{ + debug "${REBOOT_TYPE} failure ($?) cleanup ..." + + /sbin/kexec -u || /bin/true + + teardown_control_plane_assistant +} + +function clear_warm_boot() +{ + clear_fast_boot + + result=`timeout 10s config warm_restart disable; if [[ $? == 124 ]]; then echo timeout; else echo "code ($?)"; fi` || /bin/true + debug "Cancel warm-reboot: ${result}" + + TIMESTAMP=`date +%Y%m%d-%H%M%S` + if [[ -f ${WARM_DIR}/${REDIS_FILE} ]]; then + mv -f ${WARM_DIR}/${REDIS_FILE} ${WARM_DIR}/${REDIS_FILE}.${TIMESTAMP} || /bin/true + fi +} + +function init_warm_reboot_states() +{ + # If the current running instanace was booted up with warm reboot. Then + # the current DB contents will likely mark warm reboot is done. + # Clear these states so that the next boot up image won't get confused. 
+ if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + redis-cli -n 6 eval " + for _, key in ipairs(redis.call('keys', 'WARM_RESTART_TABLE|*')) do + redis.call('hdel', key, 'state') + end + " 0 >/dev/null + fi +} + +function initialize_pre_shutdown() +{ + debug "Initialize pre-shutdown ..." + TABLE="WARM_RESTART_TABLE|warm-shutdown" + RESTORE_COUNT=`/usr/bin/redis-cli -n 6 hget "${TABLE}" restore_count` + if [[ -z "$RESTORE_COUNT" ]]; then + /usr/bin/redis-cli -n 6 hset "${TABLE}" "restore_count" "0" > /dev/null + fi + /usr/bin/redis-cli -n 6 hset "${TABLE}" "state" "requesting" > /dev/null +} + +function request_pre_shutdown() +{ + debug "Requesting pre-shutdown ..." + /usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null || { + error "Failed to request pre-shutdown" + } +} + +function wait_for_pre_shutdown_complete_or_fail() +{ + debug "Waiting for pre-shutdown ..." + TABLE="WARM_RESTART_TABLE|warm-shutdown" + STATE="requesting" + declare -i waitcount + declare -i retrycount + waitcount=0 + retrycount=0 + # Wait up to 60 seconds for pre-shutdown to complete + while [[ ${waitcount} -lt 600 ]]; do + # timeout doesn't work with -i option of "docker exec". Therefore we have + # to invoke docker exec directly below. + STATE=`timeout 5s docker exec database redis-cli -n 6 hget "${TABLE}" state; if [[ $? == 124 ]]; then echo "timed out"; fi` + + if [[ x"${STATE}" == x"timed out" ]]; then + waitcount+=50 + retrycount+=1 + debug "Timed out getting pre-shutdown state (${waitcount}) retry count ${retrycount} ..." + if [[ retrycount -gt 2 ]]; then + break + fi + elif [[ x"${STATE}" != x"requesting" ]]; then + break + else + sleep 0.1 + waitcount+=1 + fi + done + + if [[ x"${STATE}" != x"pre-shutdown-succeeded" ]]; then + debug "Syncd pre-shutdown failed: ${STATE} ..." + else + debug "Pre-shutdown succeeded ..." + fi +} + +function backup_database() +{ + debug "Backing up database ..." 
+ # Dump redis content to a file 'dump.rdb' in warmboot directory + mkdir -p $WARM_DIR + # Delete keys in stateDB except FDB_TABLE|*, MIRROR_SESSION_TABLE|*, WARM_RESTART_ENABLE_TABLE|* + redis-cli -n 6 eval " + for _, k in ipairs(redis.call('keys', '*')) do + if not string.match(k, 'FDB_TABLE|') and not string.match(k, 'WARM_RESTART_TABLE|') \ + and not string.match(k, 'MIRROR_SESSION_TABLE|') \ + and not string.match(k, 'WARM_RESTART_ENABLE_TABLE|') then + redis.call('del', k) + end + end + " 0 > /dev/null + redis-cli save > /dev/null + docker cp database:/var/lib/redis/$REDIS_FILE $WARM_DIR + docker exec -i database rm /var/lib/redis/$REDIS_FILE +} + +function setup_control_plane_assistant() +{ + if [[ -n "${ASSISTANT_IP_LIST}" && -x ${ASSISTANT_SCRIPT} ]]; then + debug "Setting up control plane assistant: ${ASSISTANT_IP_LIST} ..." + ${ASSISTANT_SCRIPT} -s ${ASSISTANT_IP_LIST} -m set + elif [[ X"${STRICT}" == X"yes" ]]; then + debug "Strict mode: fail due to lack of control plane assistant ..." + exit ${EXIT_NO_CONTROL_PLANE_ASSISTANT} + fi +} + +function teardown_control_plane_assistant() +{ + if [[ -n "${ASSISTANT_IP_LIST}" && -x ${ASSISTANT_SCRIPT} ]]; then + debug "Tearing down control plane assistant: ${ASSISTANT_IP_LIST} ..." 
+ ${ASSISTANT_SCRIPT} -s ${ASSISTANT_IP_LIST} -m reset + fi +} + +function setup_reboot_variables() +{ + # Kernel and initrd image + CURRENT_SONIC_IMAGE=$(sonic_installer list | grep "Current: " | cut -d ' ' -f 2) + NEXT_SONIC_IMAGE=$(sonic_installer list | grep "Next: " | cut -d ' ' -f 2) + IMAGE_PATH="/host/image-${NEXT_SONIC_IMAGE#SONiC-OS-}" + if grep -q aboot_platform= /host/machine.conf; then + KERNEL_IMAGE="$(ls $IMAGE_PATH/boot/vmlinuz-*)" + BOOT_OPTIONS="$(cat "$IMAGE_PATH/kernel-cmdline" | tr '\n' ' ') SONIC_BOOT_TYPE=${BOOT_TYPE_ARG}" + elif grep -q onie_platform= /host/machine.conf; then + KERNEL_OPTIONS=$(cat /host/grub/grub.cfg | sed "/$NEXT_SONIC_IMAGE'/,/}/"'!'"g" | grep linux) + KERNEL_IMAGE="/host$(echo $KERNEL_OPTIONS | cut -d ' ' -f 2)" + BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') SONIC_BOOT_TYPE=${BOOT_TYPE_ARG}" + else + error "Unknown bootloader. ${REBOOT_TYPE} is not supported." + exit "${EXIT_NOT_SUPPORTED}" + fi + INITRD=$(echo $KERNEL_IMAGE | sed 's/vmlinuz/initrd.img/g') +} + +function reboot_pre_check() +{ + # Make sure that the file system is normal: read-write able + filename="/host/test-`date +%Y%m%d-%H%M%S`" + if [[ ! -f ${filename} ]]; then + touch ${filename} + fi + rm ${filename} + + # Make sure /host has enough space for warm reboot temp files + avail=$(df -k /host | tail -1 | awk '{ print $4 }') + if [[ ${avail} -lt ${MIN_HD_SPACE_NEEDED} ]]; then + debug "/host has ${avail}K bytes available, not enough for warm reboot." + exit ${EXIT_FILE_SYSTEM_FULL} + fi + + # Make sure that the next image exists + if [[ ! -d ${IMAGE_PATH} ]]; then + debug "Next image ${NEXT_SONIC_IMAGE} doesn't exist ..." 
+ exit ${EXIT_NEXT_IMAGE_NOT_EXISTS} + fi + + # Make sure ASIC configuration has not changed between images + ASIC_CONFIG_CHECK_SCRIPT="/usr/bin/asic_config_check" + ASIC_CONFIG_CHECK_SUCCESS=0 + if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + ASIC_CONFIG_CHECK_EXIT_CODE=0 + ${ASIC_CONFIG_CHECK_SCRIPT} || ASIC_CONFIG_CHECK_EXIT_CODE=$? + + if [[ "${ASIC_CONFIG_CHECK_EXIT_CODE}" != "${ASIC_CONFIG_CHECK_SUCCESS}" ]]; then + if [[ x"${FORCE}" == x"yes" ]]; then + debug "Ignoring ASIC config checksum failure..." + else + error "ASIC config may have changed: errno=${ASIC_CONFIG_CHECK_EXIT_CODE}" + exit "${EXIT_FAILURE}" + fi + fi + fi +} + +function unload_kernel() +{ + # Unload the previously loaded kernel if any loaded + if [[ "$(cat /sys/kernel/kexec_loaded)" -eq 1 ]]; then + /sbin/kexec -u + fi +} + +# main starts here +parseOptions $@ + +# Check root privileges +if [[ "$EUID" -ne 0 ]] +then + echo "This command must be run as root" >&2 + exit "${EXIT_FAILURE}" +fi + +sonic_asic_type=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asic_type) + +# Check reboot type supported +BOOT_TYPE_ARG="cold" +case "$REBOOT_TYPE" in + "fast-reboot") + BOOT_TYPE_ARG=$REBOOT_TYPE + trap clear_fast_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM + ;; + "warm-reboot") + if [[ "$sonic_asic_type" == "mellanox" ]]; then + REBOOT_TYPE="fastfast-reboot" + BOOT_TYPE_ARG="fastfast" + # source mlnx-ffb.sh file with + # functions to check ISSU upgrade possibility + source mlnx-ffb.sh + else + BOOT_TYPE_ARG="warm" + fi + trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM + config warm_restart enable system + ;; + *) + error "Not supported reboot type: $REBOOT_TYPE" + exit "${EXIT_NOT_SUPPORTED}" + ;; +esac + +# Stopping all SLB neighbors if they're presented +if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then + debug "Stopping all SLB neighbors if they are presented" + PASSIVE_BGP_NEIGHBORS=$(sonic-cfggen -d -v "BGP_PEER_RANGE | list") + case 
"$PASSIVE_BGP_NEIGHBORS" in + *BGPSLBPassive*) + ASN=$(sonic-cfggen -d -v "DEVICE_METADATA['localhost']['bgp_asn']") + vtysh -c "configure terminal" -c "router bgp ${ASN}" -c "neighbor BGPSLBPassive shutdown" + sleep 30 # wait for 30 seconds - BGP RouteAdv default timer + ;; + *) + ;; + esac +fi + +unload_kernel + +setup_reboot_variables + +reboot_pre_check + +# Install new FW for mellanox platforms before control plane goes down +# So on boot switch will not spend time to upgrade FW increasing the CP downtime +if [[ "$sonic_asic_type" == "mellanox" ]]; then + MLNX_EXIT_SUCCESS=0 + MLNX_EXIT_FW_ERROR=100 + MLNX_EXIT_FFB_FAILURE=101 + + MLNX_FW_UPGRADE_SCRIPT="/usr/bin/mlnx-fw-upgrade.sh" + + + if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + check_ffb || { + error "Warm reboot is not supported" + exit "${MLNX_EXIT_FFB_FAILURE}" + } + fi + + debug "Prepare MLNX ASIC to ${REBOOT_TYPE}: install new FW if required" + + ${MLNX_FW_UPGRADE_SCRIPT} --upgrade + MLNX_EXIT_CODE="$?" + if [[ "${MLNX_EXIT_CODE}" != "${MLNX_EXIT_SUCCESS}" ]]; then + error "Failed to burn MLNX FW: errno=${MLNX_EXIT_CODE}" + exit "${MLNX_EXIT_FW_ERROR}" + fi +fi + +# Load kernel into the memory +/sbin/kexec -l "$KERNEL_IMAGE" --initrd="$INITRD" --append="$BOOT_OPTIONS" + +if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then + # Dump the ARP and FDB tables to files also as default routes for both IPv4 and IPv6 + # into /host/fast-reboot + mkdir -p /host/fast-reboot + FAST_REBOOT_DUMP_RC=0 + /usr/bin/fast-reboot-dump.py -t /host/fast-reboot || FAST_REBOOT_DUMP_RC=$? + if [[ FAST_REBOOT_DUMP_RC -ne 0 ]]; then + error "Failed to run fast-reboot-dump.py. 
Exit code: $FAST_REBOOT_DUMP_RC" + unload_kernel + exit "${EXIT_FAST_REBOOT_DUMP_FAILURE}" + fi +fi + +init_warm_reboot_states + +setup_control_plane_assistant + +if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + # Freeze orchagent for warm restart + # Ask orchagent_restart_check to try freeze 5 times with interval of 2 seconds, + # it is possible that the orchagent is in transient state and no opportunity to be freezed + # Note: assume that 2*5 seconds is enough for orchagent to process the request and respone freeze or not + debug "Pausing orchagent ..." + docker exec -i swss /usr/bin/orchagent_restart_check -w 2000 -r 5 > /dev/null || RESTARTCHECK_RC=$? + if [[ RESTARTCHECK_RC -ne 0 ]]; then + error "RESTARTCHECK failed" + if [[ x"${FORCE}" == x"yes" ]]; then + debug "Ignoring orchagent pausing failure ..." + else + exit "${EXIT_ORCHAGENT_SHUTDOWN}" + fi + fi +fi + +# We are fully committed to reboot from this point on becasue critical +# service will go down and we cannot recover from it. +set +e + +# Kill radv before stopping BGP service to prevent annoucing our departure. +debug "Stopping radv ..." +docker kill radv &>/dev/null || [ $? == 1 ] +systemctl stop radv + +# Kill bgpd to start the bgp graceful restart procedure +debug "Stopping bgp ..." +docker exec -i bgp pkill -9 zebra +docker exec -i bgp pkill -9 bgpd || [ $? == 1 ] +debug "Stopped bgp ..." + +# Kill lldp, otherwise it sends informotion about reboot. +# We call `docker kill lldp` to ensure the container stops as quickly as possible, +# then immediately call `systemctl stop lldp` to prevent the service from +# restarting the container automatically. +docker kill lldp &> /dev/null || debug "Docker lldp is not running ($?) ..." +systemctl stop lldp + +if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then + debug "Stopping teamd ..." + docker kill teamd &> /dev/null || debug "Docker teamd is not running ($?) ..." + systemctl stop teamd + debug "Stopped teamd ..." 
+fi + +# Kill swss Docker container +# We call `docker kill swss` to ensure the container stops as quickly as possible, +# then immediately call `systemctl stop swss` to prevent the service from +# restarting the container automatically. +docker kill swss &> /dev/null || debug "Docker swss is not running ($?) ..." + +# Pre-shutdown syncd +if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + initialize_pre_shutdown + + request_pre_shutdown + + wait_for_pre_shutdown_complete_or_fail + + # Warm reboot: dump state to host disk + if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + redis-cli -n 1 FLUSHDB > /dev/null + redis-cli -n 2 FLUSHDB > /dev/null + redis-cli -n 5 FLUSHDB > /dev/null + fi + + # TODO: backup_database preserves FDB_TABLE + # need to cleanup as well for fastfast boot case + backup_database +fi + +# Stop teamd gracefully +if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + debug "Stopping teamd ..." + # Send USR1 signal to all teamd instances to stop them + # It will prepare teamd for warm-reboot + # Note: We must send USR1 signal before syncd, because it will send the last packet through CPU port + docker exec -i teamd pkill -USR1 teamd > /dev/null || [ $? == 1 ] + debug "Stopped teamd ..." +fi + +debug "Stopping syncd ..." +if [[ ${CURRENT_SONIC_IMAGE} =~ "20180330" && "$sonic_asic_type" = 'broadcom' ]]; then + debug "Stopping syncd on ${CURRENT_SONIC_IMAGE} ..." + + # Gracefully stop syncd + docker exec -i syncd /usr/bin/syncd_request_shutdown --cold > /dev/null + + # Check that syncd was stopped + while docker top syncd | grep -q /usr/bin/syncd + do + sleep 0.1 + done +else + systemctl stop syncd || debug "Ignore stopping syncd service error $?" +fi +debug "Stopped syncd ..." 
+ +# Kill other containers to make the reboot faster +# We call `docker kill ...` to ensure the container stops as quickly as possible, +# then immediately call `systemctl stop ...` to prevent the service from +# restarting the container automatically. +debug "Stopping all remaining containers ..." +for CONTAINER_NAME in $(docker ps --format '{{.Names}}'); do + CONTAINER_STOP_RC=0 + docker kill $CONTAINER_NAME &> /dev/null || CONTAINER_STOP_RC=$? + systemctl stop $CONTAINER_NAME || debug "Ignore stopping $CONTAINER_NAME error $?" + if [[ CONTAINER_STOP_RC -ne 0 ]]; then + debug "Failed killing container $CONTAINER_NAME RC $CONTAINER_STOP_RC ." + fi +done +debug "Stopped all remaining containers ..." + +# Stop the docker container engine. Otherwise we will have a broken docker storage +systemctl stop docker.service || debug "Ignore stopping docker service error $?" + +# Stop kernel modules for Nephos platform +if [[ "$sonic_asic_type" = 'nephos' ]]; +then + systemctl stop nps-modules-`uname -r`.service || debug "Ignore stopping nps service error $?" +fi + +# Update the reboot cause file to reflect that user issued this script +# Upon next boot, the contents of this file will be used to determine the +# cause of the previous reboot +echo "User issued '${REBOOT_SCRIPT_NAME}' command [User: ${REBOOT_USER}, Time: ${REBOOT_TIME}]" > ${REBOOT_CAUSE_FILE} + +# Wait until all buffers synced with disk +sync +sleep 1 +sync + +# sync the current system time to CMOS +if [ -x /sbin/hwclock ]; then + /sbin/hwclock -w || /bin/true +fi + +# Enable Watchdog Timer +if [[ -x /usr/bin/watchdog ]]; then + debug "Enabling Watchdog before ${REBOOT_TYPE}" + /usr/bin/watchdog -e +fi + +# Reboot: explicity call Linux native reboot under sbin +debug "Rebooting with ${REBOOT_METHOD} to ${NEXT_SONIC_IMAGE} ..." +exec ${REBOOT_METHOD} + +# Should never reach here +error "${REBOOT_TYPE} failed!" 
+exit "${EXIT_FAILURE}" diff --git a/tests/test_lag_2.py b/tests/test_lag_2.py index a8c1e843de..74370dd020 100644 --- a/tests/test_lag_2.py +++ b/tests/test_lag_2.py @@ -50,16 +50,23 @@ def test_lag_2(common_setup_teardown, nbrhosts): try: lag_facts['lags'][lag_name]['po_config']['runner']['min_ports'] except: - logging.info("Skip [check_single_lap_lacp_rate] for lag (%s) due to min_ports not exists" % lag_name) - logging.info("Skip [check_single_lap] for lag (%s) due to min_ports not exists" % lag_name) + logging.info("Skip [check_single_lag_lacp_rate] for lag (%s) due to min_ports not exists" % lag_name) + logging.info("Skip [check_single_lag] for lag (%s) due to min_ports not exists" % lag_name) continue else: check_single_lag_lacp_rate(common_setup_teardown, nbrhosts, lag_name) check_single_lag(common_setup_teardown, nbrhosts, lag_name) + try: + lag_facts['lags'][lag_name]['po_config']['runner']['fallback'] + except: + logging.info("Skip [check_lag_fallback] for lag (%s) due to fallback was not set for it" % lag_name) + else: + check_lag_fallback(common_setup_teardown, nbrhosts, lag_name) + def check_single_lag_lacp_rate(common_setup_teardown, nbrhosts, lag_name): duthost, ptfhost, vm_neighbors, mg_facts, lag_facts, fanout_neighbors = common_setup_teardown - logging.info("Start checking single lap lacp rate for: %s" % lag_name) + logging.info("Start checking single lag lacp rate for: %s" % lag_name) intf, po_interfaces = get_lag_intfs(lag_facts, lag_name) peer_device = vm_neighbors[intf]['name'] @@ -101,7 +108,7 @@ def check_single_lag_lacp_rate(common_setup_teardown, nbrhosts, lag_name): def check_single_lag(common_setup_teardown, nbrhosts, lag_name): duthost, ptfhost, vm_neighbors, mg_facts, lag_facts, fanout_neighbors = common_setup_teardown - logging.info("Start checking single lap for: %s" % lag_name) + logging.info("Start checking single lag for: %s" % lag_name) intf, po_interfaces = get_lag_intfs(lag_facts, lag_name) po_flap = check_flap(lag_facts, 
lag_name) @@ -119,6 +126,62 @@ def check_single_lag(common_setup_teardown, nbrhosts, lag_name): vm_host = nbrhosts[peer_device] verify_lag_minlink(duthost, vm_host, lag_name, peer_device, intf, neighbor_interface, po_interfaces, po_flap, deselect_time=95) +def check_lag_fallback(common_setup_teardown, nbrhosts, lag_name): + duthost, ptfhost, vm_neighbors, mg_facts, lag_facts, fanout_neighbors = common_setup_teardown + logging.info("Start checking lag fall back for: %s" % lag_name) + intf, po_interfaces = get_lag_intfs(lag_facts, lag_name) + po_fallback = lag_facts['lags'][lag_name]['po_config']['runner']['fallback'] + + # Figure out remote VM and interface info for the lag member and run lag fallback test + peer_device = vm_neighbors[intf]['name'] + neighbor_interface = vm_neighbors[intf]['port'] + vm_host = nbrhosts[peer_device] + + try: + # Shut down neighbor interface + vm_host.shutdown(neighbor_interface) + time.sleep(120) + + # Refresh lag facts + lag_facts = duthost.lag_facts(host = duthost.hostname)['ansible_facts']['lag_facts'] + + # Get teamshow result + teamshow_result = duthost.shell('teamshow') + logging.info("Teamshow result: %s" % teamshow_result) + + # Verify lag members + # 1. All other lag should keep selected state + # 2. Shutdown port should keep selected state if fallback enabled + # 3. 
Shutdown port should marded as deselected if fallback disabled + # is marked deselected for the shutdown port and all other lag member interfaces are marked selected + for po_intf in po_interfaces.keys(): + if po_intf != intf or po_fallback: + assert lag_facts['lags'][lag_name]['po_stats']['ports'][po_intf]['runner']['selected'] + else: + assert not lag_facts['lags'][lag_name]['po_stats']['ports'][po_intf]['runner']['selected'] + + # The portchannel should marked Up/Down correctly according to po fallback setting + if po_fallback: + assert lag_facts['lags'][lag_name]['po_intf_stat'] == 'Up' + else: + assert lag_facts['lags'][lag_name]['po_intf_stat'] == 'Down' + + finally: + # Bring up neighbor interface + vm_host.no_shutdown(neighbor_interface) + time.sleep(30) + + # Refresh lag facts + lag_facts = duthost.lag_facts(host = duthost.hostname)['ansible_facts']['lag_facts'] + + # Verify all interfaces in port_channel are marked up + for po_intf in po_interfaces.keys(): + assert lag_facts['lags'][lag_name]['po_stats']['ports'][po_intf]['link']['up'] == True + + # Verify portchannel interface are marked up correctly + assert lag_facts['lags'][lag_name]['po_intf_stat'] == 'Up' + + def verify_lag_lacp_timing(ptfhost, vm_name, lacp_timer, exp_iface): if exp_iface is None: return @@ -162,7 +225,7 @@ def verify_lag_minlink( # Refresh lag facts lag_facts = duthost.lag_facts(host = duthost.hostname)['ansible_facts']['lag_facts'] - # Verify lag member is marked deselected for the shutdown porta and all other lag member interfaces are marked selected + # Verify lag member is marked deselected for the shutdown port and all other lag member interfaces are marked selected for po_intf in po_interfaces.keys(): if po_intf != intf: assert lag_facts['lags'][lag_name]['po_stats']['ports'][po_intf]['runner']['selected'] diff --git a/tests/test_vxlan_decap.py b/tests/test_vxlan_decap.py new file mode 100644 index 0000000000..7a486cc7ef --- /dev/null +++ b/tests/test_vxlan_decap.py @@ -0,0 
# +1,149 @@  (tail of the patch hunk header that introduces this new file)
import json
import logging
from datetime import datetime

import pytest
from jinja2 import Template
from netaddr import IPAddress

from ptf_runner import ptf_runner

logger = logging.getLogger(__name__)

# Destination IP written into the generated VXLAN tunnel configuration.
VTEP2_IP = "8.8.8.8"
# Offset added to the numeric VLAN id to derive the VNI of each tunnel map.
VNI_BASE = 336
# Passed to the PTF test as its "count" parameter.
COUNT = 10


def prepare_ptf(ptfhost, mg_facts, dut_facts):
    """
    @summary: Prepare the PTF docker container for testing
    @param ptfhost: PTF host object
    @param mg_facts: Minigraph facts
    @param dut_facts: Host facts of DUT
    """
    logger.info("Remove IP and change MAC")
    ptfhost.script("./scripts/remove_ip.sh")
    ptfhost.script("./scripts/change_mac.sh")

    logger.info("Prepare arp_responder")
    ptfhost.copy(src="../ansible/roles/test/files/helpers/arp_responder.py", dest="/opt")

    # Read the template inside a context manager so the file handle is closed
    # deterministically instead of being left open until garbage collection.
    with open("../ansible/roles/test/templates/arp_responder.conf.j2") as template_file:
        arp_responder_conf = Template(template_file.read())
    ptfhost.copy(content=arp_responder_conf.render(arp_responder_args="--conf /tmp/vxlan_arpresponder.conf"),
                 dest="/etc/supervisor/conf.d/arp_responder.conf")

    # Make supervisor pick up the arp_responder configuration copied above.
    ptfhost.shell("supervisorctl reread")
    ptfhost.shell("supervisorctl update")

    logger.info("Put information needed by the PTF script to the PTF container.")
    vxlan_decap = {
        "minigraph_port_indices": mg_facts["minigraph_port_indices"],
        "minigraph_portchannel_interfaces": mg_facts["minigraph_portchannel_interfaces"],
        "minigraph_portchannels": mg_facts["minigraph_portchannels"],
        "minigraph_lo_interfaces": mg_facts["minigraph_lo_interfaces"],
        "minigraph_vlans": mg_facts["minigraph_vlans"],
        "minigraph_vlan_interfaces": mg_facts["minigraph_vlan_interfaces"],
        "dut_mac": dut_facts["ansible_Ethernet0"]["macaddress"]
    }
    ptfhost.copy(content=json.dumps(vxlan_decap, indent=2), dest="/tmp/vxlan_decap.json")

    logger.info("Copy PTF scripts to PTF container")
    ptfhost.copy(src="ptftests", dest="/root")


def generate_vxlan_config_files(duthost, mg_facts):
    """
    @summary: Generate VXLAN tunnel and VXLAN map configuration files to DUT.
    @param duthost: DUT host object
    @param mg_facts: Minigraph facts
    """
    # The tunnel source is the first IPv4 loopback address in the minigraph.
    loopback_ip = None
    for intf in mg_facts["minigraph_lo_interfaces"]:
        if IPAddress(intf["addr"]).version == 4:
            loopback_ip = intf["addr"]
            break
    if not loopback_ip:
        pytest.fail("ipv4 lo interface not found")

    # Generate vxlan tunnel config json file on DUT
    vxlan_tunnel_cfg = {
        "VXLAN_TUNNEL": {
            "tunnelVxlan": {
                "src_ip": loopback_ip,
                "dst_ip": VTEP2_IP
            }
        }
    }
    duthost.copy(content=json.dumps(vxlan_tunnel_cfg, indent=2), dest="/tmp/vxlan_db.tunnel.json")

    # Generate vxlan maps config json file on DUT
    vxlan_maps_cfg = {
        "VXLAN_TUNNEL_MAP": {}
    }
    for vlan in mg_facts["minigraph_vlans"]:
        # One map entry per VLAN; the VNI is the numeric VLAN id plus VNI_BASE.
        vxlan_maps_cfg["VXLAN_TUNNEL_MAP"]["tunnelVxlan|map%s" % vlan] = {
            "vni": int(vlan.replace("Vlan", "")) + VNI_BASE,
            "vlan": vlan
        }
    duthost.copy(content=json.dumps(vxlan_maps_cfg, indent=2), dest="/tmp/vxlan_db.maps.json")


@pytest.fixture(scope="module")
def setup(duthost, ptfhost):
    """
    @summary: Module-scoped fixture that gathers facts, prepares the PTF
              container and generates the VxLAN config files on the DUT.
              After the tests it stops arp_responder on the PTF and deletes
              the VXLAN_TUNNEL_MAP and VXLAN_TUNNEL keys from redis database 4.
    @param duthost: DUT host object
    @param ptfhost: PTF host object
    @return: Yields a dict holding the minigraph facts under "mg_facts"
    """
    logger.info("Gather some facts")
    mg_facts = duthost.minigraph_facts(host=duthost.hostname)["ansible_facts"]
    dut_facts = duthost.setup(gather_subset="!all,!any,network", filter="ansible_Ethernet*")["ansible_facts"]
    # NOTE(review): ptf_facts is not read anywhere below - confirm whether this
    # fact-gathering call is still needed before removing it.
    ptf_facts = ptfhost.setup(gather_subset="!all,!any,network")["ansible_facts"]

    logger.info("Prepare PTF")
    prepare_ptf(ptfhost, mg_facts, dut_facts)

    logger.info("Generate VxLAN config files")
    generate_vxlan_config_files(duthost, mg_facts)

    setup_info = {
        "mg_facts": mg_facts
    }

    yield setup_info

    # Everything after the yield runs as teardown once the module's tests finish.
    logger.info("Stop arp_responder on PTF")
    ptfhost.shell("supervisorctl stop arp_responder")

    logger.info("Always try to remove any possible VxLAN tunnel and map configuration")
    for vlan in mg_facts["minigraph_vlans"]:
        duthost.shell('docker exec -i database redis-cli -n 4 -c DEL "VXLAN_TUNNEL_MAP|tunnelVxlan|map%s"' % vlan)
    duthost.shell('docker exec -i database redis-cli -n 4 -c DEL "VXLAN_TUNNEL|tunnelVxlan"')
@pytest.fixture(params=["NoVxLAN", "Enabled", "Removed"])
def vxlan_status(setup, request, duthost):
    """
    @summary: Put the DUT into the VxLAN state named by the fixture parameter.
              "Enabled" writes the generated tunnel and map json files to the
              DB, "Removed" deletes the tunnel map and tunnel keys again, and
              "NoVxLAN" leaves the DUT untouched.
    @param setup: Module setup fixture providing the minigraph facts
    @param request: pytest request object carrying the scenario name
    @param duthost: DUT host object
    @return: Tuple of (vxlan_enabled, scenario name)
    """
    scenario = request.param

    if scenario == "Enabled":
        duthost.shell("sonic-cfggen -j /tmp/vxlan_db.tunnel.json --write-to-db")
        duthost.shell("sonic-cfggen -j /tmp/vxlan_db.maps.json --write-to-db")
        return True, scenario

    if scenario == "Removed":
        # Drop every per-VLAN map first, then the tunnel itself.
        vlans = setup["mg_facts"]["minigraph_vlans"]
        for vlan in vlans:
            duthost.shell('docker exec -i database redis-cli -n 4 -c DEL "VXLAN_TUNNEL_MAP|tunnelVxlan|map%s"' % vlan)
        duthost.shell('docker exec -i database redis-cli -n 4 -c DEL "VXLAN_TUNNEL|tunnelVxlan"')

    # Both "Removed" and "NoVxLAN" report VxLAN as disabled.
    return False, scenario


def test_vxlan_decap(setup, vxlan_status, duthost, ptfhost):
    """
    @summary: Run the vxlan-decap.Vxlan PTF test for the current scenario.
    @param setup: Module setup fixture (requested for its side effects)
    @param vxlan_status: Tuple of (vxlan_enabled, scenario name)
    @param duthost: DUT host object
    @param ptfhost: PTF host object
    """
    vxlan_enabled, scenario = vxlan_status
    logger.info("vxlan_enabled=%s, scenario=%s" % (vxlan_enabled, scenario))

    # One log file per scenario and per run, distinguished by a timestamp.
    timestamp = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    log_file = "/tmp/vxlan-decap.Vxlan.{}.{}.log".format(scenario, timestamp)

    ptf_params = {
        "vxlan_enabled": vxlan_enabled,
        "config_file": '/tmp/vxlan_decap.json',
        "count": COUNT,
    }
    ptf_runner(ptfhost,
               "ptftests",
               "vxlan-decap.Vxlan",
               platform_dir="ptftests",
               params=ptf_params,
               qlen=1000,
               log_file=log_file)