From be3ad7c5bdc287ac7da68a53976c947ba0b39c41 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Fri, 25 Jan 2019 18:56:42 +0100 Subject: [PATCH 01/23] Implement agent_gpu profile extending agent --- .../files/etc/lightdm/lightdm.conf | 2 + .../agent_files/files/etc/lightdm/xhost.sh | 2 + .../profile/manifests/jenkins/agent_gpu.pp | 86 +++++++++++++++++++ modules/profile/manifests/ros/base.pp | 2 +- modules/role/manifests/buildfarm/agent_gpu.pp | 6 ++ 5 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 modules/agent_files/files/etc/lightdm/lightdm.conf create mode 100644 modules/agent_files/files/etc/lightdm/xhost.sh create mode 100644 modules/profile/manifests/jenkins/agent_gpu.pp create mode 100644 modules/role/manifests/buildfarm/agent_gpu.pp diff --git a/modules/agent_files/files/etc/lightdm/lightdm.conf b/modules/agent_files/files/etc/lightdm/lightdm.conf new file mode 100644 index 00000000..261f6081 --- /dev/null +++ b/modules/agent_files/files/etc/lightdm/lightdm.conf @@ -0,0 +1,2 @@ +[SeatDefaults] +display-setup-script=/etc/lightdm/xhost.sh diff --git a/modules/agent_files/files/etc/lightdm/xhost.sh b/modules/agent_files/files/etc/lightdm/xhost.sh new file mode 100644 index 00000000..f455d2cd --- /dev/null +++ b/modules/agent_files/files/etc/lightdm/xhost.sh @@ -0,0 +1,2 @@ +#!/bin/sh +xhost +si:localuser:jenkins diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp new file mode 100644 index 00000000..419dffe5 --- /dev/null +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -0,0 +1,86 @@ +# Jenkins Agent Profile +# +# Profile class for a node configured to act as a swarm agent for Jenkins. +# This profile should only ever be declared with an include into a role or site manifest. +# Parameter overloading should be done using hiera automatic parameter lookup. 
+# +# @example +# include profile::jenkins::master +# +# @pararm agent_username The unix user the agent will configure and run as. +class profile::jenkins::agent_gpu { + + include apt + + package { 'nvidia-375': + ensure => installed, + } + + package { 'wget': + ensure => installed, + } + + exec { 'apt-update': + command => "/usr/bin/apt-get update" + } + + exec { 'retrieve_docker_repo': + command => '/usr/bin/wget -q https://nvidia.github.io/nvidia-docker/ubuntu16.04/nvidia-docker.list -O /etc/apt/sources.list.d/nvidia-docker.list', + creates => '/etc/apt/sources.list.d/nvidia-docker.list', + require => Package['wget'], + } + + apt::key { 'nvidia_docker_key' : + source => 'https://nvidia.github.io/nvidia-docker/gpgkey', + id => 'C95B321B61E88C1809C4F759DDCAE044F796ECB0', + } + + package { 'nvidia-docker2': + ensure => installed, + require => [ + Exec['retrieve_docker_repo'], + Apt::Key['nvidia_docker_key'], + Exec['apt-update'] + ], + } + + package { 'lightdm': + ensure => installed, + } + + file { '/etc/lightdm/xhost.sh': + source => 'puppet:///modules/agent_files/etc/lightdm/xhost.sh', + mode => '0744', + require => Package[lightdm], + notify => Exec[service_lightdm_restart], + } + + # This two rules do: check if no lightdm is present and create one + # Ensure that display-setup-script is set + + file { '/etc/lightdm/lightdm.conf': + ensure => 'present', + source => 'puppet:///modules/agent_files/etc/lightdm/lightdm.conf', + replace => 'no', # this is the important property + require => File['/etc/lightdm/xhost.sh'] + } + + file_line { '/etc/lightdm/lightdm.conf': + ensure => present, + require => File['/etc/lightdm/lightdm.conf'], + line => 'display-setup-script=/etc/lightdm/xhost.sh', + path => '/etc/lightdm/lightdm.conf', + } + + service { 'lightdm': + ensure => running, + enable => true, + hasrestart => true, + } + + exec { 'service_lightdm_restart': + refreshonly => true, + command => '/usr/sbin/service lightdm restart', + require => [ Package['lightdm'], 
File['/etc/lightdm/lightdm.conf'] ] + } +} diff --git a/modules/profile/manifests/ros/base.pp b/modules/profile/manifests/ros/base.pp index a0d8de50..415242e5 100644 --- a/modules/profile/manifests/ros/base.pp +++ b/modules/profile/manifests/ros/base.pp @@ -42,7 +42,7 @@ $defaults = { 'ensure' => 'present', } - create_resources(ssh_authorized_key, hiera('ssh_keys'), $defaults) + # create_resources(ssh_authorized_key, hiera('ssh_keys'), $defaults) } else{ notice("No ssh_keys defined. You should probably have at least one.") diff --git a/modules/role/manifests/buildfarm/agent_gpu.pp b/modules/role/manifests/buildfarm/agent_gpu.pp new file mode 100644 index 00000000..a015e4f7 --- /dev/null +++ b/modules/role/manifests/buildfarm/agent_gpu.pp @@ -0,0 +1,6 @@ +class role::buildfarm::agent_gpu { + # Find the other instances + include profile::ros::base + include profile::jenkins::agent + include profile::jenkins::agent_gpu +} From 780f9c461b14f563376075c5d55aaa1963a406b5 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Fri, 25 Jan 2019 21:23:35 +0100 Subject: [PATCH 02/23] Fix jenkins username --- modules/agent_files/files/etc/lightdm/xhost.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/agent_files/files/etc/lightdm/xhost.sh b/modules/agent_files/files/etc/lightdm/xhost.sh index f455d2cd..1e36935c 100644 --- a/modules/agent_files/files/etc/lightdm/xhost.sh +++ b/modules/agent_files/files/etc/lightdm/xhost.sh @@ -1,2 +1,2 @@ #!/bin/sh -xhost +si:localuser:jenkins +xhost +si:localuser:jenkins-agent From 7908d42f36f0ba8b7bdd0b71075961cd0fad6e72 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Fri, 25 Jan 2019 21:24:31 +0100 Subject: [PATCH 03/23] Fixes to get real support on AWS machines --- modules/agent_files/files/etc/X11/xorg.conf | 52 +++++++++++++++++++ .../profile/manifests/jenkins/agent_gpu.pp | 35 +++++++++++-- 2 files changed, 83 insertions(+), 4 deletions(-) create mode 100644 modules/agent_files/files/etc/X11/xorg.conf diff 
--git a/modules/agent_files/files/etc/X11/xorg.conf b/modules/agent_files/files/etc/X11/xorg.conf new file mode 100644 index 00000000..be73a101 --- /dev/null +++ b/modules/agent_files/files/etc/X11/xorg.conf @@ -0,0 +1,52 @@ +Section "ServerLayout" + Identifier "Layout0" + InputDevice "Keyboard0" "CoreKeyboard" + InputDevice "Mouse0" "CorePointer" +EndSection + +Section "Files" +EndSection + +Section "InputDevice" + # generated from default + Identifier "Mouse0" + Driver "mouse" + Option "Protocol" "auto" + Option "Device" "/dev/psaux" + Option "Emulate3Buttons" "no" + Option "ZAxisMapping" "4 5" +EndSection + +Section "InputDevice" + # generated from default + Identifier "Keyboard0" + Driver "kbd" +EndSection + +Section "Monitor" + Identifier "Monitor0" + VendorName "Unknown" + ModelName "Unknown" + HorizSync 28.0 - 33.0 + VertRefresh 43.0 - 72.0 + Option "DPMS" +EndSection + +# jrivero: be sure of using always the right BusID +# can be obtained by: nvidia-xconfig --query-gpu-info +# The current BusID is the one corresponding to GRID K520 +# at AWS machines of the g2 series. 
+Section "Device" + Identifier "Device0" + Driver "nvidia" + VendorName "NVIDIA Corporation" + BoardName "GRID K520" + BusID "PCI:0:3:0" +EndSection + +Section "Screen" + Identifier "Default Screen" + Device "Device0" + Monitor "Monitor0" + Option "AllowEmptyInitialConfiguration" "True" +EndSection diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 419dffe5..4db00e4e 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -12,10 +12,37 @@ include apt - package { 'nvidia-375': + # neeed for xhost + package { 'x11-xserver-utils' : + ensure => installed, + } + + # needed for gpu-manager used by nvidia-prime + package { 'ubuntu-drivers-common': + ensure => installed, + } + + package { 'linux-aws': + ensure => installed, + } + + package { 'xserver-xorg-dev': ensure => installed, } + # needs to update first the kernel and headers before + # compiling the nvidia driver + package { 'nvidia-375': + ensure => installed, + require => [ Package[linux-aws], Package[ubuntu-drivers-common] ] + } + + file { '/etc/X11/xorg.conf': + source => 'puppet:///modules/agent_files/etc/X11/xorg.conf', + mode => '0744', + require => Package[xserver-xorg-dev], + } + package { 'wget': ensure => installed, } @@ -51,8 +78,7 @@ file { '/etc/lightdm/xhost.sh': source => 'puppet:///modules/agent_files/etc/lightdm/xhost.sh', mode => '0744', - require => Package[lightdm], - notify => Exec[service_lightdm_restart], + require => [ Package[lightdm], Package[x11-xserver-utils] ] } # This two rules do: check if no lightdm is present and create one @@ -62,7 +88,8 @@ ensure => 'present', source => 'puppet:///modules/agent_files/etc/lightdm/lightdm.conf', replace => 'no', # this is the important property - require => File['/etc/lightdm/xhost.sh'] + notify => Exec[service_lightdm_restart], + require => [ File['/etc/lightdm/xhost.sh'], File['/etc/X11/xorg.conf'] ] } file_line { 
'/etc/lightdm/lightdm.conf': From d81a613f0e566a060c01a4d05dee8380c751ae6b Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Thu, 31 Jan 2019 18:55:26 +0100 Subject: [PATCH 04/23] Be sure that xorg.conf is not overwritten --- modules/profile/manifests/jenkins/agent_gpu.pp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 4db00e4e..5a838d7a 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -15,19 +15,23 @@ # neeed for xhost package { 'x11-xserver-utils' : ensure => installed, + before => File['/etc/X11/xorg.conf'] } # needed for gpu-manager used by nvidia-prime package { 'ubuntu-drivers-common': ensure => installed, + before => File['/etc/X11/xorg.conf'] } package { 'linux-aws': ensure => installed, + before => File['/etc/X11/xorg.conf'] } package { 'xserver-xorg-dev': ensure => installed, + before => File['/etc/X11/xorg.conf'] } # needs to update first the kernel and headers before @@ -35,6 +39,7 @@ package { 'nvidia-375': ensure => installed, require => [ Package[linux-aws], Package[ubuntu-drivers-common] ] + before => File['/etc/X11/xorg.conf'] } file { '/etc/X11/xorg.conf': @@ -73,6 +78,7 @@ package { 'lightdm': ensure => installed, + before => File['/etc/X11/xorg.conf'] } file { '/etc/lightdm/xhost.sh': @@ -108,6 +114,6 @@ exec { 'service_lightdm_restart': refreshonly => true, command => '/usr/sbin/service lightdm restart', - require => [ Package['lightdm'], File['/etc/lightdm/lightdm.conf'] ] + require => [ Package['lightdm'], File['/etc/lightdm/lightdm.conf'], File['/etc/X11/xorg.conf'] ] } } From f49be15b139a78cc7ebdc9a7d5fc6ee86efc3d85 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Tue, 5 Feb 2019 16:36:55 +0100 Subject: [PATCH 05/23] Fix syntax typo --- modules/profile/manifests/jenkins/agent_gpu.pp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 5a838d7a..f23850dd 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -38,8 +38,8 @@ # compiling the nvidia driver package { 'nvidia-375': ensure => installed, - require => [ Package[linux-aws], Package[ubuntu-drivers-common] ] - before => File['/etc/X11/xorg.conf'] + require => [ Package[linux-aws], Package[ubuntu-drivers-common] ], + before => File['/etc/X11/xorg.conf'], } file { '/etc/X11/xorg.conf': From 95add6ee2b37f115321ca3ddc57f4a9095953272 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Mon, 11 Feb 2019 16:30:40 +0100 Subject: [PATCH 06/23] Do not install ubuntu-drivers-common --- modules/profile/manifests/jenkins/agent_gpu.pp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index f23850dd..2e6181a7 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -19,10 +19,10 @@ } # needed for gpu-manager used by nvidia-prime - package { 'ubuntu-drivers-common': - ensure => installed, - before => File['/etc/X11/xorg.conf'] - } + # package { 'ubuntu-drivers-common': + # ensure => installed, + # before => File['/etc/X11/xorg.conf'] + # } package { 'linux-aws': ensure => installed, @@ -38,7 +38,7 @@ # compiling the nvidia driver package { 'nvidia-375': ensure => installed, - require => [ Package[linux-aws], Package[ubuntu-drivers-common] ], + require => Package[linux-aws], # [, Package[ubuntu-drivers-common] ], before => File['/etc/X11/xorg.conf'], } From 5840a7a4c87179fd98037f4f3b7657344e2b3ebc Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Mon, 11 Feb 2019 17:16:27 +0100 Subject: [PATCH 07/23] Use require in lightdm service --- modules/profile/manifests/jenkins/agent_gpu.pp | 18 
++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 2e6181a7..a1c8ebdb 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -18,12 +18,6 @@ before => File['/etc/X11/xorg.conf'] } - # needed for gpu-manager used by nvidia-prime - # package { 'ubuntu-drivers-common': - # ensure => installed, - # before => File['/etc/X11/xorg.conf'] - # } - package { 'linux-aws': ensure => installed, before => File['/etc/X11/xorg.conf'] @@ -38,7 +32,7 @@ # compiling the nvidia driver package { 'nvidia-375': ensure => installed, - require => Package[linux-aws], # [, Package[ubuntu-drivers-common] ], + require => Package[linux-aws], before => File['/etc/X11/xorg.conf'], } @@ -107,13 +101,13 @@ service { 'lightdm': ensure => running, + require => [ Package['lightdm'], File['/etc/lightdm/xhost.sh'], File['/etc/lightdm/lightdm.conf'], File['/etc/X11/xorg.conf'] ], enable => true, hasrestart => true, } - exec { 'service_lightdm_restart': - refreshonly => true, - command => '/usr/sbin/service lightdm restart', - require => [ Package['lightdm'], File['/etc/lightdm/lightdm.conf'], File['/etc/X11/xorg.conf'] ] - } + # exec { 'service_lightdm_restart': + # refreshonly => true, + # command => '/usr/sbin/service lightdm restart', + # } } From b8a27f60b956bb82ff0e962ac816e273edf0ff36 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Mon, 11 Feb 2019 17:52:12 +0100 Subject: [PATCH 08/23] Need to restart the service --- modules/profile/manifests/jenkins/agent_gpu.pp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index a1c8ebdb..45454ef4 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -88,7 +88,6 @@ ensure => 
'present', source => 'puppet:///modules/agent_files/etc/lightdm/lightdm.conf', replace => 'no', # this is the important property - notify => Exec[service_lightdm_restart], require => [ File['/etc/lightdm/xhost.sh'], File['/etc/X11/xorg.conf'] ] } @@ -96,18 +95,19 @@ ensure => present, require => File['/etc/lightdm/lightdm.conf'], line => 'display-setup-script=/etc/lightdm/xhost.sh', + notify => Exec[service_lightdm_restart], path => '/etc/lightdm/lightdm.conf', } service { 'lightdm': ensure => running, - require => [ Package['lightdm'], File['/etc/lightdm/xhost.sh'], File['/etc/lightdm/lightdm.conf'], File['/etc/X11/xorg.conf'] ], enable => true, hasrestart => true, } - # exec { 'service_lightdm_restart': - # refreshonly => true, - # command => '/usr/sbin/service lightdm restart', - # } + exec { 'service_lightdm_restart': + refreshonly => true, + command => '/usr/sbin/service lightdm restart', + require => [ Package['lightdm'], File['/etc/lightdm/xhost.sh'], File['/etc/lightdm/lightdm.conf'], File['/etc/X11/xorg.conf'] ], + } } From cf8f8738ab3ca45e1d3fe2e695c5adedbb8ee9a7 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Mon, 11 Feb 2019 19:15:12 +0100 Subject: [PATCH 09/23] Explicit order for lightdm --- modules/profile/manifests/jenkins/agent_gpu.pp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 45454ef4..57f0fe60 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -73,13 +73,13 @@ package { 'lightdm': ensure => installed, before => File['/etc/X11/xorg.conf'] - } + } -> file { '/etc/lightdm/xhost.sh': source => 'puppet:///modules/agent_files/etc/lightdm/xhost.sh', mode => '0744', require => [ Package[lightdm], Package[x11-xserver-utils] ] - } + } -> # This two rules do: check if no lightdm is present and create one # Ensure that display-setup-script is set 
@@ -89,7 +89,7 @@ source => 'puppet:///modules/agent_files/etc/lightdm/lightdm.conf', replace => 'no', # this is the important property require => [ File['/etc/lightdm/xhost.sh'], File['/etc/X11/xorg.conf'] ] - } + } -> file_line { '/etc/lightdm/lightdm.conf': ensure => present, @@ -97,6 +97,12 @@ line => 'display-setup-script=/etc/lightdm/xhost.sh', notify => Exec[service_lightdm_restart], path => '/etc/lightdm/lightdm.conf', + } -> + + exec { 'service_lightdm_restart': + refreshonly => true, + command => '/usr/sbin/service lightdm restart', + require => [ Package['lightdm'], File['/etc/lightdm/xhost.sh'], File['/etc/lightdm/lightdm.conf'], File['/etc/X11/xorg.conf'] ], } service { 'lightdm': @@ -104,10 +110,4 @@ enable => true, hasrestart => true, } - - exec { 'service_lightdm_restart': - refreshonly => true, - command => '/usr/sbin/service lightdm restart', - require => [ Package['lightdm'], File['/etc/lightdm/xhost.sh'], File['/etc/lightdm/lightdm.conf'], File['/etc/X11/xorg.conf'] ], - } } From d092226213118c6250c70d005a546940f9fc4138 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Mon, 25 Feb 2019 18:27:46 +0100 Subject: [PATCH 10/23] Add debug to the script --- modules/agent_files/files/etc/lightdm/xhost.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/agent_files/files/etc/lightdm/xhost.sh b/modules/agent_files/files/etc/lightdm/xhost.sh index 1e36935c..d3d5dd51 100644 --- a/modules/agent_files/files/etc/lightdm/xhost.sh +++ b/modules/agent_files/files/etc/lightdm/xhost.sh @@ -1,2 +1,3 @@ #!/bin/sh xhost +si:localuser:jenkins-agent +touch /tmp/xhost_`date +"%T"` From 2c9422013ea49ada759acf122acbc7f03e678225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Wed, 3 Apr 2019 09:01:32 -0700 Subject: [PATCH 11/23] Update nvidia-docker2 package configuration. * Use a local copy of the nvidia sources file. Since http/https sources weren't introduced until Puppet 4.4, we can't use them. 
And the wget approach is a decent fallback but I'd rather just vendor the sources file and deal with linkrot of the apt repositories rather than linkrot of the sources and the apt repositories. * Notify Exec['apt_update'] from the apt package after the source has been added. This should resolve race conditions between the apt source, apt key, and apt update execution and means that the nvidia-docker2 package need only depend on the proper source being configured. --- .../jenkins/agent_gpu/nvidia-docker.list | 3 +++ .../profile/manifests/jenkins/agent_gpu.pp | 26 +++++-------------- 2 files changed, 10 insertions(+), 19 deletions(-) create mode 100644 modules/profile/files/jenkins/agent_gpu/nvidia-docker.list diff --git a/modules/profile/files/jenkins/agent_gpu/nvidia-docker.list b/modules/profile/files/jenkins/agent_gpu/nvidia-docker.list new file mode 100644 index 00000000..1c9d6fb8 --- /dev/null +++ b/modules/profile/files/jenkins/agent_gpu/nvidia-docker.list @@ -0,0 +1,3 @@ +deb https://nvidia.github.io/libnvidia-container/ubuntu16.04/$(ARCH) / +deb https://nvidia.github.io/nvidia-container-runtime/ubuntu16.04/$(ARCH) / +deb https://nvidia.github.io/nvidia-docker/ubuntu16.04/$(ARCH) / diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 57f0fe60..b82a6104 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -42,32 +42,20 @@ require => Package[xserver-xorg-dev], } - package { 'wget': - ensure => installed, - } - - exec { 'apt-update': - command => "/usr/bin/apt-get update" - } - - exec { 'retrieve_docker_repo': - command => '/usr/bin/wget -q https://nvidia.github.io/nvidia-docker/ubuntu16.04/nvidia-docker.list -O /etc/apt/sources.list.d/nvidia-docker.list', - creates => '/etc/apt/sources.list.d/nvidia-docker.list', - require => Package['wget'], - } - apt::key { 'nvidia_docker_key' : source => 'https://nvidia.github.io/nvidia-docker/gpgkey', 
id => 'C95B321B61E88C1809C4F759DDCAE044F796ECB0', } + file { '/etc/apt/sources.list.d/nvidia-docker.list': + source => 'puppet:///modules/profile/jenkins/agent_gpu/nvidia-docker.list', + require => Apt::Key['nvidia_docker_key'], + notify => Exec['apt_update'] + } + package { 'nvidia-docker2': ensure => installed, - require => [ - Exec['retrieve_docker_repo'], - Apt::Key['nvidia_docker_key'], - Exec['apt-update'] - ], + require => File['/etc/apt/sources.list.d/nvidia-docker.list'] } package { 'lightdm': From cc7d29ed9b0fcb4ca7349c0bc4a20761a45c515e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Wed, 3 Apr 2019 12:15:42 -0700 Subject: [PATCH 12/23] Avoid use of legacy 'agent_files' module. Going forward we should store files in the module they're used. --- .../files/jenkins/agent_gpu}/etc/X11/xorg.conf | 0 .../files/jenkins/agent_gpu}/etc/lightdm/lightdm.conf | 0 .../files/jenkins/agent_gpu}/etc/lightdm/xhost.sh | 0 modules/profile/manifests/jenkins/agent_gpu.pp | 6 +++--- 4 files changed, 3 insertions(+), 3 deletions(-) rename modules/{agent_files/files => profile/files/jenkins/agent_gpu}/etc/X11/xorg.conf (100%) rename modules/{agent_files/files => profile/files/jenkins/agent_gpu}/etc/lightdm/lightdm.conf (100%) rename modules/{agent_files/files => profile/files/jenkins/agent_gpu}/etc/lightdm/xhost.sh (100%) diff --git a/modules/agent_files/files/etc/X11/xorg.conf b/modules/profile/files/jenkins/agent_gpu/etc/X11/xorg.conf similarity index 100% rename from modules/agent_files/files/etc/X11/xorg.conf rename to modules/profile/files/jenkins/agent_gpu/etc/X11/xorg.conf diff --git a/modules/agent_files/files/etc/lightdm/lightdm.conf b/modules/profile/files/jenkins/agent_gpu/etc/lightdm/lightdm.conf similarity index 100% rename from modules/agent_files/files/etc/lightdm/lightdm.conf rename to modules/profile/files/jenkins/agent_gpu/etc/lightdm/lightdm.conf diff --git a/modules/agent_files/files/etc/lightdm/xhost.sh 
b/modules/profile/files/jenkins/agent_gpu/etc/lightdm/xhost.sh similarity index 100% rename from modules/agent_files/files/etc/lightdm/xhost.sh rename to modules/profile/files/jenkins/agent_gpu/etc/lightdm/xhost.sh diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index b82a6104..7f69042d 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -37,7 +37,7 @@ } file { '/etc/X11/xorg.conf': - source => 'puppet:///modules/agent_files/etc/X11/xorg.conf', + source => 'puppet:///modules/profile/jenkins/agent_gpu/etc/X11/xorg.conf', mode => '0744', require => Package[xserver-xorg-dev], } @@ -64,7 +64,7 @@ } -> file { '/etc/lightdm/xhost.sh': - source => 'puppet:///modules/agent_files/etc/lightdm/xhost.sh', + source => 'puppet:///modules/profile/jenkins/agent_gpu/etc/lightdm/xhost.sh', mode => '0744', require => [ Package[lightdm], Package[x11-xserver-utils] ] } -> @@ -74,7 +74,7 @@ file { '/etc/lightdm/lightdm.conf': ensure => 'present', - source => 'puppet:///modules/agent_files/etc/lightdm/lightdm.conf', + source => 'puppet:///modules/profile/jenkins/agent_gpu/etc/lightdm/lightdm.conf', replace => 'no', # this is the important property require => [ File['/etc/lightdm/xhost.sh'], File['/etc/X11/xorg.conf'] ] } -> From 067f994d2747dde9649e1e7076011c0e5addbd1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Wed, 3 Apr 2019 12:35:58 -0700 Subject: [PATCH 13/23] Remove unneeded ordering arrows. These ordering arrows are redundant with the explicit `require` directive in each resource. In general I prefer to use ordering arrows only in one-liners rather than verbose resources as they get quite hard to follow, especially when comments are interspersed. 
--- modules/profile/manifests/jenkins/agent_gpu.pp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 7f69042d..469a1137 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -61,13 +61,13 @@ package { 'lightdm': ensure => installed, before => File['/etc/X11/xorg.conf'] - } -> + } file { '/etc/lightdm/xhost.sh': source => 'puppet:///modules/profile/jenkins/agent_gpu/etc/lightdm/xhost.sh', mode => '0744', require => [ Package[lightdm], Package[x11-xserver-utils] ] - } -> + } # This two rules do: check if no lightdm is present and create one # Ensure that display-setup-script is set @@ -77,7 +77,7 @@ source => 'puppet:///modules/profile/jenkins/agent_gpu/etc/lightdm/lightdm.conf', replace => 'no', # this is the important property require => [ File['/etc/lightdm/xhost.sh'], File['/etc/X11/xorg.conf'] ] - } -> + } file_line { '/etc/lightdm/lightdm.conf': ensure => present, @@ -85,7 +85,7 @@ line => 'display-setup-script=/etc/lightdm/xhost.sh', notify => Exec[service_lightdm_restart], path => '/etc/lightdm/lightdm.conf', - } -> + } exec { 'service_lightdm_restart': refreshonly => true, From 1ce7cb742071ccafa18dd3eaa06d78fd6b7016e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Wed, 3 Apr 2019 12:48:24 -0700 Subject: [PATCH 14/23] Notify service directly rather than via exec. Notifying a service triggers a refresh or restart depending on the service's capabilities. This should be sufficient to trigger a restart. 
--- modules/profile/manifests/jenkins/agent_gpu.pp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 469a1137..6d92d99c 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -83,16 +83,10 @@ ensure => present, require => File['/etc/lightdm/lightdm.conf'], line => 'display-setup-script=/etc/lightdm/xhost.sh', - notify => Exec[service_lightdm_restart], + notify => Service[lightdm], path => '/etc/lightdm/lightdm.conf', } - exec { 'service_lightdm_restart': - refreshonly => true, - command => '/usr/sbin/service lightdm restart', - require => [ Package['lightdm'], File['/etc/lightdm/xhost.sh'], File['/etc/lightdm/lightdm.conf'], File['/etc/X11/xorg.conf'] ], - } - service { 'lightdm': ensure => running, enable => true, From 715f3430e7309d23be4455d2f1c439950095f514 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Wed, 3 Apr 2019 12:53:08 -0700 Subject: [PATCH 15/23] Only install linux-aws when on an EC2 instance. The linux-aws kernel is only necessary when using an EC2 instance. In other systems that kernel may not be reasonable. I'm not sure if other virtual systems will have specific kernel requirements as I don't have any nvidia GPU + virtualizable systems to test on. But that can be work for future PRs. 
--- modules/profile/manifests/jenkins/agent_gpu.pp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 6d92d99c..0435fb3d 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -18,9 +18,15 @@ before => File['/etc/X11/xorg.conf'] } - package { 'linux-aws': - ensure => installed, - before => File['/etc/X11/xorg.conf'] + if $facts['ec2_instance_id'] { + package { 'linux-aws': + ensure => installed, + # When running in EC2 the AWS kernel needs to be installed before + # compiling the nvidia driver. + # TODO(nuclearsandwich) Does the xorg.conf really depend on the kernel or + # is it implicit based on drivers? + before => [ File['/etc/X11/xorg.conf'], Package['nvidia-375'] ] + } } package { 'xserver-xorg-dev': @@ -32,7 +38,6 @@ # compiling the nvidia driver package { 'nvidia-375': ensure => installed, - require => Package[linux-aws], before => File['/etc/X11/xorg.conf'], } From 03d45d90353acb9aa57a90da867bb30b12865c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Wed, 3 Apr 2019 13:04:20 -0700 Subject: [PATCH 16/23] Remove double-declared dependency. This dependency is established using both before and require on the respective resources. Where both resources are unconditionally declared require is preferred. 
--- modules/profile/manifests/jenkins/agent_gpu.pp | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 0435fb3d..fdbca0b9 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -31,7 +31,6 @@ package { 'xserver-xorg-dev': ensure => installed, - before => File['/etc/X11/xorg.conf'] } # needs to update first the kernel and headers before From d17584cca5da56f097bf6bcea73e1d580e158906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Wed, 3 Apr 2019 13:08:11 -0700 Subject: [PATCH 17/23] Prefer using require to before. When two resources are both installed unconditionally, I arbitrarily prefer declaring the dependency with `require` rather than `before`. Using `before` is still needed in cases where a dependency needs to be declared on a resource that is only installed conditionally. An example is the linux-aws kernel needing to be available before nvidia-375 when on an EC2 instance. 
--- modules/profile/manifests/jenkins/agent_gpu.pp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index fdbca0b9..93ff203f 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -15,7 +15,6 @@ # neeed for xhost package { 'x11-xserver-utils' : ensure => installed, - before => File['/etc/X11/xorg.conf'] } if $facts['ec2_instance_id'] { @@ -37,13 +36,17 @@ # compiling the nvidia driver package { 'nvidia-375': ensure => installed, - before => File['/etc/X11/xorg.conf'], } file { '/etc/X11/xorg.conf': source => 'puppet:///modules/profile/jenkins/agent_gpu/etc/X11/xorg.conf', mode => '0744', - require => Package[xserver-xorg-dev], + require => [ + Package[lightdm], + Package['nvidia-375'], + Package['x11-xserver-utils'], + Package[xserver-xorg-dev], + ], } apt::key { 'nvidia_docker_key' : @@ -64,7 +67,6 @@ package { 'lightdm': ensure => installed, - before => File['/etc/X11/xorg.conf'] } file { '/etc/lightdm/xhost.sh': From 8724994eb4c8cdf33f8a7ba9601b6816cf7b84c8 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Fri, 5 Apr 2019 18:13:18 +0200 Subject: [PATCH 18/23] Transform xorg.conf into a template to get busid value from nvidia-xconfig --- .../X11/xorg.conf => agent_files/templates/xorg.conf.erb} | 6 +----- modules/profile/manifests/jenkins/agent_gpu.pp | 4 +++- 2 files changed, 4 insertions(+), 6 deletions(-) rename modules/{profile/files/jenkins/agent_gpu/etc/X11/xorg.conf => agent_files/templates/xorg.conf.erb} (83%) diff --git a/modules/profile/files/jenkins/agent_gpu/etc/X11/xorg.conf b/modules/agent_files/templates/xorg.conf.erb similarity index 83% rename from modules/profile/files/jenkins/agent_gpu/etc/X11/xorg.conf rename to modules/agent_files/templates/xorg.conf.erb index be73a101..c47240a0 100644 --- a/modules/profile/files/jenkins/agent_gpu/etc/X11/xorg.conf +++ 
b/modules/agent_files/templates/xorg.conf.erb @@ -32,16 +32,12 @@ Section "Monitor" Option "DPMS" EndSection -# jrivero: be sure of using always the right BusID -# can be obtained by: nvidia-xconfig --query-gpu-info -# The current BusID is the one corresponding to GRID K520 -# at AWS machines of the g2 series. Section "Device" Identifier "Device0" Driver "nvidia" VendorName "NVIDIA Corporation" BoardName "GRID K520" - BusID "PCI:0:3:0" + BusID <%=@busid%> EndSection Section "Screen" diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 93ff203f..2b61d8c7 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -38,8 +38,10 @@ ensure => installed, } + $busid = generate('nvidia-xconfig --query-gpu-info | grep BusID | sed "s/.*PCI:/PCI:/g"') + file { '/etc/X11/xorg.conf': - source => 'puppet:///modules/profile/jenkins/agent_gpu/etc/X11/xorg.conf', + content => template('agent_files/xorg.conf.erb'), mode => '0744', require => [ Package[lightdm], From 18e06402b4ce2e9bf3dab43619646363e2885ce0 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Mon, 8 Apr 2019 18:02:50 +0200 Subject: [PATCH 19/23] Fix facter implementation --- modules/agent_files/templates/xorg.conf.erb | 2 +- modules/facts/lib/facter/busid.rb | 5 +++++ modules/profile/manifests/jenkins/agent_gpu.pp | 2 -- 3 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 modules/facts/lib/facter/busid.rb diff --git a/modules/agent_files/templates/xorg.conf.erb b/modules/agent_files/templates/xorg.conf.erb index c47240a0..e52fe9c0 100644 --- a/modules/agent_files/templates/xorg.conf.erb +++ b/modules/agent_files/templates/xorg.conf.erb @@ -37,7 +37,7 @@ Section "Device" Driver "nvidia" VendorName "NVIDIA Corporation" BoardName "GRID K520" - BusID <%=@busid%> + BusID <%= @facts['busid'] %> EndSection Section "Screen" diff --git a/modules/facts/lib/facter/busid.rb 
b/modules/facts/lib/facter/busid.rb new file mode 100644 index 00000000..80181b12 --- /dev/null +++ b/modules/facts/lib/facter/busid.rb @@ -0,0 +1,5 @@ +Facter.add(:busid) do + setcode do + Facter::Core::Execution.execute('nvidia-xconfig --query-gpu-info | grep BusID | sed "s/.*PCI:/PCI:/g"') + end +end diff --git a/modules/profile/manifests/jenkins/agent_gpu.pp b/modules/profile/manifests/jenkins/agent_gpu.pp index 2b61d8c7..0f67125c 100644 --- a/modules/profile/manifests/jenkins/agent_gpu.pp +++ b/modules/profile/manifests/jenkins/agent_gpu.pp @@ -38,8 +38,6 @@ ensure => installed, } - $busid = generate('nvidia-xconfig --query-gpu-info | grep BusID | sed "s/.*PCI:/PCI:/g"') - file { '/etc/X11/xorg.conf': content => template('agent_files/xorg.conf.erb'), mode => '0744', From 927037d58b064ad7201698bfcfc2c7411ee5d381 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Mon, 8 Apr 2019 18:11:44 +0200 Subject: [PATCH 20/23] value needs quotes --- modules/agent_files/templates/xorg.conf.erb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/agent_files/templates/xorg.conf.erb b/modules/agent_files/templates/xorg.conf.erb index e52fe9c0..aee83316 100644 --- a/modules/agent_files/templates/xorg.conf.erb +++ b/modules/agent_files/templates/xorg.conf.erb @@ -37,7 +37,7 @@ Section "Device" Driver "nvidia" VendorName "NVIDIA Corporation" BoardName "GRID K520" - BusID <%= @facts['busid'] %> + BusID "<%= @facts['busid'] %>" EndSection Section "Screen" From 3f32d61aef1436912f127e974f9dec054b4eaa14 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Fri, 26 Apr 2019 16:16:33 +0200 Subject: [PATCH 21/23] Change busid by gpu_device_bus_id --- modules/agent_files/templates/xorg.conf.erb | 2 +- modules/facts/lib/facter/busid.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/agent_files/templates/xorg.conf.erb b/modules/agent_files/templates/xorg.conf.erb index aee83316..a53865d9 100644 --- 
a/modules/agent_files/templates/xorg.conf.erb +++ b/modules/agent_files/templates/xorg.conf.erb @@ -37,7 +37,7 @@ Section "Device" Driver "nvidia" VendorName "NVIDIA Corporation" BoardName "GRID K520" - BusID "<%= @facts['busid'] %>" + BusID "<%= @facts['gpu_device_bus_id'] %>" EndSection Section "Screen" diff --git a/modules/facts/lib/facter/busid.rb b/modules/facts/lib/facter/busid.rb index 80181b12..e05ca3a8 100644 --- a/modules/facts/lib/facter/busid.rb +++ b/modules/facts/lib/facter/busid.rb @@ -1,4 +1,4 @@ -Facter.add(:busid) do +Facter.add(:gpu_device_bus_id) do setcode do Facter::Core::Execution.execute('nvidia-xconfig --query-gpu-info | grep BusID | sed "s/.*PCI:/PCI:/g"') end From b92729a6f47fd4b896b3e4f1ba84672c05cd0e16 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Fri, 3 May 2019 21:26:06 +0200 Subject: [PATCH 22/23] Revert the use of facter to get the nvidia-xconfig value ERB files are evaluated before puppet manages the dependency resolution of the manifests so no nvidia-xconfig is available when the ERB file is being parsed --- modules/agent_files/templates/xorg.conf.erb | 10 +++++++++- modules/facts/lib/facter/busid.rb | 5 ----- 2 files changed, 9 insertions(+), 6 deletions(-) delete mode 100644 modules/facts/lib/facter/busid.rb diff --git a/modules/agent_files/templates/xorg.conf.erb b/modules/agent_files/templates/xorg.conf.erb index a53865d9..32e1a63c 100644 --- a/modules/agent_files/templates/xorg.conf.erb +++ b/modules/agent_files/templates/xorg.conf.erb @@ -32,12 +32,20 @@ Section "Monitor" Option "DPMS" EndSection +# Be sure of using always the right BusID +# can be obtained by: nvidia-xconfig --query-gpu-info +# The current BusID is the one corresponding to GRID K520 +# at AWS machines of the g2 series. +# The nvidia-xconfig command cannot be integrated into +# erb files since they are evaluated before the +# dependency resolution. 
See: +# https://github.com/ros-infrastructure/buildfarm_deployment/pull/211#discussion_r279920242 Section "Device" Identifier "Device0" Driver "nvidia" VendorName "NVIDIA Corporation" BoardName "GRID K520" - BusID "<%= @facts['gpu_device_bus_id'] %>" + BusID "PCI:0:3:0" EndSection Section "Screen" diff --git a/modules/facts/lib/facter/busid.rb b/modules/facts/lib/facter/busid.rb deleted file mode 100644 index e05ca3a8..00000000 --- a/modules/facts/lib/facter/busid.rb +++ /dev/null @@ -1,5 +0,0 @@ -Facter.add(:gpu_device_bus_id) do - setcode do - Facter::Core::Execution.execute('nvidia-xconfig --query-gpu-info | grep BusID | sed "s/.*PCI:/PCI:/g"') - end -end From 220a30f0a7456616f9d17805945ed169e7ffed13 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Tue, 3 Sep 2019 21:01:10 +0200 Subject: [PATCH 23/23] remove debug touch command --- modules/profile/files/jenkins/agent_gpu/etc/lightdm/xhost.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/profile/files/jenkins/agent_gpu/etc/lightdm/xhost.sh b/modules/profile/files/jenkins/agent_gpu/etc/lightdm/xhost.sh index d3d5dd51..1e36935c 100644 --- a/modules/profile/files/jenkins/agent_gpu/etc/lightdm/xhost.sh +++ b/modules/profile/files/jenkins/agent_gpu/etc/lightdm/xhost.sh @@ -1,3 +1,2 @@ #!/bin/sh xhost +si:localuser:jenkins-agent -touch /tmp/xhost_`date +"%T"`