-
Notifications
You must be signed in to change notification settings - Fork 99
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add gpu support #537
Add gpu support #537
Changes from 11 commits
34da274
f9fbeca
50a520f
9e3e600
7b3a826
a3cc729
148b990
ea915fc
0412dac
63a25f1
f301fd9
d019c27
7212e38
262c2ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -206,9 +206,15 @@ object AssetMeta extends Schema with AnormAdapter[AssetMeta] with AssetMetaKeys | |
val BaseProduct = findOrCreateFromName("BASE_PRODUCT") | ||
val BaseVendor = findOrCreateFromName("BASE_VENDOR") | ||
val BaseSerial = findOrCreateFromName("BASE_SERIAL") | ||
val GpuCount = findOrCreateFromName("GPU_COUNT") | ||
val GpuDescription = findOrCreateFromName("GPU_DESCRIPTION") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should have both vendor and description? I believe pci enumeration has fields for both vendor and description? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added product and vendor. It's a bit confusing because lshw refers to the "label" or "description" of the PCI device as the "product". |
||
|
||
def getValues(): Seq[AssetMeta] = { | ||
Seq(BaseDescription, BaseProduct, BaseVendor, BaseSerial) | ||
Seq(BaseDescription, BaseProduct, BaseVendor, BaseSerial, GpuCount, GpuDescription) | ||
} | ||
|
||
/**
 * The dynamic tags listed for LSHW: the four base tags plus the two GPU tags.
 * LshwHelper assigns this set to its `managedDynamicTags`.
 */
def getLshwValues(): Set[AssetMeta] =
  Seq(BaseDescription, BaseProduct, BaseVendor, BaseSerial, GpuCount, GpuDescription).toSet
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ package collins.models | |
import collins.models.AssetMeta.DynamicEnum._ | ||
import collins.models.AssetMeta.Enum._ | ||
import collins.models.lshw.Cpu | ||
import collins.models.lshw.Gpu | ||
import collins.models.lshw.Disk | ||
import collins.models.lshw.Memory | ||
import collins.models.lshw.Nic | ||
|
@@ -34,8 +35,11 @@ object LshwHelper extends CommonHelper[LshwRepresentation] { | |
DiskStorageTotal | ||
) | ||
|
||
override val managedDynamicTags = AssetMeta.DynamicEnum.getLshwValues() | ||
|
||
def construct(asset: Asset, lshw: LshwRepresentation): Seq[AssetMetaValue] = { | ||
collectCpus(asset, lshw) ++ | ||
collectGpus(asset, lshw) ++ | ||
collectMemory(asset, lshw) ++ | ||
collectNics(asset, lshw) ++ | ||
collectDisks(asset, lshw) ++ | ||
|
@@ -45,11 +49,12 @@ object LshwHelper extends CommonHelper[LshwRepresentation] { | |
def reconstruct(asset: Asset, assetMeta: Seq[MetaWrapper]): Reconstruction = { | ||
val metaMap = assetMeta.groupBy { _.getGroupId } | ||
val (cpus,postCpuMap) = reconstructCpu(metaMap) | ||
val (memory,postMemoryMap) = reconstructMemory(postCpuMap) | ||
val (gpus,postGpuMap) = reconstructGpu(postCpuMap) | ||
val (memory,postMemoryMap) = reconstructMemory(postGpuMap) | ||
val (nics,postNicMap) = reconstructNics(postMemoryMap) | ||
val (disks,postDiskMap) = reconstructDisks(postNicMap) | ||
val (base,postBaseMap) = reconstructBase(postDiskMap) | ||
(LshwRepresentation(cpus, memory, nics, disks, base.headOption.getOrElse(ServerBase())), postBaseMap.values.flatten.toSeq) | ||
(LshwRepresentation(cpus, gpus, memory, nics, disks, base.headOption.getOrElse(ServerBase())), postBaseMap.values.flatten.toSeq) | ||
} | ||
|
||
protected def reconstructCpu(meta: Map[Int, Seq[MetaWrapper]]): FilteredSeq[Cpu] = { | ||
|
@@ -72,6 +77,7 @@ object LshwHelper extends CommonHelper[LshwRepresentation] { | |
} | ||
(cpuSeq, filteredMeta) | ||
} | ||
|
||
protected def collectCpus(asset: Asset, lshw: LshwRepresentation): Seq[AssetMetaValue] = { | ||
if (lshw.cpuCount < 1) { | ||
return Seq() | ||
|
@@ -86,6 +92,41 @@ object LshwHelper extends CommonHelper[LshwRepresentation] { | |
) | ||
} | ||
|
||
protected def reconstructGpu(meta: Map[Int, Seq[MetaWrapper]]): FilteredSeq[Gpu] = { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I assume this is copy+paste from another reconstruct function? It's really hard to read :/ There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I modeled There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ideally this would be made more legible to ease the support burden going forward. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will do, I didn't change this with my most recent commit (d019c27) but I'll get this cleaned up. |
||
val gpuSeq = meta.foldLeft(Seq[Gpu]()) { case (seq, map) => | ||
val groupId = map._1 | ||
val wrapSeq = map._2 | ||
val descr = amfinder(wrapSeq, GpuDescription, _.toString, "") | ||
if (descr.isEmpty) { | ||
seq | ||
} else { | ||
Gpu(descr, "", "") +: seq | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are these last 2 params ""? |
||
} | ||
} | ||
val filteredMeta = meta.map { case(groupId, metaSeq) => | ||
val newSeq = filterNot( | ||
metaSeq, | ||
Set(GpuDescription.id) | ||
) | ||
groupId -> newSeq | ||
} | ||
(gpuSeq, filteredMeta) | ||
} | ||
|
||
/**
 * Builds the asset meta values that persist the GPUs found in an LSHW report.
 *
 * Each GPU yields a single GPU_DESCRIPTION value formatted as "<product> - <vendor>",
 * with the GPU's position in `lshw.gpus` (starting at 0) as its group id.
 *
 * @param asset the asset the values are attached to
 * @param lshw the parsed LSHW representation to read GPUs from
 * @return one value per GPU, in the order of `lshw.gpus`; empty when there are no GPUs
 */
protected def collectGpus(asset: Asset, lshw: LshwRepresentation): Seq[AssetMetaValue] = {
  // zipWithIndex numbers the GPUs from 0 and yields nothing for an empty list, so neither
  // an early `return` nor a manually threaded group-id counter is needed.
  lshw.gpus.zipWithIndex.map { case (gpu, groupId) =>
    AssetMetaValue(asset, GpuDescription.id, groupId, "%s - %s".format(gpu.product, gpu.vendor))
  }
}
|
||
protected def reconstructMemory(meta: Map[Int, Seq[MetaWrapper]]): FilteredSeq[Memory] = { | ||
if (!meta.contains(0)) { | ||
return (Seq[Memory](), meta) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package collins.models.lshw | ||
|
||
import play.api.libs.json.Format | ||
import play.api.libs.json.JsObject | ||
import play.api.libs.json.JsSuccess | ||
import play.api.libs.json.JsValue | ||
import play.api.libs.json.Json | ||
|
||
object Gpu {

  /**
   * JSON format for [[Gpu]] using the keys DESCRIPTION, PRODUCT and VENDOR.
   * Reading uses `as[String]` for each key and wraps the result in a JsSuccess.
   */
  implicit object GpuFormat extends Format[Gpu] {
    override def reads(json: JsValue) = {
      val description = (json \ "DESCRIPTION").as[String]
      val product = (json \ "PRODUCT").as[String]
      val vendor = (json \ "VENDOR").as[String]
      JsSuccess(Gpu(description, product, vendor))
    }

    override def writes(gpu: Gpu) = {
      val fields = Seq(
        "DESCRIPTION" -> Json.toJson(gpu.description),
        "PRODUCT" -> Json.toJson(gpu.product),
        "VENDOR" -> Json.toJson(gpu.vendor))
      JsObject(fields)
    }
  }
}

/**
 * A GPU entry of an LSHW report.
 *
 * @param description description string of the device
 * @param product product string of the device
 * @param vendor vendor string of the device
 */
case class Gpu(
    description: String,
    product: String,
    vendor: String) extends LshwAsset {
  // Brings the implicit GpuFormat into scope for Json.toJson below.
  import Gpu._

  override def toJsValue() = Json.toJson(this)
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,8 @@ import play.api.libs.json.Json | |
|
||
import collins.models.lshw.Cpu | ||
import collins.models.lshw.Cpu.CpuFormat | ||
import collins.models.lshw.Gpu | ||
import collins.models.lshw.Gpu.GpuFormat | ||
import collins.models.lshw.Disk | ||
import collins.models.lshw.Disk.DiskFormat | ||
import collins.models.lshw.Memory | ||
|
@@ -19,23 +21,26 @@ import collins.models.lshw.ServerBase.ServerbaseFormat | |
|
||
object LshwRepresentation { | ||
def empty(): LshwRepresentation = { | ||
new LshwRepresentation(Seq(), Seq(), Seq(), Seq(), new ServerBase) | ||
new LshwRepresentation(Seq(), Seq(), Seq(), Seq(), Seq(), new ServerBase) | ||
} | ||
implicit object LshwFormat extends Format[LshwRepresentation] { | ||
import Cpu._ | ||
import Gpu._ | ||
import Disk._ | ||
import Memory._ | ||
import Nic._ | ||
import ServerBase._ | ||
import Json.toJson | ||
override def reads(json: JsValue) = JsSuccess(LshwRepresentation( | ||
(json \ "CPU").as[Seq[Cpu]], | ||
(json \ "GPU").as[Seq[Gpu]], | ||
(json \ "MEMORY").as[Seq[Memory]], | ||
(json \ "NIC").as[Seq[Nic]], | ||
(json \ "DISK").as[Seq[Disk]], | ||
(json \ "BASE").as[ServerBase])) | ||
override def writes(lshw: LshwRepresentation) = JsObject(Seq( | ||
"CPU" -> Json.toJson(lshw.cpus), | ||
"GPU" -> Json.toJson(lshw.gpus), | ||
"MEMORY" -> Json.toJson(lshw.memory), | ||
"NIC" -> Json.toJson(lshw.nics), | ||
"DISK" -> Json.toJson(lshw.disks), | ||
|
@@ -45,6 +50,7 @@ object LshwRepresentation { | |
|
||
case class LshwRepresentation( | ||
cpus: Seq[Cpu], | ||
gpus: Seq[Gpu], | ||
memory: Seq[Memory], | ||
nics: Seq[Nic], | ||
disks: Seq[Disk], | ||
|
@@ -61,6 +67,8 @@ case class LshwRepresentation( | |
} | ||
def cpuSpeed: Double = cpus.sortBy(_.speedGhz).lastOption.map(_.speedGhz).getOrElse(0.0) | ||
|
||
def gpuCount: Int = gpus.size | ||
|
||
def totalMemory: ByteStorageUnit = memory.foldLeft(new ByteStorageUnit(0)) { | ||
case (total, mem) => | ||
new ByteStorageUnit(total.bytes + mem.size.bytes) | ||
|
@@ -105,6 +113,7 @@ case class LshwRepresentation( | |
(cpuCoreCount == other.cpuCoreCount) && | ||
(cpuThreadCount == other.cpuThreadCount) && | ||
(cpuSpeed == other.cpuSpeed) && | ||
(gpuCount == other.gpuCount) && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i am not sure where this method gets used, but comparing GPU counts (as opposed to deep comparison of desc/vendor/product) could produce odd behavior when you change GPUs and update LSHW, but the same number of GPUs are present. Would collins not update the meta attributes in the DB if the old reconstructed LSHW I think what you have in this diff is fine, but we should test this behavior (induct with GPU A, change to GPU B, induct, ensure LSHW is updated properly) and open an issue if this is a problem There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch, this was something I intended to remove but missed. |
||
(totalMemory.inBytes == other.totalMemory.inBytes) && | ||
(memoryBanksUsed == other.memoryBanksUsed) && | ||
(memoryBanksUnused == other.memoryBanksUnused) && | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package collins.util.config | ||
|
||
object GpuConfig extends Configurable { | ||
override val namespace = "gpu" | ||
override val referenceConfigFilename = "gpu_reference.conf" | ||
|
||
def gpuVendors= getStringSet("gpuVendors") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would like the default values to be reflected here, instead of relying on the config to include something. I.e. the default unpopulated config should include nvidia, instead of the user being forced to set this config on every installation There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added default value of "NVIDIA Corporation". |
||
|
||
/** Validates this configuration by reading the `gpuVendors` setting. */
override protected def validateConfig(): Unit = {
  // Evaluated only for its effect; the value is discarded.
  // NOTE(review): presumably getStringSet fails on a malformed value - confirm in Configurable.
  gpuVendors
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -73,6 +73,23 @@ <h4>CPU <small>Collected CPU Information</small></h4> | |
</tbody> | ||
</table> | ||
|
||
<h4>GPU <small>Collected GPU Information</small></h4> | ||
<table class="table table-bordered table-hover table-condensed"> | ||
<thead> | ||
<tr> | ||
<th>Id</th><th>Description</th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
@aa.lshw.gpus.zipWithIndex.map { case(gpu,id) => | ||
<tr> | ||
<th>@id</th> | ||
<td>@gpu.description</td> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add vendor column here, and maybe sort by id, so the GPUs show up in enumeration order? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added Description (internally called "product") and Vendor columns. |
||
</tr> | ||
} | ||
</tbody> | ||
</table> | ||
|
||
<h4>Memory <small>Collected Memory Information</small></h4> | ||
<table class="table table-bordered table-hover table-condensed"> | ||
<thead> | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -204,6 +204,14 @@ <h3>Hardware Summary <small>Summary of system components reported by LSHW</small | |
<td>@{if (aa.lshw.hasHyperthreadingEnabled) "Yes" else "No"}</td> | ||
</tr> | ||
|
||
<th colspan="3">GPU</th> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. whitespace is off here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed, good catch. |
||
</tr> | ||
<tr> | ||
<td></td> | ||
<td>Total GPUs</td> | ||
<td>@aa.lshw.gpuCount</td> | ||
</tr> | ||
|
||
<tr> | ||
<th colspan="3">Memory</th> | ||
</tr> | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# --- Add GPU information to asset_meta | ||
|
||
# --- !Ups | ||
|
||
INSERT INTO asset_meta (name, priority, label, description) VALUES ('GPU_COUNT', -1, 'GPU Count', 'Number of physical GPUs in asset'); | ||
INSERT INTO asset_meta (name, priority, label, description) VALUES ('GPU_DESCRIPTION', -1, 'GPU Description', 'GPU description, vendor labels'); | ||
|
||
# --- !Downs | ||
|
||
DELETE FROM asset_meta WHERE name ='GPU_COUNT';
DELETE FROM asset_meta WHERE name ='GPU_DESCRIPTION';
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
gpu { | ||
# A list of hardware vendors that should be parsed by lshw as "gpu vendors" | ||
gpuVendors = [] | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -282,6 +282,10 @@ lshw { | |
lshw.defaultNicCapacity=10000000000 | ||
} | ||
|
||
gpu { | ||
gpuVendors = ["NVIDIA Corporation"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a stutter. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good call, renamed |
||
} | ||
|
||
include "authentication.conf" | ||
|
||
# Set logging properties in logger.xml or dev_logger.xml | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"status":"success:ok","data":{"ASSET":{"ID":2,"TAG":"U000006","STATE":{"ID":1,"STATUS":null,"NAME":"NEW","LABEL":"New","DESCRIPTION":"A service in this state is inactive. It does minimal work and consumes minimal resources."},"STATUS":"New","TYPE":"SERVER_NODE","CREATED":"2017-04-26T16:52:36","UPDATED":"2017-04-26T16:52:38","DELETED":null},"HARDWARE":{"CPU":[{"CORES":6,"THREADS":6,"SPEED_GHZ":2.3,"DESCRIPTION":"AMD Opteron(tm) Processor 4174 HE Hynix Semiconductor (Hyundai Electronics)","PRODUCT":"","VENDOR":""},{"CORES":6,"THREADS":6,"SPEED_GHZ":2.3,"DESCRIPTION":"AMD Opteron(tm) Processor 4174 HE Hynix Semiconductor (Hyundai Electronics)","PRODUCT":"","VENDOR":""}],"GPU":[{"DESCRIPTION":"GM200GL [Quadro M6000] - NVIDIA Corporation","PRODUCT":"","VENDOR":""},{"DESCRIPTION":"GM200GL [Quadro M6000] - NVIDIA Corporation","PRODUCT":"","VENDOR":""}],"MEMORY":[{"SIZE":0,"SIZE_S":"0","SIZE_HUMAN":"0 Bytes","BANK":0,"DESCRIPTION":"Empty Memory Bank","PRODUCT":"","VENDOR":""},{"SIZE":0,"SIZE_S":"0","SIZE_HUMAN":"0 Bytes","BANK":1,"DESCRIPTION":"Empty Memory Bank","PRODUCT":"","VENDOR":""},{"SIZE":8589934592,"SIZE_S":"8589934592","SIZE_HUMAN":"8.00 GB","BANK":2,"DESCRIPTION":"DIMM DDR3 Synchronous 1333 MHz (0.8 ns) - Hyundai HMT31GR7BFR4A-H9","PRODUCT":"","VENDOR":""},{"SIZE":0,"SIZE_S":"0","SIZE_HUMAN":"0 Bytes","BANK":3,"DESCRIPTION":"Empty Memory Bank","PRODUCT":"","VENDOR":""},{"SIZE":0,"SIZE_S":"0","SIZE_HUMAN":"0 Bytes","BANK":4,"DESCRIPTION":"Empty Memory Bank","PRODUCT":"","VENDOR":""},{"SIZE":8589934592,"SIZE_S":"8589934592","SIZE_HUMAN":"8.00 GB","BANK":5,"DESCRIPTION":"DIMM DDR3 Synchronous 1333 MHz (0.8 ns) - Hyundai HMT31GR7BFR4A-H9","PRODUCT":"","VENDOR":""},{"SIZE":0,"SIZE_S":"0","SIZE_HUMAN":"0 Bytes","BANK":6,"DESCRIPTION":"Empty Memory Bank","PRODUCT":"","VENDOR":""},{"SIZE":0,"SIZE_S":"0","SIZE_HUMAN":"0 Bytes","BANK":7,"DESCRIPTION":"Empty Memory Bank","PRODUCT":"","VENDOR":""},{"SIZE":8589934592,"SIZE_S":"8589934592","SIZE_HUMAN":"8.00 
GB","BANK":8,"DESCRIPTION":"DIMM DDR3 Synchronous 1333 MHz (0.8 ns) - Hyundai HMT31GR7BFR4A-H9","PRODUCT":"","VENDOR":""},{"SIZE":0,"SIZE_S":"0","SIZE_HUMAN":"0 Bytes","BANK":9,"DESCRIPTION":"Empty Memory Bank","PRODUCT":"","VENDOR":""},{"SIZE":0,"SIZE_S":"0","SIZE_HUMAN":"0 Bytes","BANK":10,"DESCRIPTION":"Empty Memory Bank","PRODUCT":"","VENDOR":""},{"SIZE":8589934592,"SIZE_S":"8589934592","SIZE_HUMAN":"8.00 GB","BANK":11,"DESCRIPTION":"DIMM DDR3 Synchronous 1333 MHz (0.8 ns) - Hyundai HMT31GR7BFR4A-H9","PRODUCT":"","VENDOR":""}],"NIC":[],"DISK":[],"BASE":{"DESCRIPTION":"Rack Mount Chassis","PRODUCT":"PowerEdge C6105 (N/A)","VENDOR":"Winbond Electronics","SERIAL":"FZ22YQ1"}},"LLDP":{"INTERFACES":[{"NAME":"eth0","CHASSIS":{"NAME":"accessB07.corp.uhq.ua.tc","ID":{"TYPE":"mac","VALUE":"ec:3e:f7:1e:77:c0"},"DESCRIPTION":"Juniper Networks, Inc. ex4300-48p Ethernet Switch, kernel JUNOS 14.1X53-D30.3, Build date: 2015-10-02 12:40:24 UTC Copyright (c) 1996-2015 Juniper Networks, Inc."},"PORT":{"ID":{"TYPE":"local","VALUE":"529"},"DESCRIPTION":"ge-0/0/12"},"VLANS":[{"ID":40,"NAME":"vlan-40"}]}]},"IPMI":{"ASSET_ID":2,"ASSET_TAG":"U000006","IPMI_USERNAME":"root","IPMI_PASSWORD":"YCM0PCt6y37uJsHb","IPMI_GATEWAY":"172.16.32.1","IPMI_ADDRESS":"172.16.32.20","IPMI_NETMASK":"255.255.240.0","ID":2},"ADDRESSES":[],"POWER":[],"ATTRIBS":{"0":{"CHASSIS_TAG":"U000006"}}}} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this be computed on reconstruction, instead of storing this? It would make it easier to compute how many NVIDIA vs AMD GPUs there are (if in a mixed env)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good call, I removed the extraneous
GPU_COUNT