From 03ea83e9a37e41d436f8348e6eee3d8281bfff3a Mon Sep 17 00:00:00 2001 From: Tushar Behera Date: Mon, 10 Jun 2013 10:20:55 +0530 Subject: [PATCH 01/19] NVMe: Use kzalloc instead of kmalloc+memset Use kzalloc instead of kmalloc and a susbsequent memset. Signed-off-by: Tushar Behera Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 102de2f52b5c5f..4a4ff4eb8e2331 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -933,13 +933,12 @@ static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res = SNTI_TRANSLATION_SUCCESS; int xfer_len; - inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); if (inq_response == NULL) { res = -ENOMEM; goto out_mem; } - memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ inq_response[2] = 0x00; /* Page Length MSB */ inq_response[3] = 0x3C; /* Page Length LSB */ @@ -964,12 +963,11 @@ static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, int xfer_len; u8 *log_response; - log_response = kmalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); + log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); if (log_response == NULL) { res = -ENOMEM; goto out_mem; } - memset(log_response, 0, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; /* Subpage=0x00, Page Length MSB=0 */ @@ -1000,12 +998,11 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, u8 temp_c; u16 temp_k; - log_response = kmalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); + log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); if (log_response == NULL) { res = -ENOMEM; goto out_mem; } - memset(log_response, 0, LOG_INFO_EXCP_PAGE_LENGTH); mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), @@ -1069,12 +1066,11 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 temp_c_cur, temp_c_thresh; u16 temp_k; - log_response = kmalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); + log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); if (log_response == NULL) { res = -ENOMEM; goto out_mem; } - memset(log_response, 0, LOG_TEMP_PAGE_LENGTH); mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), @@ -1380,12 +1376,11 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns, blk_desc_offset = mph_size; mode_pages_offset_1 = blk_desc_offset + blk_desc_len; - response = kmalloc(resp_size, GFP_KERNEL); + response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; goto out_mem; } - memset(response, 0, resp_size); res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, llbaa, mode_data_length, blk_desc_len); @@ -2480,12 +2475,11 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, } id_ns = mem; - response = kmalloc(resp_size, GFP_KERNEL); + response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; goto out_dma; } - memset(response, 0, resp_size); nvme_trans_fill_read_cap(response, id_ns, cdb16); xfer_len = min(alloc_len, resp_size); @@ -2554,12 +2548,11 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } - response = kmalloc(resp_size, GFP_KERNEL); + response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; goto out_dma; } - memset(response, 0, resp_size); /* The first LUN ID will always be 0 per the SAM spec */ for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { @@ -2600,12 +2593,11 @@ static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : (FIXED_FMT_SENSE_DATA_SIZE)); - response = kmalloc(resp_size, GFP_KERNEL); + response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; goto out; } - memset(response, 0, resp_size); if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) { /* Descriptor Format Sense Data */ From 063a8096f3dbca7521d5918b3aea7ab46c5d2fe9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Jun 2013 10:53:48 -0400 Subject: [PATCH 02/19] NVMe: Restructure MSI / MSI-X setup The current code copies 'nr_io_queues' into 'q_count', modifies 'nr_io_queues' during MSI-X setup, then resets 'nr_io_queues' for MSI setup. Instead, copy 'nr_io_queues' into 'vecs' and modify 'vecs' during both MSI-X and MSI setup. This lets us simplify the for-loops that set up MSI-X and MSI, and opens the possibility of using more I/O queues than we have interrupt vectors, should future benchmarking prove that to be a useful feature. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 44 ++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ce79a590b45bff..de3a75978c56da 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1638,7 +1638,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int nvme_setup_io_queues(struct nvme_dev *dev) { struct pci_dev *pdev = dev->pci_dev; - int result, cpu, i, nr_io_queues, db_bar_size, q_depth, q_count; + int result, cpu, i, vecs, nr_io_queues, db_bar_size, q_depth; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1647,7 +1647,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result < nr_io_queues) nr_io_queues = result; - q_count = nr_io_queues; /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, dev->queues[0]); @@ -1659,39 +1658,42 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->queues[0]->q_db = dev->dbs; } - for (i = 0; i < nr_io_queues; i++) + vecs = nr_io_queues; + for (i = 0; i < vecs; i++) dev->entry[i].entry = i; for (;;) { - result = pci_enable_msix(pdev, dev->entry, nr_io_queues); - if (result == 0) { + result = pci_enable_msix(pdev, dev->entry, vecs); + if (result <= 0) break; - } else if (result > 0) { - nr_io_queues = result; - continue; - } else { - nr_io_queues = 0; - break; - } + vecs = result; } - if (nr_io_queues == 0) { - nr_io_queues = q_count; + if (result < 0) { + vecs = nr_io_queues; + if (vecs > 32) + vecs = 32; for (;;) { - result = pci_enable_msi_block(pdev, nr_io_queues); + result = pci_enable_msi_block(pdev, vecs); if (result == 0) { - for (i = 0; i < nr_io_queues; i++) + for (i = 0; i < vecs; i++) dev->entry[i].vector = i + pdev->irq; break; - } else if (result > 0) { - nr_io_queues = result; - continue; - } else { - nr_io_queues = 1; + } else if (result < 0) { + vecs = 1; break; } + vecs = result; } } + /* + * Should investigate if there's a performance win from allocating + * more queues than interrupt vectors; it might allow the submission + * path to scale better, even if the receive path is limited by the + * number of interrupts. + */ + nr_io_queues = vecs; + result = queue_request_irq(dev, dev->queues[0], "nvme admin"); /* XXX: handle failure here */ From 6198221fa0df0298513b35796f63f242ea97134e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 29 May 2013 15:59:39 -0600 Subject: [PATCH 03/19] NVMe: Disk IO statistics Add io stats accounting for bio requests so nvme block devices show useful disk stats. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 28 ++++++++++++++++++++++++++++ include/linux/nvme.h | 1 + 2 files changed, 29 insertions(+) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index de3a75978c56da..4e71b075d3b4fc 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -285,6 +285,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) iod->npages = -1; iod->length = nbytes; iod->nents = 0; + iod->start_time = jiffies; } return iod; @@ -308,6 +309,30 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } +static void nvme_start_io_acct(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + const int rw = bio_data_dir(bio); + int cpu = part_stat_lock(); + part_round_stats(cpu, &disk->part0); + part_stat_inc(cpu, &disk->part0, ios[rw]); + part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio)); + part_inc_in_flight(&disk->part0, rw); + part_stat_unlock(); +} + +static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + const int rw = bio_data_dir(bio); + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + part_stat_add(cpu, &disk->part0, ticks[rw], duration); + part_round_stats(cpu, &disk->part0); + part_dec_in_flight(&disk->part0, rw); + part_stat_unlock(); +} + static void bio_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { @@ -318,6 +343,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, if (iod->nents) dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + nvme_end_io_acct(bio, iod->start_time); nvme_free_iod(dev, iod); if (status) bio_endio(bio, -EIO); @@ -695,6 +722,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + nvme_start_io_acct(bio); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f451c8d6e23100..5d7c07946fbedd 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -572,6 +572,7 @@ struct nvme_iod { int offset; /* Of PRP list */ int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ + unsigned long start_time; dma_addr_t first_dma; struct scatterlist sg[0]; }; From e9539f47525ecee05c9f22c3565885f3e9492c52 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Jun 2013 11:47:34 -0400 Subject: [PATCH 04/19] NVMe: Return correct value from interrupt handler The interrupt handler currently reports whether it found any new completion queue entries. If the completion queue is primarily being processed by a method other than the interrupt handler, it may return IRQ_NONE so often that Linux thinks that the interrupt is being falsely triggered. To solve this problem, report whether any completion queue entries have been seen since the last interrupt was received for this queue. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 4e71b075d3b4fc..88a95747494c50 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -79,7 +79,8 @@ struct nvme_queue { u16 sq_head; u16 sq_tail; u16 cq_head; - u16 cq_phase; + u8 cq_phase; + u8 cqe_seen; unsigned long cmdid_data[]; }; @@ -756,7 +757,7 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio) put_nvmeq(nvmeq); } -static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) +static int nvme_process_cq(struct nvme_queue *nvmeq) { u16 head, phase; @@ -786,13 +787,14 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) * a big problem. */ if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) - return IRQ_NONE; + return 0; writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); nvmeq->cq_head = head; nvmeq->cq_phase = phase; - return IRQ_HANDLED; + nvmeq->cqe_seen = 1; + return 1; } static irqreturn_t nvme_irq(int irq, void *data) @@ -800,7 +802,9 @@ static irqreturn_t nvme_irq(int irq, void *data) irqreturn_t result; struct nvme_queue *nvmeq = data; spin_lock(&nvmeq->q_lock); - result = nvme_process_cq(nvmeq); + nvme_process_cq(nvmeq); + result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; + nvmeq->cqe_seen = 0; spin_unlock(&nvmeq->q_lock); return result; } From bc57a0f7a44cfcf3e9873f6c6b8dcecdca486b1f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Jun 2013 11:56:42 -0400 Subject: [PATCH 05/19] NVMe: Remove "process_cq did something" message I was originally intending to log the fact that the kthread had done some work since it might help us find interrupt handling problems, but that hasn't been done yet, and spamming the logs with this message is just rude. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 88a95747494c50..eb4a91f3bf4169 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1538,8 +1538,7 @@ static int nvme_kthread(void *data) if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); - if (nvme_process_cq(nvmeq)) - printk("process_cq did something\n"); + nvme_process_cq(nvmeq); nvme_cancel_ios(nvmeq, true); nvme_resubmit_bios(nvmeq); spin_unlock_irq(&nvmeq->q_lock); From 7d8224574cbd2326a6be00f319f5f7597abec3f6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 24 Jun 2013 12:03:57 -0400 Subject: [PATCH 06/19] NVMe: Call nvme_process_cq from submission path Since we have the queue locked, it makes sense to check if there are any completion queue entries on the queue before we release the lock. If there are, it may save an interrupt and reduce latency for the I/Os that happened to complete. This happens fairly often for some workloads. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index eb4a91f3bf4169..07d527c66eb45a 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -738,25 +738,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } -static void nvme_make_request(struct request_queue *q, struct bio *bio) -{ - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - spin_lock_irq(&nvmeq->q_lock); - if (bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); -} - static int nvme_process_cq(struct nvme_queue *nvmeq) { u16 head, phase; @@ -797,6 +778,26 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) return 1; } +static void nvme_make_request(struct request_queue *q, struct bio *bio) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + int result = -EBUSY; + + spin_lock_irq(&nvmeq->q_lock); + if (bio_list_empty(&nvmeq->sq_cong)) + result = nvme_submit_bio_queue(nvmeq, ns, bio); + if (unlikely(result)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + } + + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + put_nvmeq(nvmeq); +} + static irqreturn_t nvme_irq(int irq, void *data) { irqreturn_t result; From 42c7768316905dc64ad22256d6cbff273e3fbf55 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 25 Jun 2013 15:14:56 -0400 Subject: [PATCH 07/19] NVMe: Split header file into user-visible and kernel-visible pieces To build user programs that call the NVMe ioctls, we need to have a user header file. Catch up to the new way of doing that by splitting the header file into kernel and uapi portions. Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 461 +------------------------------------ include/uapi/linux/Kbuild | 1 + include/uapi/linux/nvme.h | 471 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 477 insertions(+), 456 deletions(-) create mode 100644 include/uapi/linux/nvme.h diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5d7c07946fbedd..8d0041513e1ab4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1,6 +1,6 @@ /* * Definitions for the NVM Express interface - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011-2013, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -19,7 +19,10 @@ #ifndef _LINUX_NVME_H #define _LINUX_NVME_H -#include +#include +#include +#include +#include struct nvme_bar { __u64 cap; /* Controller Capabilities */ @@ -59,460 +62,8 @@ enum { NVME_CSTS_SHST_CMPLT = 2 << 2, }; -struct nvme_id_power_state { - __le16 max_power; /* centiwatts */ - __u16 rsvd2; - __le32 entry_lat; /* microseconds */ - __le32 exit_lat; /* microseconds */ - __u8 read_tput; - __u8 read_lat; - __u8 write_tput; - __u8 write_lat; - __u8 rsvd16[16]; -}; - #define NVME_VS(major, minor) (major << 16 | minor) -struct nvme_id_ctrl { - __le16 vid; - __le16 ssvid; - char sn[20]; - char mn[40]; - char fr[8]; - __u8 rab; - __u8 ieee[3]; - __u8 mic; - __u8 mdts; - __u8 rsvd78[178]; - __le16 oacs; - __u8 acl; - __u8 aerl; - __u8 frmw; - __u8 lpa; - __u8 elpe; - __u8 npss; - __u8 rsvd264[248]; - __u8 sqes; - __u8 cqes; - __u8 rsvd514[2]; - __le32 nn; - __le16 oncs; - __le16 fuses; - __u8 fna; - __u8 vwc; - __le16 awun; - __le16 awupf; - __u8 rsvd530[1518]; - struct nvme_id_power_state psd[32]; - __u8 vs[1024]; -}; - -enum { - NVME_CTRL_ONCS_COMPARE = 1 << 0, - NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, - NVME_CTRL_ONCS_DSM = 1 << 2, -}; - -struct nvme_lbaf { - __le16 ms; - __u8 ds; - __u8 rp; -}; - -struct nvme_id_ns { - __le64 nsze; - __le64 ncap; - __le64 nuse; - __u8 nsfeat; - __u8 nlbaf; - __u8 flbas; - __u8 mc; - __u8 dpc; - __u8 dps; - __u8 rsvd30[98]; - struct nvme_lbaf lbaf[16]; - __u8 rsvd192[192]; - __u8 vs[3712]; -}; - -enum { - NVME_NS_FEAT_THIN = 1 << 0, - NVME_LBAF_RP_BEST = 0, - NVME_LBAF_RP_BETTER = 1, - NVME_LBAF_RP_GOOD = 2, - NVME_LBAF_RP_DEGRADED = 3, -}; - -struct nvme_smart_log { - __u8 critical_warning; - __u8 temperature[2]; - __u8 avail_spare; - __u8 spare_thresh; - __u8 percent_used; - __u8 rsvd6[26]; - __u8 data_units_read[16]; - __u8 data_units_written[16]; - __u8 host_reads[16]; - __u8 host_writes[16]; - __u8 ctrl_busy_time[16]; - __u8 power_cycles[16]; - __u8 power_on_hours[16]; - __u8 unsafe_shutdowns[16]; - __u8 media_errors[16]; - __u8 num_err_log_entries[16]; - __u8 rsvd192[320]; -}; - -enum { - NVME_SMART_CRIT_SPARE = 1 << 0, - NVME_SMART_CRIT_TEMPERATURE = 1 << 1, - NVME_SMART_CRIT_RELIABILITY = 1 << 2, - NVME_SMART_CRIT_MEDIA = 1 << 3, - NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, -}; - -struct nvme_lba_range_type { - __u8 type; - __u8 attributes; - __u8 rsvd2[14]; - __u64 slba; - __u64 nlb; - __u8 guid[16]; - __u8 rsvd48[16]; -}; - -enum { - NVME_LBART_TYPE_FS = 0x01, - NVME_LBART_TYPE_RAID = 0x02, - NVME_LBART_TYPE_CACHE = 0x03, - NVME_LBART_TYPE_SWAP = 0x04, - - NVME_LBART_ATTRIB_TEMP = 1 << 0, - NVME_LBART_ATTRIB_HIDE = 1 << 1, -}; - -/* I/O commands */ - -enum nvme_opcode { - nvme_cmd_flush = 0x00, - nvme_cmd_write = 0x01, - nvme_cmd_read = 0x02, - nvme_cmd_write_uncor = 0x04, - nvme_cmd_compare = 0x05, - nvme_cmd_dsm = 0x09, -}; - -struct nvme_common_command { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le32 cdw2[2]; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le32 cdw10[6]; -}; - -struct nvme_rw_command { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 slba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le32 reftag; - __le16 apptag; - __le16 appmask; -}; - -enum { - NVME_RW_LR = 1 << 15, - NVME_RW_FUA = 1 << 14, - NVME_RW_DSM_FREQ_UNSPEC = 0, - NVME_RW_DSM_FREQ_TYPICAL = 1, - NVME_RW_DSM_FREQ_RARE = 2, - NVME_RW_DSM_FREQ_READS = 3, - NVME_RW_DSM_FREQ_WRITES = 4, - NVME_RW_DSM_FREQ_RW = 5, - NVME_RW_DSM_FREQ_ONCE = 6, - NVME_RW_DSM_FREQ_PREFETCH = 7, - NVME_RW_DSM_FREQ_TEMP = 8, - NVME_RW_DSM_LATENCY_NONE = 0 << 4, - NVME_RW_DSM_LATENCY_IDLE = 1 << 4, - NVME_RW_DSM_LATENCY_NORM = 2 << 4, - NVME_RW_DSM_LATENCY_LOW = 3 << 4, - NVME_RW_DSM_SEQ_REQ = 1 << 6, - NVME_RW_DSM_COMPRESSED = 1 << 7, -}; - -struct nvme_dsm_cmd { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 nr; - __le32 attributes; - __u32 rsvd12[4]; -}; - -enum { - NVME_DSMGMT_IDR = 1 << 0, - NVME_DSMGMT_IDW = 1 << 1, - NVME_DSMGMT_AD = 1 << 2, -}; - -struct nvme_dsm_range { - __le32 cattr; - __le32 nlb; - __le64 slba; -}; - -/* Admin commands */ - -enum nvme_admin_opcode { - nvme_admin_delete_sq = 0x00, - nvme_admin_create_sq = 0x01, - nvme_admin_get_log_page = 0x02, - nvme_admin_delete_cq = 0x04, - nvme_admin_create_cq = 0x05, - nvme_admin_identify = 0x06, - nvme_admin_abort_cmd = 0x08, - nvme_admin_set_features = 0x09, - nvme_admin_get_features = 0x0a, - nvme_admin_async_event = 0x0c, - nvme_admin_activate_fw = 0x10, - nvme_admin_download_fw = 0x11, - nvme_admin_format_nvm = 0x80, - nvme_admin_security_send = 0x81, - nvme_admin_security_recv = 0x82, -}; - -enum { - NVME_QUEUE_PHYS_CONTIG = (1 << 0), - NVME_CQ_IRQ_ENABLED = (1 << 1), - NVME_SQ_PRIO_URGENT = (0 << 1), - NVME_SQ_PRIO_HIGH = (1 << 1), - NVME_SQ_PRIO_MEDIUM = (2 << 1), - NVME_SQ_PRIO_LOW = (3 << 1), - NVME_FEAT_ARBITRATION = 0x01, - NVME_FEAT_POWER_MGMT = 0x02, - NVME_FEAT_LBA_RANGE = 0x03, - NVME_FEAT_TEMP_THRESH = 0x04, - NVME_FEAT_ERR_RECOVERY = 0x05, - NVME_FEAT_VOLATILE_WC = 0x06, - NVME_FEAT_NUM_QUEUES = 0x07, - NVME_FEAT_IRQ_COALESCE = 0x08, - NVME_FEAT_IRQ_CONFIG = 0x09, - NVME_FEAT_WRITE_ATOMIC = 0x0a, - NVME_FEAT_ASYNC_EVENT = 0x0b, - NVME_FEAT_SW_PROGRESS = 0x0c, - NVME_FWACT_REPL = (0 << 3), - NVME_FWACT_REPL_ACTV = (1 << 3), - NVME_FWACT_ACTV = (2 << 3), -}; - -struct nvme_identify { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 cns; - __u32 rsvd11[5]; -}; - -struct nvme_features { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 fid; - __le32 dword11; - __u32 rsvd12[4]; -}; - -struct nvme_create_cq { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __u64 rsvd8; - __le16 cqid; - __le16 qsize; - __le16 cq_flags; - __le16 irq_vector; - __u32 rsvd12[4]; -}; - -struct nvme_create_sq { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __u64 rsvd8; - __le16 sqid; - __le16 qsize; - __le16 sq_flags; - __le16 cqid; - __u32 rsvd12[4]; -}; - -struct nvme_delete_queue { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[9]; - __le16 qid; - __u16 rsvd10; - __u32 rsvd11[5]; -}; - -struct nvme_download_firmware { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __le64 prp2; - __le32 numd; - __le32 offset; - __u32 rsvd12[4]; -}; - -struct nvme_format_cmd { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[4]; - __le32 cdw10; - __u32 rsvd11[5]; -}; - -struct nvme_command { - union { - struct nvme_common_command common; - struct nvme_rw_command rw; - struct nvme_identify identify; - struct nvme_features features; - struct nvme_create_cq create_cq; - struct nvme_create_sq create_sq; - struct nvme_delete_queue delete_queue; - struct nvme_download_firmware dlfw; - struct nvme_format_cmd format; - struct nvme_dsm_cmd dsm; - }; -}; - -enum { - NVME_SC_SUCCESS = 0x0, - NVME_SC_INVALID_OPCODE = 0x1, - NVME_SC_INVALID_FIELD = 0x2, - NVME_SC_CMDID_CONFLICT = 0x3, - NVME_SC_DATA_XFER_ERROR = 0x4, - NVME_SC_POWER_LOSS = 0x5, - NVME_SC_INTERNAL = 0x6, - NVME_SC_ABORT_REQ = 0x7, - NVME_SC_ABORT_QUEUE = 0x8, - NVME_SC_FUSED_FAIL = 0x9, - NVME_SC_FUSED_MISSING = 0xa, - NVME_SC_INVALID_NS = 0xb, - NVME_SC_CMD_SEQ_ERROR = 0xc, - NVME_SC_LBA_RANGE = 0x80, - NVME_SC_CAP_EXCEEDED = 0x81, - NVME_SC_NS_NOT_READY = 0x82, - NVME_SC_CQ_INVALID = 0x100, - NVME_SC_QID_INVALID = 0x101, - NVME_SC_QUEUE_SIZE = 0x102, - NVME_SC_ABORT_LIMIT = 0x103, - NVME_SC_ABORT_MISSING = 0x104, - NVME_SC_ASYNC_LIMIT = 0x105, - NVME_SC_FIRMWARE_SLOT = 0x106, - NVME_SC_FIRMWARE_IMAGE = 0x107, - NVME_SC_INVALID_VECTOR = 0x108, - NVME_SC_INVALID_LOG_PAGE = 0x109, - NVME_SC_INVALID_FORMAT = 0x10a, - NVME_SC_BAD_ATTRIBUTES = 0x180, - NVME_SC_WRITE_FAULT = 0x280, - NVME_SC_READ_ERROR = 0x281, - NVME_SC_GUARD_CHECK = 0x282, - NVME_SC_APPTAG_CHECK = 0x283, - NVME_SC_REFTAG_CHECK = 0x284, - NVME_SC_COMPARE_FAILED = 0x285, - NVME_SC_ACCESS_DENIED = 0x286, -}; - -struct nvme_completion { - __le32 result; /* Used by admin commands to return data */ - __u32 rsvd; - __le16 sq_head; /* how much of this queue may be reclaimed */ - __le16 sq_id; /* submission queue that generated this entry */ - __u16 command_id; /* of the command which completed */ - __le16 status; /* did the command fail, and if so, why? */ -}; - -struct nvme_user_io { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nblocks; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 slba; - __u32 dsmgmt; - __u32 reftag; - __u16 apptag; - __u16 appmask; -}; - -struct nvme_admin_cmd { - __u8 opcode; - __u8 flags; - __u16 rsvd1; - __u32 nsid; - __u32 cdw2; - __u32 cdw3; - __u64 metadata; - __u64 addr; - __u32 metadata_len; - __u32 data_len; - __u32 cdw10; - __u32 cdw11; - __u32 cdw12; - __u32 cdw13; - __u32 cdw14; - __u32 cdw15; - __u32 timeout_ms; - __u32 result; -}; - -#define NVME_IOCTL_ID _IO('N', 0x40) -#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) -#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) - -#ifdef __KERNEL__ -#include -#include -#include - #define NVME_IO_TIMEOUT (5 * HZ) /* @@ -614,6 +165,4 @@ struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); int nvme_sg_get_version_num(int __user *ip); -#endif - #endif /* _LINUX_NVME_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ab5d4992e568f3..a5e677a7070fee 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -281,6 +281,7 @@ header-y += nfs_mount.h header-y += nfsacl.h header-y += nl80211.h header-y += nubus.h +header-y += nvme.h header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h new file mode 100644 index 00000000000000..31ed566f81f6ed --- /dev/null +++ b/include/uapi/linux/nvme.h @@ -0,0 +1,471 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _UAPI_LINUX_NVME_H +#define _UAPI_LINUX_NVME_H + +#include + +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u16 rsvd2; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ + __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __u8 rsvd16[16]; +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 mic; + __u8 mdts; + __u8 rsvd78[178]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 rsvd264[248]; + __u8 sqes; + __u8 cqes; + __u8 rsvd514[2]; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 rsvd530[1518]; + struct nvme_id_power_state psd[32]; + __u8 vs[1024]; +}; + +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 rsvd30[98]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, +}; + +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __u8 rsvd192[320]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_dsm = 0x09, +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le32 cdw10[6]; +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, +}; + +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_log_page = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_features = 0x0a, + nvme_admin_async_event = 0x0c, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), + NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_SW_PROGRESS = 0x0c, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), +}; + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 cns; + __u32 rsvd11[5]; +}; + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 fid; + __le32 dword11; + __u32 rsvd12[4]; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __u32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __le32 numd; + __le32 offset; + __u32 rsvd12[4]; +}; + +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; + struct nvme_dsm_cmd dsm; + }; +}; + +enum { + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, +}; + +struct nvme_completion { + __le32 result; /* Used by admin commands to return data */ + __u32 rsvd; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +struct nvme_user_io { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +}; + +struct nvme_admin_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 result; +}; + +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) + +#endif /* _UAPI_LINUX_NVME_H */ From 685585c25e8269cc355b59d1cd6fc65b8c5c4878 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 25 Jun 2013 15:15:23 -0600 Subject: [PATCH 08/19] NVMe: Update nvme_id_power_state with latest spec Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- include/uapi/linux/nvme.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 31ed566f81f6ed..989c04e0c56311 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -23,7 +23,8 @@ struct nvme_id_power_state { __le16 max_power; /* centiwatts */ - __u16 rsvd2; + __u8 rsvd2; + __u8 flags; __le32 entry_lat; /* microseconds */ __le32 exit_lat; /* microseconds */ __u8 read_tput; @@ -33,6 +34,11 @@ struct nvme_id_power_state { __u8 rsvd16[16]; }; +enum { + NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0, + NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, +}; + struct nvme_id_ctrl { __le16 vid; __le16 ssvid; From c3bfe7176c035a0a2c70bc79180fb13a6c57142a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 8 Jul 2013 17:26:25 -0400 Subject: [PATCH 09/19] NVMe: Namespace IDs are unsigned The 'Number of Namespaces' read from the device was being treated as signed, which would cause us to not scan any namespaces for a device with more than 2 billion namespaces. That led to noticing that the namespace ID was also being treated as signed, which could lead to the result from NVME_IOCTL_ID being treated as an error code. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 7 +++++-- include/linux/nvme.h | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 07d527c66eb45a..56d1fa472d06c7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -1486,6 +1487,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ID: + force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: return nvme_user_admin_cmd(ns->dev, (void __user *)arg); @@ -1588,7 +1590,7 @@ static void nvme_config_discard(struct nvme_ns *ns) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); } -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { struct nvme_ns *ns; @@ -1768,7 +1770,8 @@ static void nvme_free_queues(struct nvme_dev *dev) */ static int nvme_dev_add(struct nvme_dev *dev) { - int res, nn, i; + int res; + unsigned nn, i; struct nvme_ns *ns; struct nvme_id_ctrl *ctrl; struct nvme_id_ns *id_ns; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8d0041513e1ab4..3403c8f06fa0d7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -104,7 +104,7 @@ struct nvme_ns { struct request_queue *queue; struct gendisk *disk; - int ns_id; + unsigned ns_id; int lba_shift; int ms; u64 mode_select_num_blocks; From 1b56749e541ad59068582f2a28297843e243b856 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 18 Jul 2013 12:13:51 -0600 Subject: [PATCH 10/19] NVMe: Fix checkpatch issues Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 56d1fa472d06c7..f9244131b88a5e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -451,10 +451,8 @@ static void nvme_bio_pair_endio(struct bio *bio, int err) if (atomic_dec_and_test(&bp->cnt)) { bio_endio(bp->parent, bp->err); - if (bp->bv1) - kfree(bp->bv1); - if (bp->bv2) - kfree(bp->bv2); + kfree(bp->bv1); + kfree(bp->bv2); kfree(bp); } } @@ -1348,7 +1346,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.appmask = cpu_to_le16(io.appmask); if (meta_len) { - meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, meta_len); + meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, + meta_len); if (IS_ERR(meta_iod)) { status = PTR_ERR(meta_iod); meta_iod = NULL; From 7e03b124065507e72008ef294c30001eca74a031 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jul 2013 16:20:56 -0600 Subject: [PATCH 11/19] NVMe: Bring up cdev on set feature failure This patch creates the character device as long as a device's admin queues are usable so a user has an opprotunity to perform administration tasks. A device may be in a state that does not allow IO and setting the queue count feature in such a state returns an error. Previously the driver would bail and the controller would be unusable. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f9244131b88a5e..8cfa4576d4242e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1664,7 +1664,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, &result); if (status) - return -EIO; + return status < 0 ? -EIO : -EBUSY; return min(result & 0xffff, result >> 16) + 1; } @@ -2018,7 +2018,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_unlock(&dev_list_lock); result = nvme_dev_add(dev); - if (result) + if (result && result != -EBUSY) goto delete; scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); From 9e59d091b0eb04f223ed037348e3d9e36f30e72b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 8 Aug 2013 10:25:38 -0600 Subject: [PATCH 12/19] NVMe: Disk stats for read/write commands only Flush and discard requests would previously mess up the accounting. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8cfa4576d4242e..360ac5d32d2676 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -342,11 +342,11 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; - if (iod->nents) + if (iod->nents) { dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - - nvme_end_io_acct(bio, iod->start_time); + nvme_end_io_acct(bio, iod->start_time); + } nvme_free_iod(dev, iod); if (status) bio_endio(bio, -EIO); From 0877cb0d285c7f1d53d0b84b360bdea4be4f3f59 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Jul 2013 15:02:19 -0600 Subject: [PATCH 13/19] NVMe: Group pci related actions in functions This will make it easier to reuse these outside probe/remove. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 112 ++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 46 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 360ac5d32d2676..a93f52c48036f3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1191,9 +1191,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; - dev->dbs = ((void __iomem *)dev->bar) + 4096; - dev->db_stride = NVME_CAP_STRIDE(cap); - result = nvme_disable_ctrl(dev, cap); if (result < 0) return result; @@ -1832,6 +1829,61 @@ static int nvme_dev_add(struct nvme_dev *dev) return res; } +static int nvme_dev_map(struct nvme_dev *dev) +{ + int bars, result = -ENOMEM; + struct pci_dev *pdev = dev->pci_dev; + + if (pci_enable_device_mem(pdev)) + return result; + + dev->entry[0].vector = pdev->irq; + pci_set_master(pdev); + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (pci_request_selected_regions(pdev, bars, "nvme")) + goto disable_pci; + + if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); + else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); + else + goto disable_pci; + + pci_set_drvdata(pdev, dev); + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto disable; + + dev->db_stride = NVME_CAP_STRIDE(readq(&dev->bar->cap)); + dev->dbs = ((void __iomem *)dev->bar) + 4096; + + return 0; + + disable: + pci_release_regions(pdev); + disable_pci: + pci_disable_device(pdev); + return result; +} + +static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->pci_dev->msi_enabled) + pci_disable_msi(dev->pci_dev); + else if (dev->pci_dev->msix_enabled) + pci_disable_msix(dev->pci_dev); + + if (dev->bar) { + iounmap(dev->bar); + dev->bar = NULL; + } + + pci_release_regions(dev->pci_dev); + if (pci_is_enabled(dev->pci_dev)) + pci_disable_device(dev->pci_dev); +} + static int nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns, *next; @@ -1908,15 +1960,9 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); nvme_dev_remove(dev); - if (dev->pci_dev->msi_enabled) - pci_disable_msi(dev->pci_dev); - else if (dev->pci_dev->msix_enabled) - pci_disable_msix(dev->pci_dev); - iounmap(dev->bar); + nvme_dev_unmap(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); - pci_disable_device(dev->pci_dev); - pci_release_regions(dev->pci_dev); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -1959,7 +2005,7 @@ static const struct file_operations nvme_dev_fops = { static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int bars, result = -ENOMEM; + int result = -ENOMEM; struct nvme_dev *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); @@ -1974,39 +2020,19 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev->queues) goto free; - if (pci_enable_device_mem(pdev)) - goto free; - pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable; - INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; - pci_set_drvdata(pdev, dev); - - if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); - else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); - else - goto disable; - result = nvme_set_instance(dev); if (result) - goto disable; - - dev->entry[0].vector = pdev->irq; + goto free; result = nvme_setup_prp_pools(dev); if (result) - goto disable_msix; + goto release; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) { - result = -ENOMEM; - goto disable_msix; - } + result = nvme_dev_map(dev); + if (result) + goto release_pools; result = nvme_configure_admin_queue(dev); if (result) @@ -2042,17 +2068,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_free_queues(dev); unmap: - iounmap(dev->bar); - disable_msix: - if (dev->pci_dev->msi_enabled) - pci_disable_msi(dev->pci_dev); - else if (dev->pci_dev->msix_enabled) - pci_disable_msix(dev->pci_dev); - nvme_release_instance(dev); + nvme_dev_unmap(dev); + release_pools: nvme_release_prp_pools(dev); - disable: - pci_disable_device(pdev); - pci_release_regions(pdev); + release: + nvme_release_instance(dev); free: kfree(dev->queues); kfree(dev->entry); From 224042742582c9938788b81165180c876e997a07 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Jul 2013 15:02:20 -0600 Subject: [PATCH 14/19] NVMe: Separate queue alloc/free from create/delete This separates nvme queue allocation from creation, and queue deletion from freeing. This is so that we may in the future temporarily disable queues and reuse the same memory when bringing them back online, like coming back from suspend state. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 133 +++++++++++++++++++++++++++----------- 1 file changed, 94 insertions(+), 39 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a93f52c48036f3..95e28b6bcd092c 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -82,6 +82,7 @@ struct nvme_queue { u16 cq_head; u8 cq_phase; u8 cqe_seen; + u8 q_suspended; unsigned long cmdid_data[]; }; @@ -117,6 +118,11 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; } +static unsigned nvme_queue_extra(int depth) +{ + return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); +} + /** * alloc_cmdid() - Allocate a Command ID * @nvmeq: The queue that will be used for this command @@ -784,7 +790,7 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio) int result = -EBUSY; spin_lock_irq(&nvmeq->q_lock); - if (bio_list_empty(&nvmeq->sq_cong)) + if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) result = nvme_submit_bio_queue(nvmeq, ns, bio); if (unlikely(result)) { if (bio_list_empty(&nvmeq->sq_cong)) @@ -1018,8 +1024,15 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) } } -static void nvme_free_queue_mem(struct nvme_queue *nvmeq) +static void nvme_free_queue(struct nvme_queue *nvmeq) { + spin_lock_irq(&nvmeq->q_lock); + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + bio_endio(bio, -EIO); + } + spin_unlock_irq(&nvmeq->q_lock); + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), @@ -1027,17 +1040,28 @@ static void nvme_free_queue_mem(struct nvme_queue *nvmeq) kfree(nvmeq); } -static void nvme_free_queue(struct nvme_dev *dev, int qid) +static void nvme_free_queues(struct nvme_dev *dev) +{ + int i; + + for (i = dev->queue_count - 1; i >= 0; i--) { + nvme_free_queue(dev->queues[i]); + dev->queue_count--; + dev->queues[i] = NULL; + } +} + +static void nvme_disable_queue(struct nvme_dev *dev, int qid) { struct nvme_queue *nvmeq = dev->queues[qid]; int vector = dev->entry[nvmeq->cq_vector].vector; spin_lock_irq(&nvmeq->q_lock); - nvme_cancel_ios(nvmeq, false); - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - bio_endio(bio, -EIO); + if (nvmeq->q_suspended) { + spin_unlock_irq(&nvmeq->q_lock); + return; } + nvmeq->q_suspended = 1; spin_unlock_irq(&nvmeq->q_lock); irq_set_affinity_hint(vector, NULL); @@ -1049,15 +1073,17 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid) adapter_delete_cq(dev, qid); } - nvme_free_queue_mem(nvmeq); + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + nvme_cancel_ios(nvmeq, false); + spin_unlock_irq(&nvmeq->q_lock); } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * - sizeof(struct nvme_cmd_info)); + unsigned extra = nvme_queue_extra(depth); struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); if (!nvmeq) return NULL; @@ -1084,6 +1110,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; + nvmeq->q_suspended = 1; + dev->queue_count++; return nvmeq; @@ -1107,18 +1135,29 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, IRQF_DISABLED | IRQF_SHARED, name, nvmeq); } -static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, - int cq_size, int vector) +static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { - int result; - struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + struct nvme_dev *dev = nvmeq->dev; + unsigned extra = nvme_queue_extra(nvmeq->q_depth); - if (!nvmeq) - return ERR_PTR(-ENOMEM); + nvmeq->sq_tail = 0; + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; + memset(nvmeq->cmdid_data, 0, extra); + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + nvme_cancel_ios(nvmeq, false); + nvmeq->q_suspended = 0; +} + +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +{ + struct nvme_dev *dev = nvmeq->dev; + int result; result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) - goto free_nvmeq; + return result; result = adapter_alloc_sq(dev, qid, nvmeq); if (result < 0) @@ -1128,19 +1167,17 @@ static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, if (result < 0) goto release_sq; - return nvmeq; + spin_lock(&nvmeq->q_lock); + nvme_init_queue(nvmeq, qid); + spin_unlock(&nvmeq->q_lock); + + return result; release_sq: adapter_delete_sq(dev, qid); release_cq: adapter_delete_cq(dev, qid); - free_nvmeq: - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); - return ERR_PTR(result); + return result; } static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) @@ -1221,10 +1258,13 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) goto free_q; dev->queues[0] = nvmeq; + spin_lock(&nvmeq->q_lock); + nvme_init_queue(nvmeq, 0); + spin_unlock(&nvmeq->q_lock); return result; free_q: - nvme_free_queue_mem(nvmeq); + nvme_free_queue(nvmeq); return result; } @@ -1386,6 +1426,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) put_nvmeq(nvmeq); if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; + else if (!nvmeq || nvmeq->q_suspended) + status = -EBUSY; else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); @@ -1537,9 +1579,12 @@ static int nvme_kthread(void *data) if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); + if (nvmeq->q_suspended) + goto unlock; nvme_process_cq(nvmeq); nvme_cancel_ios(nvmeq, true); nvme_resubmit_bios(nvmeq); + unlock: spin_unlock_irq(&nvmeq->q_lock); } } @@ -1725,7 +1770,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nr_io_queues = vecs; result = queue_request_irq(dev, dev->queues[0], "nvme admin"); - /* XXX: handle failure here */ + if (result) + goto free_queues; cpu = cpumask_first(cpu_online_mask); for (i = 0; i < nr_io_queues; i++) { @@ -1736,10 +1782,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, NVME_Q_DEPTH); for (i = 0; i < nr_io_queues; i++) { - dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); - if (IS_ERR(dev->queues[i + 1])) - return PTR_ERR(dev->queues[i + 1]); - dev->queue_count++; + dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i); + if (!dev->queues[i + 1]) { + result = -ENOMEM; + goto free_queues; + } } for (; i < num_possible_cpus(); i++) { @@ -1747,15 +1794,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->queues[i + 1] = dev->queues[target + 1]; } - return 0; -} + for (i = 1; i < dev->queue_count; i++) { + result = nvme_create_queue(dev->queues[i], i); + if (result) { + for (--i; i > 0; i--) + nvme_disable_queue(dev, i); + goto free_queues; + } + } -static void nvme_free_queues(struct nvme_dev *dev) -{ - int i; + return 0; - for (i = dev->queue_count - 1; i >= 0; i--) - nvme_free_queue(dev, i); + free_queues: + nvme_free_queues(dev); + return result; } /* @@ -1887,6 +1939,10 @@ static void nvme_dev_unmap(struct nvme_dev *dev) static int nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns, *next; + int i; + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_disable_queue(dev, i); spin_lock(&dev_list_lock); list_del(&dev->node); @@ -2037,7 +2093,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) result = nvme_configure_admin_queue(dev); if (result) goto unmap; - dev->queue_count++; spin_lock(&dev_list_lock); list_add(&dev->node, &dev_list); From f0b50732a979c55c2d15fe8ec4503fa5b3260c53 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Jul 2013 15:02:21 -0600 Subject: [PATCH 15/19] NVMe: Separate controller init from disk discovery This combines the controller initialization into one function, removing IO queue setup from namespace discovery, and creates symetric functions for device removal. The controller start and shutdown functions can now be called from resume/suspend context as well as probe/remove. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 77 ++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 95e28b6bcd092c..1595ffba8a660b 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1827,10 +1827,6 @@ static int nvme_dev_add(struct nvme_dev *dev) dma_addr_t dma_addr; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - res = nvme_setup_io_queues(dev); - if (res) - return res; - mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, GFP_KERNEL); if (!mem) @@ -1936,27 +1932,29 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_disable_device(dev->pci_dev); } -static int nvme_dev_remove(struct nvme_dev *dev) +static void nvme_dev_shutdown(struct nvme_dev *dev) { - struct nvme_ns *ns, *next; int i; for (i = dev->queue_count - 1; i >= 0; i--) nvme_disable_queue(dev, i); spin_lock(&dev_list_lock); - list_del(&dev->node); + list_del_init(&dev->node); spin_unlock(&dev_list_lock); + nvme_dev_unmap(dev); +} + +static void nvme_dev_remove(struct nvme_dev *dev) +{ + struct nvme_ns *ns, *next; + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { list_del(&ns->list); del_gendisk(ns->disk); nvme_ns_free(ns); } - - nvme_free_queues(dev); - - return 0; } static int nvme_setup_prp_pools(struct nvme_dev *dev) @@ -2016,7 +2014,8 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); nvme_dev_remove(dev); - nvme_dev_unmap(dev); + nvme_dev_shutdown(dev); + nvme_free_queues(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); kfree(dev->queues); @@ -2059,6 +2058,37 @@ static const struct file_operations nvme_dev_fops = { .compat_ioctl = nvme_dev_ioctl, }; +static int nvme_dev_start(struct nvme_dev *dev) +{ + int result; + + result = nvme_dev_map(dev); + if (result) + return result; + + result = nvme_configure_admin_queue(dev); + if (result) + goto unmap; + + spin_lock(&dev_list_lock); + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); + + result = nvme_setup_io_queues(dev); + if (result) + goto disable; + + return 0; + + disable: + spin_lock(&dev_list_lock); + list_del_init(&dev->node); + spin_unlock(&dev_list_lock); + unmap: + nvme_dev_unmap(dev); + return result; +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int result = -ENOMEM; @@ -2086,21 +2116,13 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release; - result = nvme_dev_map(dev); + result = nvme_dev_start(dev); if (result) goto release_pools; - result = nvme_configure_admin_queue(dev); - if (result) - goto unmap; - - spin_lock(&dev_list_lock); - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - result = nvme_dev_add(dev); if (result && result != -EBUSY) - goto delete; + goto shutdown; scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); dev->miscdev.minor = MISC_DYNAMIC_MINOR; @@ -2116,15 +2138,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) remove: nvme_dev_remove(dev); - delete: - spin_lock(&dev_list_lock); - list_del(&dev->node); - spin_unlock(&dev_list_lock); - - nvme_free_queues(dev); - unmap: - nvme_dev_unmap(dev); + shutdown: + nvme_dev_shutdown(dev); release_pools: + nvme_free_queues(dev); nvme_release_prp_pools(dev); release: nvme_release_instance(dev); From 1894d8f16afe5ad54b732f0fa6c4e80bd4d40b91 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Jul 2013 15:02:22 -0600 Subject: [PATCH 16/19] NVMe: Use normal shutdown The NVMe spec recommends using the shutdown normal sequence when safely taking the controller offline instead of hitting CC.EN on the next start-up to reset the controller. The spec recommends a minimum of 1 second for the shutdown complete. This patch waits 2 seconds to be on the safe side. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 26 ++++++++++++++++++++++++++ include/linux/nvme.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 1595ffba8a660b..23bb5a70d81001 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1221,6 +1221,30 @@ static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) return nvme_wait_ready(dev, cap, true); } +static int nvme_shutdown_ctrl(struct nvme_dev *dev) +{ + unsigned long timeout; + u32 cc; + + cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; + writel(cc, &dev->bar->cc); + + timeout = 2 * HZ + jiffies; + while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != + NVME_CSTS_SHST_CMPLT) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return 0; +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; @@ -1943,6 +1967,8 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) list_del_init(&dev->node); spin_unlock(&dev_list_lock); + if (dev->bar) + nvme_shutdown_ctrl(dev); nvme_dev_unmap(dev); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3403c8f06fa0d7..26ebcf41c2131d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -53,6 +53,7 @@ enum { NVME_CC_SHN_NONE = 0 << 14, NVME_CC_SHN_NORMAL = 1 << 14, NVME_CC_SHN_ABRUPT = 2 << 14, + NVME_CC_SHN_MASK = 3 << 14, NVME_CC_IOSQES = 6 << 16, NVME_CC_IOCQES = 4 << 20, NVME_CSTS_RDY = 1 << 0, @@ -60,6 +61,7 @@ enum { NVME_CSTS_SHST_NORMAL = 0 << 2, NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, + NVME_CSTS_SHST_MASK = 3 << 2, }; #define NVME_VS(major, minor) (major << 16 | minor) From cd63894630ab17a192bf97427d16dbec10710a6a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Jul 2013 15:02:23 -0600 Subject: [PATCH 17/19] NVMe: Add pci suspend/resume driver callbacks Used for going in and out of low power states. Resuming reuses the IO queues from the previous initialization, freeing any allocated queues that are no longer usable. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 73 +++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 15 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 23bb5a70d81001..8efa728f1eac53 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -789,6 +789,12 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio) struct nvme_queue *nvmeq = get_nvmeq(ns->dev); int result = -EBUSY; + if (!nvmeq) { + put_nvmeq(NULL); + bio_endio(bio, -EIO); + return; + } + spin_lock_irq(&nvmeq->q_lock); if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) result = nvme_submit_bio_queue(nvmeq, ns, bio); @@ -1256,9 +1262,13 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); - if (!nvmeq) - return -ENOMEM; + nvmeq = dev->queues[0]; + if (!nvmeq) { + nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + if (!nvmeq) + return -ENOMEM; + dev->queues[0] = nvmeq; + } aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; @@ -1275,21 +1285,16 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) result = nvme_enable_ctrl(dev, cap); if (result) - goto free_q; + return result; result = queue_request_irq(dev, nvmeq, "nvme admin"); if (result) - goto free_q; + return result; - dev->queues[0] = nvmeq; spin_lock(&nvmeq->q_lock); nvme_init_queue(nvmeq, 0); spin_unlock(&nvmeq->q_lock); return result; - - free_q: - nvme_free_queue(nvmeq); - return result; } struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, @@ -1797,6 +1802,21 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result) goto free_queues; + /* Free previously allocated queues that are no longer usable */ + spin_lock(&dev_list_lock); + for (i = dev->queue_count - 1; i > nr_io_queues; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + + spin_lock(&nvmeq->q_lock); + nvme_cancel_ios(nvmeq, false); + spin_unlock(&nvmeq->q_lock); + + nvme_free_queue(nvmeq); + dev->queue_count--; + dev->queues[i] = NULL; + } + spin_unlock(&dev_list_lock); + cpu = cpumask_first(cpu_online_mask); for (i = 0; i < nr_io_queues; i++) { irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); @@ -1805,7 +1825,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, NVME_Q_DEPTH); - for (i = 0; i < nr_io_queues; i++) { + for (i = dev->queue_count - 1; i < nr_io_queues; i++) { dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i); if (!dev->queues[i + 1]) { result = -ENOMEM; @@ -2191,8 +2211,30 @@ static void nvme_remove(struct pci_dev *pdev) #define nvme_link_reset NULL #define nvme_slot_reset NULL #define nvme_error_resume NULL -#define nvme_suspend NULL -#define nvme_resume NULL + +static int nvme_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + nvme_dev_shutdown(ndev); + return 0; +} + +static int nvme_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + int ret; + + ret = nvme_dev_start(ndev); + /* XXX: should remove gendisks if resume fails */ + if (ret) + nvme_free_queues(ndev); + return ret; +} + +static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); static const struct pci_error_handlers nvme_err_handler = { .error_detected = nvme_error_detected, @@ -2216,8 +2258,9 @@ static struct pci_driver nvme_driver = { .id_table = nvme_id_table, .probe = nvme_probe, .remove = nvme_remove, - .suspend = nvme_suspend, - .resume = nvme_resume, + .driver = { + .pm = &nvme_dev_pm_ops, + }, .err_handler = &nvme_err_handler, }; From 9d713c2bfb5e1d6abb18a8b12293631f9fcdc708 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Jul 2013 15:02:24 -0600 Subject: [PATCH 18/19] NVMe: Handle ioremap failure Decrement the number of queues required for doorbell remapping until the memory is successfully mapped for that size. Additional checks are done so that we don't call free_irq if it has already been freed. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8efa728f1eac53..9f2b424c445e19 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1739,10 +1739,15 @@ static int set_queue_count(struct nvme_dev *dev, int count) return min(result & 0xffff, result >> 16) + 1; } +static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); +} + static int nvme_setup_io_queues(struct nvme_dev *dev) { struct pci_dev *pdev = dev->pci_dev; - int result, cpu, i, vecs, nr_io_queues, db_bar_size, q_depth; + int result, cpu, i, vecs, nr_io_queues, size, q_depth; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1751,17 +1756,24 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result < nr_io_queues) nr_io_queues = result; - /* Deregister the admin queue's interrupt */ - free_irq(dev->entry[0].vector, dev->queues[0]); - - db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); - if (db_bar_size > 8192) { + size = db_bar_size(dev, nr_io_queues); + if (size > 8192) { iounmap(dev->bar); - dev->bar = ioremap(pci_resource_start(pdev, 0), db_bar_size); + do { + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (dev->bar) + break; + if (!--nr_io_queues) + return -ENOMEM; + size = db_bar_size(dev, nr_io_queues); + } while (1); dev->dbs = ((void __iomem *)dev->bar) + 4096; dev->queues[0]->q_db = dev->dbs; } + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, dev->queues[0]); + vecs = nr_io_queues; for (i = 0; i < vecs; i++) dev->entry[i].entry = i; @@ -1799,8 +1811,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nr_io_queues = vecs; result = queue_request_irq(dev, dev->queues[0], "nvme admin"); - if (result) + if (result) { + dev->queues[0]->q_suspended = 1; goto free_queues; + } /* Free previously allocated queues that are no longer usable */ spin_lock(&dev_list_lock); From d82e8bfdef9afae83b894be49af4644d9ac3c359 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 5 Sep 2013 14:45:07 -0600 Subject: [PATCH 19/19] NVMe: Merge issue on character device bring-up A recent patch made it possible to bring up the character handle when the device is responsive but not accepting a set-features command. Another recent patch moved the initialization that requires we move where the checks for this condition occur. This patch merges these two ideas so it works much as before. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 9f2b424c445e19..da52092980e231 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2135,10 +2135,10 @@ static int nvme_dev_start(struct nvme_dev *dev) spin_unlock(&dev_list_lock); result = nvme_setup_io_queues(dev); - if (result) + if (result && result != -EBUSY) goto disable; - return 0; + return result; disable: spin_lock(&dev_list_lock); @@ -2177,13 +2177,17 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release; result = nvme_dev_start(dev); - if (result) + if (result) { + if (result == -EBUSY) + goto create_cdev; goto release_pools; + } result = nvme_dev_add(dev); - if (result && result != -EBUSY) + if (result) goto shutdown; + create_cdev: scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); dev->miscdev.minor = MISC_DYNAMIC_MINOR; dev->miscdev.parent = &pdev->dev;