]> git.ipfire.org Git - thirdparty/qemu.git/blame - hw/block/nvme.c
qom: Change object_get_canonical_path_component() not to malloc
[thirdparty/qemu.git] / hw / block / nvme.c
CommitLineData
f3c507ad
KB
1/*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11/**
a896f7f2 12 * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
f3c507ad
KB
13 *
14 * http://www.nvmexpress.org/resources/
15 */
16
17/**
18 * Usage: add options:
19 * -drive file=<file>,if=none,id=<drive_id>
a896f7f2 20 * -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
7c895269 21 * cmb_size_mb=<cmb_size_mb[optional]>, \
6cf94132 22 * [pmrdev=<mem_backend_file_id>,] \
dce22c86 23 * max_ioqpairs=<N[optional]>
a896f7f2
SB
24 *
25 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
b2b2b67a 26 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
6cf94132
AJ
27 *
28 * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
29 * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
30 * both provided.
31 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
32 * For example:
33 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
34 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
f3c507ad
KB
35 */
36
80c71a24 37#include "qemu/osdep.h"
e8400cf3 38#include "qemu/units.h"
dce22c86 39#include "qemu/error-report.h"
a9c94277 40#include "hw/block/block.h"
a9c94277
MA
41#include "hw/pci/msix.h"
42#include "hw/pci/pci.h"
a27bd6c7 43#include "hw/qdev-properties.h"
d6454270 44#include "migration/vmstate.h"
33739c71 45#include "sysemu/sysemu.h"
da34e65c 46#include "qapi/error.h"
33739c71 47#include "qapi/visitor.h"
6cf94132 48#include "sysemu/hostmem.h"
4be74634 49#include "sysemu/block-backend.h"
bc2a2364 50#include "exec/memory.h"
1ee24514 51#include "qemu/log.h"
0b8fa32f 52#include "qemu/module.h"
6b39bad0 53#include "qemu/cutils.h"
1ee24514 54#include "trace.h"
f3c507ad
KB
55#include "nvme.h"
56
/*
 * Controller-wide constants.
 *
 * NVME_CMB_BIR and NVME_PMR_BIR are both 2: the usage notes at the top of
 * this file state that the CMB is placed in BAR2 and that cmb_size_mb= and
 * pmrdev= are mutually exclusive because of the limited number of BARs.
 */
#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_REG_SIZE 0x1000
#define NVME_DB_SIZE  4
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 2
f7e8c23f 62
1ee24514
DG
/*
 * Report a guest-caused error twice: once through the trace event
 * trace_<trace>() and once through qemu_log_mask(LOG_GUEST_ERROR, ...).
 * The log line is "<trace> in <function>: <fmt>\n"; the variadic arguments
 * are passed to both the trace call and the format string.
 */
#define NVME_GUEST_ERR(trace, fmt, ...)                                 \
    do {                                                                \
        (trace_##trace)(__VA_ARGS__);                                   \
        qemu_log_mask(LOG_GUEST_ERROR,                                  \
                      #trace " in %s: " fmt "\n",                       \
                      __func__, ## __VA_ARGS__);                        \
    } while (0)
69
f3c507ad
KB
/* Forward declaration: nvme_init_sq() installs this as a timer callback. */
static void nvme_process_sq(void *opaque);
71
b4529c5c
KJ
72static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
73{
74 hwaddr low = n->ctrl_mem.addr;
75 hwaddr hi = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
76
77 return addr >= low && addr < hi;
78}
79
a896f7f2
SB
80static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
81{
e1731e81 82 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
a896f7f2 83 memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
b4529c5c 84 return;
a896f7f2 85 }
b4529c5c
KJ
86
87 pci_dma_read(&n->parent_obj, addr, buf, size);
a896f7f2
SB
88}
89
f3c507ad
KB
90static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
91{
dce22c86 92 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
f3c507ad
KB
93}
94
95static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
96{
dce22c86 97 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
f3c507ad
KB
98}
99
100static void nvme_inc_cq_tail(NvmeCQueue *cq)
101{
102 cq->tail++;
103 if (cq->tail >= cq->size) {
104 cq->tail = 0;
105 cq->phase = !cq->phase;
106 }
107}
108
109static void nvme_inc_sq_head(NvmeSQueue *sq)
110{
111 sq->head = (sq->head + 1) % sq->size;
112}
113
114static uint8_t nvme_cq_full(NvmeCQueue *cq)
115{
116 return (cq->tail + 1) % cq->size == cq->head;
117}
118
119static uint8_t nvme_sq_empty(NvmeSQueue *sq)
120{
121 return sq->head == sq->tail;
122}
123
5e9aa92e
HN
124static void nvme_irq_check(NvmeCtrl *n)
125{
126 if (msix_enabled(&(n->parent_obj))) {
127 return;
128 }
129 if (~n->bar.intms & n->irq_status) {
130 pci_irq_assert(&n->parent_obj);
131 } else {
132 pci_irq_deassert(&n->parent_obj);
133 }
134}
135
136static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
f3c507ad
KB
137{
138 if (cq->irq_enabled) {
139 if (msix_enabled(&(n->parent_obj))) {
6f4ee2e9 140 trace_pci_nvme_irq_msix(cq->vector);
f3c507ad
KB
141 msix_notify(&(n->parent_obj), cq->vector);
142 } else {
6f4ee2e9 143 trace_pci_nvme_irq_pin();
ca247d35
KJ
144 assert(cq->vector < 32);
145 n->irq_status |= 1 << cq->vector;
5e9aa92e 146 nvme_irq_check(n);
f3c507ad 147 }
1ee24514 148 } else {
6f4ee2e9 149 trace_pci_nvme_irq_masked();
f3c507ad
KB
150 }
151}
152
5e9aa92e
HN
153static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
154{
155 if (cq->irq_enabled) {
156 if (msix_enabled(&(n->parent_obj))) {
157 return;
158 } else {
ca247d35
KJ
159 assert(cq->vector < 32);
160 n->irq_status &= ~(1 << cq->vector);
5e9aa92e
HN
161 nvme_irq_check(n);
162 }
163 }
164}
165
b2b2b67a
SB
166static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
167 uint64_t prp2, uint32_t len, NvmeCtrl *n)
f3c507ad
KB
168{
169 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
170 trans_len = MIN(len, trans_len);
171 int num_prps = (len >> n->page_bits) + 1;
172
1ee24514 173 if (unlikely(!prp1)) {
6f4ee2e9 174 trace_pci_nvme_err_invalid_prp();
f3c507ad 175 return NVME_INVALID_FIELD | NVME_DNR;
e1731e81 176 } else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
b2b2b67a
SB
177 prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
178 qsg->nsg = 0;
179 qemu_iovec_init(iov, num_prps);
180 qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
181 } else {
182 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
183 qemu_sglist_add(qsg, prp1, trans_len);
f3c507ad 184 }
f3c507ad
KB
185 len -= trans_len;
186 if (len) {
1ee24514 187 if (unlikely(!prp2)) {
6f4ee2e9 188 trace_pci_nvme_err_invalid_prp2_missing();
f3c507ad
KB
189 goto unmap;
190 }
191 if (len > n->page_size) {
192 uint64_t prp_list[n->max_prp_ents];
193 uint32_t nents, prp_trans;
194 int i = 0;
195
196 nents = (len + n->page_size - 1) >> n->page_bits;
197 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
b2b2b67a 198 nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
f3c507ad
KB
199 while (len != 0) {
200 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
201
202 if (i == n->max_prp_ents - 1 && len > n->page_size) {
1ee24514 203 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
6f4ee2e9 204 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
f3c507ad
KB
205 goto unmap;
206 }
207
208 i = 0;
209 nents = (len + n->page_size - 1) >> n->page_bits;
210 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
b2b2b67a 211 nvme_addr_read(n, prp_ent, (void *)prp_list,
f3c507ad
KB
212 prp_trans);
213 prp_ent = le64_to_cpu(prp_list[i]);
214 }
215
1ee24514 216 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
6f4ee2e9 217 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
f3c507ad
KB
218 goto unmap;
219 }
220
221 trans_len = MIN(len, n->page_size);
b2b2b67a
SB
222 if (qsg->nsg){
223 qemu_sglist_add(qsg, prp_ent, trans_len);
224 } else {
225 qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
226 }
f3c507ad
KB
227 len -= trans_len;
228 i++;
229 }
230 } else {
1ee24514 231 if (unlikely(prp2 & (n->page_size - 1))) {
6f4ee2e9 232 trace_pci_nvme_err_invalid_prp2_align(prp2);
f3c507ad
KB
233 goto unmap;
234 }
b2b2b67a
SB
235 if (qsg->nsg) {
236 qemu_sglist_add(qsg, prp2, len);
237 } else {
238 qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
239 }
f3c507ad
KB
240 }
241 }
242 return NVME_SUCCESS;
243
244 unmap:
245 qemu_sglist_destroy(qsg);
246 return NVME_INVALID_FIELD | NVME_DNR;
247}
248
3036a626
KH
249static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
250 uint64_t prp1, uint64_t prp2)
251{
252 QEMUSGList qsg;
253 QEMUIOVector iov;
254 uint16_t status = NVME_SUCCESS;
255
256 if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
257 return NVME_INVALID_FIELD | NVME_DNR;
258 }
259 if (qsg.nsg > 0) {
260 if (dma_buf_write(ptr, len, &qsg)) {
261 status = NVME_INVALID_FIELD | NVME_DNR;
262 }
263 qemu_sglist_destroy(&qsg);
264 } else {
265 if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
266 status = NVME_INVALID_FIELD | NVME_DNR;
267 }
268 qemu_iovec_destroy(&iov);
269 }
270 return status;
271}
272
f3c507ad
KB
273static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
274 uint64_t prp1, uint64_t prp2)
275{
276 QEMUSGList qsg;
b2b2b67a
SB
277 QEMUIOVector iov;
278 uint16_t status = NVME_SUCCESS;
f3c507ad 279
6f4ee2e9 280 trace_pci_nvme_dma_read(prp1, prp2);
1ee24514 281
b2b2b67a 282 if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
f3c507ad
KB
283 return NVME_INVALID_FIELD | NVME_DNR;
284 }
b2b2b67a 285 if (qsg.nsg > 0) {
1ee24514 286 if (unlikely(dma_buf_read(ptr, len, &qsg))) {
6f4ee2e9 287 trace_pci_nvme_err_invalid_dma();
b2b2b67a
SB
288 status = NVME_INVALID_FIELD | NVME_DNR;
289 }
f3c507ad 290 qemu_sglist_destroy(&qsg);
b2b2b67a 291 } else {
25349e82 292 if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) {
6f4ee2e9 293 trace_pci_nvme_err_invalid_dma();
b2b2b67a
SB
294 status = NVME_INVALID_FIELD | NVME_DNR;
295 }
296 qemu_iovec_destroy(&iov);
f3c507ad 297 }
b2b2b67a 298 return status;
f3c507ad
KB
299}
300
301static void nvme_post_cqes(void *opaque)
302{
303 NvmeCQueue *cq = opaque;
304 NvmeCtrl *n = cq->ctrl;
305 NvmeRequest *req, *next;
306
307 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
308 NvmeSQueue *sq;
309 hwaddr addr;
310
311 if (nvme_cq_full(cq)) {
312 break;
313 }
314
315 QTAILQ_REMOVE(&cq->req_list, req, entry);
316 sq = req->sq;
317 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
318 req->cqe.sq_id = cpu_to_le16(sq->sqid);
319 req->cqe.sq_head = cpu_to_le16(sq->head);
320 addr = cq->dma_addr + cq->tail * n->cqe_size;
321 nvme_inc_cq_tail(cq);
322 pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
323 sizeof(req->cqe));
324 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
325 }
6da02181
KB
326 if (cq->tail != cq->head) {
327 nvme_irq_assert(n, cq);
328 }
f3c507ad
KB
329}
330
331static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
332{
333 assert(cq->cqid == req->sq->cqid);
334 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
335 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
bc72ad67 336 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
f3c507ad
KB
337}
338
339static void nvme_rw_cb(void *opaque, int ret)
340{
341 NvmeRequest *req = opaque;
342 NvmeSQueue *sq = req->sq;
343 NvmeCtrl *n = sq->ctrl;
344 NvmeCQueue *cq = n->cq[sq->cqid];
345
f3c507ad 346 if (!ret) {
1753f3dc 347 block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
f3c507ad
KB
348 req->status = NVME_SUCCESS;
349 } else {
1753f3dc 350 block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
f3c507ad
KB
351 req->status = NVME_INTERNAL_DEV_ERROR;
352 }
8b9d74e0
CH
353 if (req->has_sg) {
354 qemu_sglist_destroy(&req->qsg);
355 }
f3c507ad
KB
356 nvme_enqueue_req_completion(cq, req);
357}
358
8b9d74e0
CH
359static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
360 NvmeRequest *req)
361{
362 req->has_sg = false;
363 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
364 BLOCK_ACCT_FLUSH);
365 req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
366
367 return NVME_NO_COMPLETE;
368}
369
c03e7ef1
CH
370static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
371 NvmeRequest *req)
372{
373 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
374 const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
375 const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
376 uint64_t slba = le64_to_cpu(rw->slba);
377 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
9d6459d2
KB
378 uint64_t offset = slba << data_shift;
379 uint32_t count = nlb << data_shift;
c03e7ef1 380
1ee24514 381 if (unlikely(slba + nlb > ns->id_ns.nsze)) {
6f4ee2e9 382 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
c03e7ef1
CH
383 return NVME_LBA_RANGE | NVME_DNR;
384 }
385
386 req->has_sg = false;
387 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
388 BLOCK_ACCT_WRITE);
9d6459d2 389 req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
c03e7ef1
CH
390 BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
391 return NVME_NO_COMPLETE;
392}
393
f3c507ad
KB
394static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
395 NvmeRequest *req)
396{
397 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
398 uint32_t nlb = le32_to_cpu(rw->nlb) + 1;
399 uint64_t slba = le64_to_cpu(rw->slba);
400 uint64_t prp1 = le64_to_cpu(rw->prp1);
401 uint64_t prp2 = le64_to_cpu(rw->prp2);
402
403 uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
404 uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
2115f2a1 405 uint64_t data_size = (uint64_t)nlb << data_shift;
cbe0ed62 406 uint64_t data_offset = slba << data_shift;
f3c507ad 407 int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
1753f3dc 408 enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
f3c507ad 409
6f4ee2e9 410 trace_pci_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
1ee24514
DG
411
412 if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
1753f3dc 413 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
6f4ee2e9 414 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
f3c507ad
KB
415 return NVME_LBA_RANGE | NVME_DNR;
416 }
1753f3dc 417
b2b2b67a 418 if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
1753f3dc 419 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
f3c507ad
KB
420 return NVME_INVALID_FIELD | NVME_DNR;
421 }
1753f3dc 422
1753f3dc 423 dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
b2b2b67a
SB
424 if (req->qsg.nsg > 0) {
425 req->has_sg = true;
426 req->aiocb = is_write ?
427 dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
428 nvme_rw_cb, req) :
429 dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
430 nvme_rw_cb, req);
431 } else {
432 req->has_sg = false;
433 req->aiocb = is_write ?
434 blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
435 req) :
436 blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
437 req);
438 }
f3c507ad
KB
439
440 return NVME_NO_COMPLETE;
441}
442
443static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
444{
445 NvmeNamespace *ns;
446 uint32_t nsid = le32_to_cpu(cmd->nsid);
447
1ee24514 448 if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
6f4ee2e9 449 trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces);
f3c507ad
KB
450 return NVME_INVALID_NSID | NVME_DNR;
451 }
452
453 ns = &n->namespaces[nsid - 1];
454 switch (cmd->opcode) {
455 case NVME_CMD_FLUSH:
8b9d74e0 456 return nvme_flush(n, ns, cmd, req);
c03e7ef1
CH
457 case NVME_CMD_WRITE_ZEROS:
458 return nvme_write_zeros(n, ns, cmd, req);
f3c507ad
KB
459 case NVME_CMD_WRITE:
460 case NVME_CMD_READ:
461 return nvme_rw(n, ns, cmd, req);
462 default:
6f4ee2e9 463 trace_pci_nvme_err_invalid_opc(cmd->opcode);
f3c507ad
KB
464 return NVME_INVALID_OPCODE | NVME_DNR;
465 }
466}
467
468static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
469{
470 n->sq[sq->sqid] = NULL;
bc72ad67
AB
471 timer_del(sq->timer);
472 timer_free(sq->timer);
f3c507ad
KB
473 g_free(sq->io_req);
474 if (sq->sqid) {
475 g_free(sq);
476 }
477}
478
479static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
480{
481 NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
482 NvmeRequest *req, *next;
483 NvmeSQueue *sq;
484 NvmeCQueue *cq;
485 uint16_t qid = le16_to_cpu(c->qid);
486
1ee24514 487 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
6f4ee2e9 488 trace_pci_nvme_err_invalid_del_sq(qid);
f3c507ad
KB
489 return NVME_INVALID_QID | NVME_DNR;
490 }
491
6f4ee2e9 492 trace_pci_nvme_del_sq(qid);
1ee24514 493
f3c507ad
KB
494 sq = n->sq[qid];
495 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
496 req = QTAILQ_FIRST(&sq->out_req_list);
497 assert(req->aiocb);
4be74634 498 blk_aio_cancel(req->aiocb);
f3c507ad
KB
499 }
500 if (!nvme_check_cqid(n, sq->cqid)) {
501 cq = n->cq[sq->cqid];
502 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
503
504 nvme_post_cqes(cq);
505 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
506 if (req->sq == sq) {
507 QTAILQ_REMOVE(&cq->req_list, req, entry);
508 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
509 }
510 }
511 }
512
513 nvme_free_sq(sq, n);
514 return NVME_SUCCESS;
515}
516
517static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
518 uint16_t sqid, uint16_t cqid, uint16_t size)
519{
520 int i;
521 NvmeCQueue *cq;
522
523 sq->ctrl = n;
524 sq->dma_addr = dma_addr;
525 sq->sqid = sqid;
526 sq->size = size;
527 sq->cqid = cqid;
528 sq->head = sq->tail = 0;
02c4f26b 529 sq->io_req = g_new(NvmeRequest, sq->size);
f3c507ad
KB
530
531 QTAILQ_INIT(&sq->req_list);
532 QTAILQ_INIT(&sq->out_req_list);
533 for (i = 0; i < sq->size; i++) {
534 sq->io_req[i].sq = sq;
535 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
536 }
bc72ad67 537 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
f3c507ad
KB
538
539 assert(n->cq[cqid]);
540 cq = n->cq[cqid];
541 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
542 n->sq[sqid] = sq;
543}
544
545static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
546{
547 NvmeSQueue *sq;
548 NvmeCreateSq *c = (NvmeCreateSq *)cmd;
549
550 uint16_t cqid = le16_to_cpu(c->cqid);
551 uint16_t sqid = le16_to_cpu(c->sqid);
552 uint16_t qsize = le16_to_cpu(c->qsize);
553 uint16_t qflags = le16_to_cpu(c->sq_flags);
554 uint64_t prp1 = le64_to_cpu(c->prp1);
555
6f4ee2e9 556 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
1ee24514
DG
557
558 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
6f4ee2e9 559 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
f3c507ad
KB
560 return NVME_INVALID_CQID | NVME_DNR;
561 }
1ee24514 562 if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
6f4ee2e9 563 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
f3c507ad
KB
564 return NVME_INVALID_QID | NVME_DNR;
565 }
1ee24514 566 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
6f4ee2e9 567 trace_pci_nvme_err_invalid_create_sq_size(qsize);
f3c507ad
KB
568 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
569 }
1ee24514 570 if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
6f4ee2e9 571 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
f3c507ad
KB
572 return NVME_INVALID_FIELD | NVME_DNR;
573 }
1ee24514 574 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
6f4ee2e9 575 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
f3c507ad
KB
576 return NVME_INVALID_FIELD | NVME_DNR;
577 }
578 sq = g_malloc0(sizeof(*sq));
579 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
580 return NVME_SUCCESS;
581}
582
583static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
584{
585 n->cq[cq->cqid] = NULL;
bc72ad67
AB
586 timer_del(cq->timer);
587 timer_free(cq->timer);
f3c507ad
KB
588 msix_vector_unuse(&n->parent_obj, cq->vector);
589 if (cq->cqid) {
590 g_free(cq);
591 }
592}
593
594static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
595{
596 NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
597 NvmeCQueue *cq;
598 uint16_t qid = le16_to_cpu(c->qid);
599
1ee24514 600 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
6f4ee2e9 601 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
f3c507ad
KB
602 return NVME_INVALID_CQID | NVME_DNR;
603 }
604
605 cq = n->cq[qid];
1ee24514 606 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
6f4ee2e9 607 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
f3c507ad
KB
608 return NVME_INVALID_QUEUE_DEL;
609 }
ad3a7e45 610 nvme_irq_deassert(n, cq);
6f4ee2e9 611 trace_pci_nvme_del_cq(qid);
f3c507ad
KB
612 nvme_free_cq(cq, n);
613 return NVME_SUCCESS;
614}
615
616static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
617 uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
618{
fbf2e537
PMD
619 int ret;
620
621 ret = msix_vector_use(&n->parent_obj, vector);
622 assert(ret == 0);
f3c507ad
KB
623 cq->ctrl = n;
624 cq->cqid = cqid;
625 cq->size = size;
626 cq->dma_addr = dma_addr;
627 cq->phase = 1;
628 cq->irq_enabled = irq_enabled;
629 cq->vector = vector;
630 cq->head = cq->tail = 0;
631 QTAILQ_INIT(&cq->req_list);
632 QTAILQ_INIT(&cq->sq_list);
f3c507ad 633 n->cq[cqid] = cq;
bc72ad67 634 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
f3c507ad
KB
635}
636
637static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
638{
639 NvmeCQueue *cq;
640 NvmeCreateCq *c = (NvmeCreateCq *)cmd;
641 uint16_t cqid = le16_to_cpu(c->cqid);
642 uint16_t vector = le16_to_cpu(c->irq_vector);
643 uint16_t qsize = le16_to_cpu(c->qsize);
644 uint16_t qflags = le16_to_cpu(c->cq_flags);
645 uint64_t prp1 = le64_to_cpu(c->prp1);
646
6f4ee2e9
KJ
647 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
648 NVME_CQ_FLAGS_IEN(qflags) != 0);
1ee24514
DG
649
650 if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
6f4ee2e9 651 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
f3c507ad
KB
652 return NVME_INVALID_CQID | NVME_DNR;
653 }
1ee24514 654 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
6f4ee2e9 655 trace_pci_nvme_err_invalid_create_cq_size(qsize);
f3c507ad
KB
656 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
657 }
1ee24514 658 if (unlikely(!prp1)) {
6f4ee2e9 659 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
f3c507ad
KB
660 return NVME_INVALID_FIELD | NVME_DNR;
661 }
ca247d35
KJ
662 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
663 trace_pci_nvme_err_invalid_create_cq_vector(vector);
664 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
665 }
6a25a4b4 666 if (unlikely(vector >= n->params.msix_qsize)) {
6f4ee2e9 667 trace_pci_nvme_err_invalid_create_cq_vector(vector);
f3c507ad
KB
668 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
669 }
1ee24514 670 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
6f4ee2e9 671 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
f3c507ad
KB
672 return NVME_INVALID_FIELD | NVME_DNR;
673 }
674
675 cq = g_malloc0(sizeof(*cq));
676 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
677 NVME_CQ_FLAGS_IEN(qflags));
678 return NVME_SUCCESS;
679}
680
03035a23
CH
681static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
682{
683 uint64_t prp1 = le64_to_cpu(c->prp1);
684 uint64_t prp2 = le64_to_cpu(c->prp2);
685
6f4ee2e9 686 trace_pci_nvme_identify_ctrl();
1ee24514 687
03035a23
CH
688 return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
689 prp1, prp2);
690}
691
692static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
f3c507ad
KB
693{
694 NvmeNamespace *ns;
f3c507ad
KB
695 uint32_t nsid = le32_to_cpu(c->nsid);
696 uint64_t prp1 = le64_to_cpu(c->prp1);
697 uint64_t prp2 = le64_to_cpu(c->prp2);
698
6f4ee2e9 699 trace_pci_nvme_identify_ns(nsid);
1ee24514
DG
700
701 if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
6f4ee2e9 702 trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces);
f3c507ad
KB
703 return NVME_INVALID_NSID | NVME_DNR;
704 }
705
706 ns = &n->namespaces[nsid - 1];
1ee24514 707
f3c507ad
KB
708 return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
709 prp1, prp2);
710}
711
03035a23
CH
712static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
713{
3e829fd4 714 static const int data_len = NVME_IDENTIFY_DATA_SIZE;
03035a23
CH
715 uint32_t min_nsid = le32_to_cpu(c->nsid);
716 uint64_t prp1 = le64_to_cpu(c->prp1);
717 uint64_t prp2 = le64_to_cpu(c->prp2);
718 uint32_t *list;
719 uint16_t ret;
720 int i, j = 0;
721
6f4ee2e9 722 trace_pci_nvme_identify_nslist(min_nsid);
1ee24514 723
03035a23
CH
724 list = g_malloc0(data_len);
725 for (i = 0; i < n->num_namespaces; i++) {
726 if (i < min_nsid) {
727 continue;
728 }
729 list[j++] = cpu_to_le32(i + 1);
730 if (j == data_len / sizeof(uint32_t)) {
731 break;
732 }
733 }
734 ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2);
735 g_free(list);
736 return ret;
737}
738
03035a23
CH
739static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
740{
741 NvmeIdentify *c = (NvmeIdentify *)cmd;
742
743 switch (le32_to_cpu(c->cns)) {
3e829fd4 744 case NVME_ID_CNS_NS:
03035a23 745 return nvme_identify_ns(n, c);
3e829fd4 746 case NVME_ID_CNS_CTRL:
03035a23 747 return nvme_identify_ctrl(n, c);
3e829fd4 748 case NVME_ID_CNS_NS_ACTIVE_LIST:
03035a23
CH
749 return nvme_identify_nslist(n, c);
750 default:
6f4ee2e9 751 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
03035a23
CH
752 return NVME_INVALID_FIELD | NVME_DNR;
753 }
754}
755
3036a626
KH
756static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
757{
6f4ee2e9 758 trace_pci_nvme_setfeat_timestamp(ts);
3036a626
KH
759
760 n->host_timestamp = le64_to_cpu(ts);
761 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
762}
763
764static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
765{
766 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
767 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
768
769 union nvme_timestamp {
770 struct {
771 uint64_t timestamp:48;
772 uint64_t sync:1;
773 uint64_t origin:3;
774 uint64_t rsvd1:12;
775 };
776 uint64_t all;
777 };
778
779 union nvme_timestamp ts;
780 ts.all = 0;
781
782 /*
783 * If the sum of the Timestamp value set by the host and the elapsed
784 * time exceeds 2^48, the value returned should be reduced modulo 2^48.
785 */
786 ts.timestamp = (n->host_timestamp + elapsed_time) & 0xffffffffffff;
787
788 /* If the host timestamp is non-zero, set the timestamp origin */
789 ts.origin = n->host_timestamp ? 0x01 : 0x00;
790
6f4ee2e9 791 trace_pci_nvme_getfeat_timestamp(ts.all);
3036a626
KH
792
793 return cpu_to_le64(ts.all);
794}
795
796static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd)
797{
798 uint64_t prp1 = le64_to_cpu(cmd->prp1);
799 uint64_t prp2 = le64_to_cpu(cmd->prp2);
800
801 uint64_t timestamp = nvme_get_timestamp(n);
802
803 return nvme_dma_read_prp(n, (uint8_t *)&timestamp,
804 sizeof(timestamp), prp1, prp2);
805}
806
f3c507ad
KB
807static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
808{
809 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
30349fd0 810 uint32_t result;
f3c507ad
KB
811
812 switch (dw10) {
aacd5650 813 case NVME_VOLATILE_WRITE_CACHE:
30349fd0 814 result = blk_enable_write_cache(n->conf.blk);
6f4ee2e9 815 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
30349fd0
CH
816 break;
817 case NVME_NUMBER_OF_QUEUES:
dce22c86
KJ
818 result = cpu_to_le32((n->params.max_ioqpairs - 1) |
819 ((n->params.max_ioqpairs - 1) << 16));
6f4ee2e9 820 trace_pci_nvme_getfeat_numq(result);
aacd5650 821 break;
3036a626
KH
822 case NVME_TIMESTAMP:
823 return nvme_get_feature_timestamp(n, cmd);
f3c507ad 824 default:
6f4ee2e9 825 trace_pci_nvme_err_invalid_getfeat(dw10);
f3c507ad
KB
826 return NVME_INVALID_FIELD | NVME_DNR;
827 }
30349fd0
CH
828
829 req->cqe.result = result;
f3c507ad
KB
830 return NVME_SUCCESS;
831}
832
3036a626
KH
833static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd)
834{
835 uint16_t ret;
836 uint64_t timestamp;
837 uint64_t prp1 = le64_to_cpu(cmd->prp1);
838 uint64_t prp2 = le64_to_cpu(cmd->prp2);
839
840 ret = nvme_dma_write_prp(n, (uint8_t *)&timestamp,
841 sizeof(timestamp), prp1, prp2);
842 if (ret != NVME_SUCCESS) {
843 return ret;
844 }
845
846 nvme_set_timestamp(n, timestamp);
847
848 return NVME_SUCCESS;
849}
850
f3c507ad
KB
851static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
852{
853 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
30349fd0 854 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
f3c507ad
KB
855
856 switch (dw10) {
30349fd0
CH
857 case NVME_VOLATILE_WRITE_CACHE:
858 blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
859 break;
f3c507ad 860 case NVME_NUMBER_OF_QUEUES:
6f4ee2e9
KJ
861 trace_pci_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
862 ((dw11 >> 16) & 0xFFFF) + 1,
dce22c86
KJ
863 n->params.max_ioqpairs,
864 n->params.max_ioqpairs);
865 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
866 ((n->params.max_ioqpairs - 1) << 16));
f3c507ad 867 break;
3036a626
KH
868 case NVME_TIMESTAMP:
869 return nvme_set_feature_timestamp(n, cmd);
f3c507ad 870 default:
6f4ee2e9 871 trace_pci_nvme_err_invalid_setfeat(dw10);
f3c507ad
KB
872 return NVME_INVALID_FIELD | NVME_DNR;
873 }
874 return NVME_SUCCESS;
875}
876
877static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
878{
879 switch (cmd->opcode) {
880 case NVME_ADM_CMD_DELETE_SQ:
881 return nvme_del_sq(n, cmd);
882 case NVME_ADM_CMD_CREATE_SQ:
883 return nvme_create_sq(n, cmd);
884 case NVME_ADM_CMD_DELETE_CQ:
885 return nvme_del_cq(n, cmd);
886 case NVME_ADM_CMD_CREATE_CQ:
887 return nvme_create_cq(n, cmd);
888 case NVME_ADM_CMD_IDENTIFY:
889 return nvme_identify(n, cmd);
890 case NVME_ADM_CMD_SET_FEATURES:
891 return nvme_set_feature(n, cmd, req);
892 case NVME_ADM_CMD_GET_FEATURES:
893 return nvme_get_feature(n, cmd, req);
894 default:
6f4ee2e9 895 trace_pci_nvme_err_invalid_admin_opc(cmd->opcode);
f3c507ad
KB
896 return NVME_INVALID_OPCODE | NVME_DNR;
897 }
898}
899
900static void nvme_process_sq(void *opaque)
901{
902 NvmeSQueue *sq = opaque;
903 NvmeCtrl *n = sq->ctrl;
904 NvmeCQueue *cq = n->cq[sq->cqid];
905
906 uint16_t status;
907 hwaddr addr;
908 NvmeCmd cmd;
909 NvmeRequest *req;
910
911 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
912 addr = sq->dma_addr + sq->head * n->sqe_size;
a896f7f2 913 nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
f3c507ad
KB
914 nvme_inc_sq_head(sq);
915
916 req = QTAILQ_FIRST(&sq->req_list);
917 QTAILQ_REMOVE(&sq->req_list, req, entry);
918 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
919 memset(&req->cqe, 0, sizeof(req->cqe));
920 req->cqe.cid = cmd.cid;
921
922 status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
923 nvme_admin_cmd(n, &cmd, req);
924 if (status != NVME_NO_COMPLETE) {
925 req->status = status;
926 nvme_enqueue_req_completion(cq, req);
927 }
928 }
929}
930
931static void nvme_clear_ctrl(NvmeCtrl *n)
932{
933 int i;
934
6bf74636
ID
935 blk_drain(n->conf.blk);
936
dce22c86 937 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
f3c507ad
KB
938 if (n->sq[i] != NULL) {
939 nvme_free_sq(n->sq[i], n);
940 }
941 }
dce22c86 942 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
f3c507ad
KB
943 if (n->cq[i] != NULL) {
944 nvme_free_cq(n->cq[i], n);
945 }
946 }
947
4be74634 948 blk_flush(n->conf.blk);
f3c507ad
KB
949 n->bar.cc = 0;
950}
951
952static int nvme_start_ctrl(NvmeCtrl *n)
953{
954 uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
955 uint32_t page_size = 1 << page_bits;
956
1ee24514 957 if (unlikely(n->cq[0])) {
6f4ee2e9 958 trace_pci_nvme_err_startfail_cq();
1ee24514
DG
959 return -1;
960 }
961 if (unlikely(n->sq[0])) {
6f4ee2e9 962 trace_pci_nvme_err_startfail_sq();
1ee24514
DG
963 return -1;
964 }
965 if (unlikely(!n->bar.asq)) {
6f4ee2e9 966 trace_pci_nvme_err_startfail_nbarasq();
1ee24514
DG
967 return -1;
968 }
969 if (unlikely(!n->bar.acq)) {
6f4ee2e9 970 trace_pci_nvme_err_startfail_nbaracq();
1ee24514
DG
971 return -1;
972 }
973 if (unlikely(n->bar.asq & (page_size - 1))) {
6f4ee2e9 974 trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq);
1ee24514
DG
975 return -1;
976 }
977 if (unlikely(n->bar.acq & (page_size - 1))) {
6f4ee2e9 978 trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq);
1ee24514
DG
979 return -1;
980 }
981 if (unlikely(NVME_CC_MPS(n->bar.cc) <
982 NVME_CAP_MPSMIN(n->bar.cap))) {
6f4ee2e9 983 trace_pci_nvme_err_startfail_page_too_small(
1ee24514
DG
984 NVME_CC_MPS(n->bar.cc),
985 NVME_CAP_MPSMIN(n->bar.cap));
986 return -1;
987 }
988 if (unlikely(NVME_CC_MPS(n->bar.cc) >
989 NVME_CAP_MPSMAX(n->bar.cap))) {
6f4ee2e9 990 trace_pci_nvme_err_startfail_page_too_large(
1ee24514
DG
991 NVME_CC_MPS(n->bar.cc),
992 NVME_CAP_MPSMAX(n->bar.cap));
993 return -1;
994 }
995 if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
996 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
6f4ee2e9 997 trace_pci_nvme_err_startfail_cqent_too_small(
1ee24514
DG
998 NVME_CC_IOCQES(n->bar.cc),
999 NVME_CTRL_CQES_MIN(n->bar.cap));
1000 return -1;
1001 }
1002 if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
1003 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
6f4ee2e9 1004 trace_pci_nvme_err_startfail_cqent_too_large(
1ee24514
DG
1005 NVME_CC_IOCQES(n->bar.cc),
1006 NVME_CTRL_CQES_MAX(n->bar.cap));
1007 return -1;
1008 }
1009 if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
1010 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
6f4ee2e9 1011 trace_pci_nvme_err_startfail_sqent_too_small(
1ee24514
DG
1012 NVME_CC_IOSQES(n->bar.cc),
1013 NVME_CTRL_SQES_MIN(n->bar.cap));
1014 return -1;
1015 }
1016 if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
1017 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
6f4ee2e9 1018 trace_pci_nvme_err_startfail_sqent_too_large(
1ee24514
DG
1019 NVME_CC_IOSQES(n->bar.cc),
1020 NVME_CTRL_SQES_MAX(n->bar.cap));
1021 return -1;
1022 }
1023 if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
6f4ee2e9 1024 trace_pci_nvme_err_startfail_asqent_sz_zero();
1ee24514
DG
1025 return -1;
1026 }
1027 if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
6f4ee2e9 1028 trace_pci_nvme_err_startfail_acqent_sz_zero();
f3c507ad
KB
1029 return -1;
1030 }
1031
1032 n->page_bits = page_bits;
1033 n->page_size = page_size;
1034 n->max_prp_ents = n->page_size / sizeof(uint64_t);
1035 n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
1036 n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
1037 nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
1038 NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
1039 nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
1040 NVME_AQA_ASQS(n->bar.aqa) + 1);
1041
3036a626
KH
1042 nvme_set_timestamp(n, 0ULL);
1043
f3c507ad
KB
1044 return 0;
1045}
1046
1047static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
1048 unsigned size)
1049{
1ee24514 1050 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
6f4ee2e9 1051 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
1ee24514
DG
1052 "MMIO write not 32-bit aligned,"
1053 " offset=0x%"PRIx64"", offset);
1054 /* should be ignored, fall through for now */
1055 }
1056
1057 if (unlikely(size < sizeof(uint32_t))) {
6f4ee2e9 1058 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
1ee24514
DG
1059 "MMIO write smaller than 32-bits,"
1060 " offset=0x%"PRIx64", size=%u",
1061 offset, size);
1062 /* should be ignored, fall through for now */
1063 }
1064
f3c507ad 1065 switch (offset) {
1ee24514
DG
1066 case 0xc: /* INTMS */
1067 if (unlikely(msix_enabled(&(n->parent_obj)))) {
6f4ee2e9 1068 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
1ee24514
DG
1069 "undefined access to interrupt mask set"
1070 " when MSI-X is enabled");
1071 /* should be ignored, fall through for now */
1072 }
f3c507ad
KB
1073 n->bar.intms |= data & 0xffffffff;
1074 n->bar.intmc = n->bar.intms;
6f4ee2e9 1075 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc);
5e9aa92e 1076 nvme_irq_check(n);
f3c507ad 1077 break;
1ee24514
DG
1078 case 0x10: /* INTMC */
1079 if (unlikely(msix_enabled(&(n->parent_obj)))) {
6f4ee2e9 1080 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
1ee24514
DG
1081 "undefined access to interrupt mask clr"
1082 " when MSI-X is enabled");
1083 /* should be ignored, fall through for now */
1084 }
f3c507ad
KB
1085 n->bar.intms &= ~(data & 0xffffffff);
1086 n->bar.intmc = n->bar.intms;
6f4ee2e9 1087 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc);
5e9aa92e 1088 nvme_irq_check(n);
f3c507ad 1089 break;
1ee24514 1090 case 0x14: /* CC */
6f4ee2e9 1091 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
4a4d614f
DS
1092 /* Windows first sends data, then sends enable bit */
1093 if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
1094 !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
1095 {
1096 n->bar.cc = data;
1097 }
1098
f3c507ad
KB
1099 if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
1100 n->bar.cc = data;
1ee24514 1101 if (unlikely(nvme_start_ctrl(n))) {
6f4ee2e9 1102 trace_pci_nvme_err_startfail();
f3c507ad
KB
1103 n->bar.csts = NVME_CSTS_FAILED;
1104 } else {
6f4ee2e9 1105 trace_pci_nvme_mmio_start_success();
f3c507ad
KB
1106 n->bar.csts = NVME_CSTS_READY;
1107 }
1108 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
6f4ee2e9 1109 trace_pci_nvme_mmio_stopped();
f3c507ad
KB
1110 nvme_clear_ctrl(n);
1111 n->bar.csts &= ~NVME_CSTS_READY;
1112 }
1113 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
6f4ee2e9 1114 trace_pci_nvme_mmio_shutdown_set();
1ee24514
DG
1115 nvme_clear_ctrl(n);
1116 n->bar.cc = data;
1117 n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
f3c507ad 1118 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
6f4ee2e9 1119 trace_pci_nvme_mmio_shutdown_cleared();
1ee24514
DG
1120 n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
1121 n->bar.cc = data;
1122 }
1123 break;
1124 case 0x1C: /* CSTS */
1125 if (data & (1 << 4)) {
6f4ee2e9 1126 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
1ee24514
DG
1127 "attempted to W1C CSTS.NSSRO"
1128 " but CAP.NSSRS is zero (not supported)");
1129 } else if (data != 0) {
6f4ee2e9 1130 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
1ee24514
DG
1131 "attempted to set a read only bit"
1132 " of controller status");
1133 }
1134 break;
1135 case 0x20: /* NSSR */
1136 if (data == 0x4E564D65) {
6f4ee2e9 1137 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
1ee24514
DG
1138 } else {
1139 /* The spec says that writes of other values have no effect */
1140 return;
f3c507ad
KB
1141 }
1142 break;
1ee24514 1143 case 0x24: /* AQA */
f3c507ad 1144 n->bar.aqa = data & 0xffffffff;
6f4ee2e9 1145 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
f3c507ad 1146 break;
1ee24514 1147 case 0x28: /* ASQ */
f3c507ad 1148 n->bar.asq = data;
6f4ee2e9 1149 trace_pci_nvme_mmio_asqaddr(data);
f3c507ad 1150 break;
1ee24514 1151 case 0x2c: /* ASQ hi */
f3c507ad 1152 n->bar.asq |= data << 32;
6f4ee2e9 1153 trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
f3c507ad 1154 break;
1ee24514 1155 case 0x30: /* ACQ */
6f4ee2e9 1156 trace_pci_nvme_mmio_acqaddr(data);
f3c507ad
KB
1157 n->bar.acq = data;
1158 break;
1ee24514 1159 case 0x34: /* ACQ hi */
f3c507ad 1160 n->bar.acq |= data << 32;
6f4ee2e9 1161 trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
f3c507ad 1162 break;
1ee24514 1163 case 0x38: /* CMBLOC */
6f4ee2e9 1164 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
1ee24514
DG
1165 "invalid write to reserved CMBLOC"
1166 " when CMBSZ is zero, ignored");
1167 return;
1168 case 0x3C: /* CMBSZ */
6f4ee2e9 1169 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
1ee24514
DG
1170 "invalid write to read only CMBSZ, ignored");
1171 return;
6cf94132 1172 case 0xE00: /* PMRCAP */
6f4ee2e9 1173 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
6cf94132
AJ
1174 "invalid write to PMRCAP register, ignored");
1175 return;
1176 case 0xE04: /* TODO PMRCTL */
1177 break;
1178 case 0xE08: /* PMRSTS */
6f4ee2e9 1179 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
6cf94132
AJ
1180 "invalid write to PMRSTS register, ignored");
1181 return;
1182 case 0xE0C: /* PMREBS */
6f4ee2e9 1183 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
6cf94132
AJ
1184 "invalid write to PMREBS register, ignored");
1185 return;
1186 case 0xE10: /* PMRSWTP */
6f4ee2e9 1187 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
6cf94132
AJ
1188 "invalid write to PMRSWTP register, ignored");
1189 return;
1190 case 0xE14: /* TODO PMRMSC */
1191 break;
f3c507ad 1192 default:
6f4ee2e9 1193 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
1ee24514
DG
1194 "invalid MMIO write,"
1195 " offset=0x%"PRIx64", data=%"PRIx64"",
1196 offset, data);
f3c507ad
KB
1197 break;
1198 }
1199}
1200
1201static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
1202{
1203 NvmeCtrl *n = (NvmeCtrl *)opaque;
1204 uint8_t *ptr = (uint8_t *)&n->bar;
1205 uint64_t val = 0;
1206
1ee24514 1207 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
6f4ee2e9 1208 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
1ee24514
DG
1209 "MMIO read not 32-bit aligned,"
1210 " offset=0x%"PRIx64"", addr);
1211 /* should RAZ, fall through for now */
1212 } else if (unlikely(size < sizeof(uint32_t))) {
6f4ee2e9 1213 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
1ee24514
DG
1214 "MMIO read smaller than 32-bits,"
1215 " offset=0x%"PRIx64"", addr);
1216 /* should RAZ, fall through for now */
1217 }
1218
f3c507ad 1219 if (addr < sizeof(n->bar)) {
6cf94132
AJ
1220 /*
1221 * When PMRWBM bit 1 is set then read from
1222 * from PMRSTS should ensure prior writes
1223 * made it to persistent media
1224 */
1225 if (addr == 0xE08 &&
1226 (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
bc2a2364 1227 memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size);
6cf94132 1228 }
f3c507ad 1229 memcpy(&val, ptr + addr, size);
1ee24514 1230 } else {
6f4ee2e9 1231 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
1ee24514
DG
1232 "MMIO read beyond last register,"
1233 " offset=0x%"PRIx64", returning 0", addr);
f3c507ad 1234 }
1ee24514 1235
f3c507ad
KB
1236 return val;
1237}
1238
1239static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
1240{
1241 uint32_t qid;
1242
1ee24514 1243 if (unlikely(addr & ((1 << 2) - 1))) {
6f4ee2e9 1244 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
1ee24514
DG
1245 "doorbell write not 32-bit aligned,"
1246 " offset=0x%"PRIx64", ignoring", addr);
f3c507ad
KB
1247 return;
1248 }
1249
1250 if (((addr - 0x1000) >> 2) & 1) {
1ee24514
DG
1251 /* Completion queue doorbell write */
1252
f3c507ad
KB
1253 uint16_t new_head = val & 0xffff;
1254 int start_sqs;
1255 NvmeCQueue *cq;
1256
1257 qid = (addr - (0x1000 + (1 << 2))) >> 3;
1ee24514 1258 if (unlikely(nvme_check_cqid(n, qid))) {
6f4ee2e9 1259 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
1ee24514
DG
1260 "completion queue doorbell write"
1261 " for nonexistent queue,"
1262 " sqid=%"PRIu32", ignoring", qid);
f3c507ad
KB
1263 return;
1264 }
1265
1266 cq = n->cq[qid];
1ee24514 1267 if (unlikely(new_head >= cq->size)) {
6f4ee2e9 1268 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
1ee24514
DG
1269 "completion queue doorbell write value"
1270 " beyond queue size, sqid=%"PRIu32","
1271 " new_head=%"PRIu16", ignoring",
1272 qid, new_head);
f3c507ad
KB
1273 return;
1274 }
1275
1276 start_sqs = nvme_cq_full(cq) ? 1 : 0;
1277 cq->head = new_head;
1278 if (start_sqs) {
1279 NvmeSQueue *sq;
1280 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
bc72ad67 1281 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
f3c507ad 1282 }
bc72ad67 1283 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
f3c507ad
KB
1284 }
1285
5e9aa92e
HN
1286 if (cq->tail == cq->head) {
1287 nvme_irq_deassert(n, cq);
f3c507ad
KB
1288 }
1289 } else {
1ee24514
DG
1290 /* Submission queue doorbell write */
1291
f3c507ad
KB
1292 uint16_t new_tail = val & 0xffff;
1293 NvmeSQueue *sq;
1294
1295 qid = (addr - 0x1000) >> 3;
1ee24514 1296 if (unlikely(nvme_check_sqid(n, qid))) {
6f4ee2e9 1297 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
1ee24514
DG
1298 "submission queue doorbell write"
1299 " for nonexistent queue,"
1300 " sqid=%"PRIu32", ignoring", qid);
f3c507ad
KB
1301 return;
1302 }
1303
1304 sq = n->sq[qid];
1ee24514 1305 if (unlikely(new_tail >= sq->size)) {
6f4ee2e9 1306 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
1ee24514
DG
1307 "submission queue doorbell write value"
1308 " beyond queue size, sqid=%"PRIu32","
1309 " new_tail=%"PRIu16", ignoring",
1310 qid, new_tail);
f3c507ad
KB
1311 return;
1312 }
1313
1314 sq->tail = new_tail;
bc72ad67 1315 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
f3c507ad
KB
1316 }
1317}
1318
1319static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
1320 unsigned size)
1321{
1322 NvmeCtrl *n = (NvmeCtrl *)opaque;
1323 if (addr < sizeof(n->bar)) {
1324 nvme_write_bar(n, addr, data, size);
1325 } else if (addr >= 0x1000) {
1326 nvme_process_db(n, addr, data);
1327 }
1328}
1329
1330static const MemoryRegionOps nvme_mmio_ops = {
1331 .read = nvme_mmio_read,
1332 .write = nvme_mmio_write,
1333 .endianness = DEVICE_LITTLE_ENDIAN,
1334 .impl = {
1335 .min_access_size = 2,
1336 .max_access_size = 8,
1337 },
1338};
1339
a896f7f2
SB
1340static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
1341 unsigned size)
1342{
1343 NvmeCtrl *n = (NvmeCtrl *)opaque;
71a86dde 1344 stn_le_p(&n->cmbuf[addr], size, data);
a896f7f2
SB
1345}
1346
1347static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
1348{
a896f7f2 1349 NvmeCtrl *n = (NvmeCtrl *)opaque;
71a86dde 1350 return ldn_le_p(&n->cmbuf[addr], size);
a896f7f2
SB
1351}
1352
1353static const MemoryRegionOps nvme_cmb_ops = {
1354 .read = nvme_cmb_read,
1355 .write = nvme_cmb_write,
1356 .endianness = DEVICE_LITTLE_ENDIAN,
1357 .impl = {
87ad860c 1358 .min_access_size = 1,
a896f7f2
SB
1359 .max_access_size = 8,
1360 },
1361};
1362
54000c66 1363static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
f3c507ad 1364{
54000c66 1365 NvmeParams *params = &n->params;
f3c507ad 1366
54000c66 1367 if (params->num_queues) {
dce22c86
KJ
1368 warn_report("num_queues is deprecated; please use max_ioqpairs "
1369 "instead");
1370
54000c66 1371 params->max_ioqpairs = params->num_queues - 1;
dce22c86
KJ
1372 }
1373
54000c66 1374 if (params->max_ioqpairs < 1 ||
6a25a4b4 1375 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
dce22c86 1376 error_setg(errp, "max_ioqpairs must be between 1 and %d",
6a25a4b4
KJ
1377 NVME_MAX_IOQPAIRS);
1378 return;
1379 }
1380
1381 if (params->msix_qsize < 1 ||
1382 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
1383 error_setg(errp, "msix_qsize must be between 1 and %d",
1384 PCI_MSIX_FLAGS_QSIZE + 1);
2410e133
LQ
1385 return;
1386 }
1387
4be74634 1388 if (!n->conf.blk) {
e01d6a41
MZ
1389 error_setg(errp, "drive property not set");
1390 return;
f3c507ad
KB
1391 }
1392
54000c66 1393 if (!params->serial) {
e01d6a41
MZ
1394 error_setg(errp, "serial property not set");
1395 return;
f3c507ad 1396 }
6cf94132 1397
1065abfb 1398 if (!n->params.cmb_size_mb && n->pmrdev) {
6cf94132 1399 if (host_memory_backend_is_mapped(n->pmrdev)) {
7a309cc9
MA
1400 error_setg(errp, "can't use already busy memdev: %s",
1401 object_get_canonical_path_component(OBJECT(n->pmrdev)));
6cf94132
AJ
1402 return;
1403 }
1404
1405 if (!is_power_of_2(n->pmrdev->size)) {
1406 error_setg(errp, "pmr backend size needs to be power of 2 in size");
1407 return;
1408 }
1409
1410 host_memory_backend_set_mapped(n->pmrdev, true);
1411 }
54000c66
KJ
1412}
1413
a17f5018
KJ
1414static void nvme_init_state(NvmeCtrl *n)
1415{
1416 n->num_namespaces = 1;
1417 /* add one to max_ioqpairs to account for the admin queue pair */
1418 n->reg_size = pow2ceil(NVME_REG_SIZE +
1419 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
1420 n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
1421 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
1422 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
1423}
1424
90f45115
KJ
1425static void nvme_init_blk(NvmeCtrl *n, Error **errp)
1426{
c56ee92f
RK
1427 if (!blkconf_blocksizes(&n->conf, errp)) {
1428 return;
1429 }
90f45115
KJ
1430 blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
1431 false, errp);
1432}
1433
d634d742
KJ
1434static void nvme_init_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
1435{
1436 int64_t bs_size;
1437 NvmeIdNs *id_ns = &ns->id_ns;
1438
1439 bs_size = blk_getlength(n->conf.blk);
1440 if (bs_size < 0) {
1441 error_setg_errno(errp, -bs_size, "could not get backing file size");
1442 return;
1443 }
1444
1445 n->ns_size = bs_size;
1446
1447 id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
1448 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(n, ns));
1449
1450 /* no thin provisioning */
1451 id_ns->ncap = id_ns->nsze;
1452 id_ns->nuse = id_ns->ncap;
1453}
1454
51ec094d
KJ
1455static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
1456{
1457 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
1458 NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
1459
1460 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
1461 NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
1462 NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
1463 NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
1464 NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
1465 NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
1466 NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
1467
1468 n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
1469 memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
1470 "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
1471 pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
1472 PCI_BASE_ADDRESS_SPACE_MEMORY |
1473 PCI_BASE_ADDRESS_MEM_TYPE_64 |
1474 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
1475}
1476
37712e00
KJ
1477static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
1478{
1479 /* Controller Capabilities register */
1480 NVME_CAP_SET_PMRS(n->bar.cap, 1);
1481
1482 /* PMR Capabities register */
1483 n->bar.pmrcap = 0;
1484 NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
1485 NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
1486 NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
1487 NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
1488 /* Turn on bit 1 support */
1489 NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
1490 NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
1491 NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
1492
1493 /* PMR Control register */
1494 n->bar.pmrctl = 0;
1495 NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
1496
1497 /* PMR Status register */
1498 n->bar.pmrsts = 0;
1499 NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
1500 NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
1501 NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
1502 NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
1503
1504 /* PMR Elasticity Buffer Size register */
1505 n->bar.pmrebs = 0;
1506 NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
1507 NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
1508 NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
1509
1510 /* PMR Sustained Write Throughput register */
1511 n->bar.pmrswtp = 0;
1512 NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
1513 NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
1514
1515 /* PMR Memory Space Control register */
1516 n->bar.pmrmsc = 0;
1517 NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
1518 NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
1519
1520 pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
1521 PCI_BASE_ADDRESS_SPACE_MEMORY |
1522 PCI_BASE_ADDRESS_MEM_TYPE_64 |
1523 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
1524}
1525
1c0c2163 1526static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
c3f5526d
KJ
1527{
1528 uint8_t *pci_conf = pci_dev->config;
1529
1530 pci_conf[PCI_INTERRUPT_PIN] = 1;
1531 pci_config_set_prog_interface(pci_conf, 0x2);
1532 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
1533 pcie_endpoint_cap_init(pci_dev, 0x80);
1534
1535 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
1536 n->reg_size);
1537 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
1538 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
1c0c2163
KJ
1539 if (msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp)) {
1540 return;
1541 }
0c35ad46
KJ
1542
1543 if (n->params.cmb_size_mb) {
1544 nvme_init_cmb(n, pci_dev);
1545 } else if (n->pmrdev) {
1546 nvme_init_pmr(n, pci_dev);
1547 }
c3f5526d
KJ
1548}
1549
945cb8f4 1550static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
54000c66 1551{
54000c66 1552 NvmeIdCtrl *id = &n->id_ctrl;
945cb8f4 1553 uint8_t *pci_conf = pci_dev->config;
f3c507ad
KB
1554
1555 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
1556 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
1557 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
1558 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
1065abfb 1559 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
f3c507ad
KB
1560 id->rab = 6;
1561 id->ieee[0] = 0x00;
1562 id->ieee[1] = 0x02;
1563 id->ieee[2] = 0xb3;
1564 id->oacs = cpu_to_le16(0);
1565 id->frmw = 7 << 1;
1566 id->lpa = 1 << 0;
1567 id->sqes = (0x6 << 4) | 0x6;
1568 id->cqes = (0x4 << 4) | 0x4;
1569 id->nn = cpu_to_le32(n->num_namespaces);
3036a626 1570 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
f3c507ad
KB
1571 id->psd[0].mp = cpu_to_le16(0x9c4);
1572 id->psd[0].enlat = cpu_to_le32(0x10);
1573 id->psd[0].exlat = cpu_to_le32(0x4);
30349fd0
CH
1574 if (blk_enable_write_cache(n->conf.blk)) {
1575 id->vwc = 1;
1576 }
f3c507ad
KB
1577
1578 n->bar.cap = 0;
1579 NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
1580 NVME_CAP_SET_CQR(n->bar.cap, 1);
f3c507ad
KB
1581 NVME_CAP_SET_TO(n->bar.cap, 0xf);
1582 NVME_CAP_SET_CSS(n->bar.cap, 1);
be0677a9 1583 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
f3c507ad 1584
a896f7f2 1585 n->bar.vs = 0x00010200;
f3c507ad 1586 n->bar.intmc = n->bar.intms = 0;
945cb8f4
KJ
1587}
1588
1589static void nvme_realize(PCIDevice *pci_dev, Error **errp)
1590{
1591 NvmeCtrl *n = NVME(pci_dev);
1592 Error *local_err = NULL;
1593
1594 int i;
1595
1596 nvme_check_constraints(n, &local_err);
1597 if (local_err) {
1598 error_propagate(errp, local_err);
1599 return;
1600 }
1601
1602 nvme_init_state(n);
1603 nvme_init_blk(n, &local_err);
1604 if (local_err) {
1605 error_propagate(errp, local_err);
1606 return;
1607 }
1608
1c0c2163
KJ
1609 nvme_init_pci(n, pci_dev, &local_err);
1610 if (local_err) {
1611 error_propagate(errp, local_err);
1612 return;
1613 }
1614
945cb8f4 1615 nvme_init_ctrl(n, pci_dev);
f3c507ad
KB
1616
1617 for (i = 0; i < n->num_namespaces; i++) {
d634d742
KJ
1618 nvme_init_namespace(n, &n->namespaces[i], &local_err);
1619 if (local_err) {
1620 error_propagate(errp, local_err);
1621 return;
1622 }
f3c507ad 1623 }
f3c507ad
KB
1624}
1625
1626static void nvme_exit(PCIDevice *pci_dev)
1627{
1628 NvmeCtrl *n = NVME(pci_dev);
1629
1630 nvme_clear_ctrl(n);
1631 g_free(n->namespaces);
1632 g_free(n->cq);
1633 g_free(n->sq);
a896f7f2 1634
1065abfb 1635 if (n->params.cmb_size_mb) {
a883d6a0
LQ
1636 g_free(n->cmbuf);
1637 }
6cf94132
AJ
1638
1639 if (n->pmrdev) {
1640 host_memory_backend_set_mapped(n->pmrdev, false);
1641 }
f3c507ad 1642 msix_uninit_exclusive_bar(pci_dev);
f3c507ad
KB
1643}
1644
1645static Property nvme_props[] = {
1646 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
6cf94132
AJ
1647 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
1648 HostMemoryBackend *),
1065abfb
KJ
1649 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
1650 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
dce22c86
KJ
1651 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
1652 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6a25a4b4 1653 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
f3c507ad
KB
1654 DEFINE_PROP_END_OF_LIST(),
1655};
1656
1657static const VMStateDescription nvme_vmstate = {
1658 .name = "nvme",
1659 .unmigratable = 1,
1660};
1661
1662static void nvme_class_init(ObjectClass *oc, void *data)
1663{
1664 DeviceClass *dc = DEVICE_CLASS(oc);
1665 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
1666
e01d6a41 1667 pc->realize = nvme_realize;
f3c507ad
KB
1668 pc->exit = nvme_exit;
1669 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
1670 pc->vendor_id = PCI_VENDOR_ID_INTEL;
1671 pc->device_id = 0x5845;
47989f14 1672 pc->revision = 2;
f3c507ad 1673
125ee0ed 1674 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
f3c507ad 1675 dc->desc = "Non-Volatile Memory Express";
4f67d30b 1676 device_class_set_props(dc, nvme_props);
f3c507ad
KB
1677 dc->vmsd = &nvme_vmstate;
1678}
1679
a907ec52 1680static void nvme_instance_init(Object *obj)
33739c71
GA
1681{
1682 NvmeCtrl *s = NVME(obj);
33739c71 1683
a907ec52
LE
1684 device_add_bootindex_property(obj, &s->conf.bootindex,
1685 "bootindex", "/namespace@1,0",
40c2281c 1686 DEVICE(obj));
33739c71
GA
1687}
1688
f3c507ad 1689static const TypeInfo nvme_info = {
08db59e1 1690 .name = TYPE_NVME,
f3c507ad
KB
1691 .parent = TYPE_PCI_DEVICE,
1692 .instance_size = sizeof(NvmeCtrl),
1693 .class_init = nvme_class_init,
33739c71 1694 .instance_init = nvme_instance_init,
71d78767
EH
1695 .interfaces = (InterfaceInfo[]) {
1696 { INTERFACE_PCIE_DEVICE },
1697 { }
1698 },
f3c507ad
KB
1699};
1700
1701static void nvme_register_types(void)
1702{
1703 type_register_static(&nvme_info);
1704}
1705
1706type_init(nvme_register_types)