[thirdparty/qemu.git] / block / linux-aio.c

/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qapi/error.h"

/* Only used for assertions.  */
#include "qemu/coroutine_int.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 *      tunable by the guest.  If we get more outstanding requests at a time
 *      than this we will get EAGAIN from io_submit which is communicated to
 *      the guest as an I/O error.
 *
 * This value is passed to io_setup() in laio_init() and is also the cap that
 * ioq_submit() places on the number of requests in flight.
 *
 * NOTE(review): the text above looks stale.  The state is looked up per
 * AioContext (aio_get_linux_aio()), not per device, and ioq_submit() keeps
 * excess requests queued (and stops on -EAGAIN) instead of failing them --
 * confirm and reword.
 */
#define MAX_EVENTS 1024

/* Maximum number of requests in a batch. (default value) */
#define DEFAULT_MAX_BATCH 32

/*
 * State of a single request.  laio_co_submit() keeps one of these on the
 * stack of the submitting coroutine for the lifetime of the request.
 */
struct qemu_laiocb {
    /* Coroutine that issued the request; woken when the request completes. */
    Coroutine *co;
    /* Linux AIO state the request was queued on. */
    LinuxAioState *ctx;
    /* Control block handed to io_submit(); completion events point at it. */
    struct iocb iocb;
    /* -EINPROGRESS until the request completes, then the final result. */
    ssize_t ret;
    /* Number of bytes requested (qiov->size at submission time). */
    size_t nbytes;
    /* I/O vector describing the data buffers. */
    QEMUIOVector *qiov;
    /* True for QEMU_AIO_READ requests; selects zero padding on short reads. */
    bool is_read;
    /* Link in LaioQueue.pending while waiting to be submitted. */
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

/* Submission queue bookkeeping embedded in LinuxAioState. */
typedef struct {
    /* Nesting depth of laio_io_plug() calls not yet undone by unplug. */
    int plugged;
    /* Number of requests sitting in @pending, not yet given to the kernel. */
    unsigned int in_queue;
    /* Number of requests accepted by io_submit() and not yet completed. */
    unsigned int in_flight;
    /*
     * Set by ioq_submit() when requests remain queued after a submission
     * attempt; while set, laio_do_submit() only queues and does not submit.
     */
    bool blocked;
    /* Requests waiting to be submitted, in arrival order. */
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

/* Linux AIO state: one kernel AIO context plus its submission queue. */
struct LinuxAioState {
    /* AioContext this state is attached to; NULL while detached. */
    AioContext *aio_context;

    /* Kernel AIO context created by io_setup(). */
    io_context_t ctx;
    /* Notifier whose fd is attached to every iocb via io_set_eventfd(). */
    EventNotifier e;

    /* No locking required, only accessed from AioContext home thread */
    LaioQueue io_q;
    /* Bottom half that re-runs completion processing in nested event loops. */
    QEMUBH *completion_bh;
    /*
     * Progress through the batch of events currently being processed by
     * qemu_laio_process_completions(): index of the next event and number of
     * events in the batch.  Kept here (not on the stack) so that a nested
     * invocation continues where the outer one stopped.
     */
    int event_idx;
    int event_max;
};

/*
 * Forward declaration: the completion path
 * (qemu_laio_process_completions_and_submit()) calls ioq_submit() before its
 * definition further down in this file.
 */
static void ioq_submit(LinuxAioState *s);

/*
 * Combine the two result words of a completion event into one signed value:
 * res2 is shifted into bits 32 and above and res is OR-ed in unchanged.
 */
static inline ssize_t io_event_ret(struct io_event *ev)
{
    uint64_t upper = (uint64_t)ev->res2 << 32;

    return (ssize_t)(upper | ev->res);
}

/*
 * Completes an AIO request.
 *
 * Translates the raw result stored in laiocb->ret into the value seen by the
 * submitting coroutine, then wakes that coroutine unless it is currently
 * running:
 *   - negative values (errors, including -ECANCELED) are passed through;
 *   - a transfer of exactly laiocb->nbytes bytes becomes 0;
 *   - a short read has the unread tail of the buffer zeroed and keeps the
 *     (positive) byte count as its result;
 *   - a short write becomes -ENOSPC.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    /*
     * Same type as laiocb->ret, so the result is never narrowed before it is
     * compared against nbytes or used as an offset into the buffer.
     */
    ssize_t ret = laiocb->ret;

    if (ret >= 0) {
        if ((size_t)ret == laiocb->nbytes) {
            ret = 0;
        } else if (laiocb->is_read) {
            /* Short reads mean EOF, pad with zeros. */
            qemu_iovec_memset(laiocb->qiov, ret, 0,
                              laiocb->qiov->size - ret);
        } else {
            ret = -ENOSPC;
        }
    }

    laiocb->ret = ret;

    /*
     * If the coroutine is already entered it must be in ioq_submit() and
     * will notice laiocb->ret has been filled in when it eventually runs
     * later.  Coroutines cannot be entered recursively so avoid doing
     * that!
     */
    assert(laiocb->co->ctx == laiocb->ctx->aio_context);
    if (!qemu_coroutine_entered(laiocb->co)) {
        aio_co_wake(laiocb->co);
    }
}

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; a common header does not exist,
 * but AIO has existed for ages so we assume the ABI is stable.
 *
 * The field order and types must therefore not be changed.
 */
struct aio_ring {
    unsigned    id;    /* kernel internal index number */
    unsigned    nr;    /* number of io_events */
    unsigned    head;  /* Written to by userland or by kernel. */
    unsigned    tail;

    unsigned    magic;
    unsigned    compat_features;
    unsigned    incompat_features;
    unsigned    header_length;  /* size of aio_ring */

    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer on events array, output value
 *
 * Returns the number of completed events and sets a pointer
 * on events array.  This function does not update the internal
 * ring buffer, only reads head and tail.  When @events has been
 * processed io_getevents_commit() must be called.
 *
 * Only events that are contiguous in the ring are reported: if tail has
 * wrapped around behind head, the count stops at the end of the ring and
 * the remaining events are seen by a later peek, after head has been
 * advanced.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    /* The context handle is reinterpreted as the shared ring header. */
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    /* tail < head means tail wrapped: count up to the end of the ring. */
    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of ring->io_events[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: number of events that have been consumed
 *
 * Moves the ring's head forward by @nr entries, wrapping around at the ring
 * size.  The ring is left untouched when @nr is zero.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;

    if (nr == 0) {
        return;
    }

    ring->head = (ring->head + nr) % ring->nr;
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: output, set to the first event that is still unconsumed
 * @nr: number of events consumed since the previous peek
 *
 * Commits @nr consumed events and then peeks again.  Returns the number of
 * events now available at *@events.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    unsigned int available;

    io_getevents_commit(ctx, nr);
    available = io_getevents_peek(ctx, events);

    return available;
}

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
 * the indices are kept in LinuxAioState.  The function schedules the
 * completion BH so that it can be called again in a nested event loop.  When
 * there are no events left to complete, the BH is canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    /*
     * Each pass first commits the s->event_idx events consumed so far (by
     * this or by a nested invocation) and then peeks at the next contiguous
     * batch; the loop ends when the ring reports no more events.
     */
    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            /* Map the kernel's iocb pointer back to our request structure. */
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /*
             * Change counters one-by-one because we can be nested: they are
             * updated before the completion is processed so that a nested
             * invocation starts from a consistent state.
             */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero, upper level will then jump out of its
     * own `for` loop.  If we are the last all counters dropped to zero. */
    s->event_max = 0;
    s->event_idx = 0;
}

/*
 * Handle all available completions, then push out queued requests unless the
 * queue is plugged or there is nothing waiting.
 */
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    qemu_laio_process_completions(s);

    if (s->io_q.plugged || QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        return;
    }

    ioq_submit(s);
}

static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

/*
 * Event notifier read handler: runs completion processing only if the
 * notifier was actually set (it is cleared as part of the test).
 */
static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (!event_notifier_test_and_clear(&s->e)) {
        return;
    }

    qemu_laio_process_completions_and_submit(s);
}

static bool qemu_laio_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);
    struct io_event *events;

    return io_getevents_peek(s->ctx, &events);
}

/*
 * Poll-ready handler: processes completions and submits queued requests for
 * the state that embeds the given notifier.
 */
static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    LinuxAioState *s = container_of(opaque, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(s);
}

/* Reset a submission queue: no requests, not plugged, not blocked. */
static void ioq_init(LaioQueue *io_q)
{
    io_q->plugged = 0;
    io_q->blocked = false;
    io_q->in_flight = 0;
    io_q->in_queue = 0;
    QSIMPLEQ_INIT(&io_q->pending);
}

/*
 * Hand queued requests to the kernel.
 *
 * Requests are taken from the head of s->io_q.pending in batches, sized so
 * that in_flight plus the batch does not exceed MAX_EVENTS, and passed to
 * io_submit().  Accepted requests move from in_queue to in_flight accounting
 * and are unlinked from the pending list.  Afterwards io_q.blocked records
 * whether anything is still queued, and completions are processed right away
 * if any request is in flight.
 */
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    /* Receives the submitted prefix split off the pending list; not read. */
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        /* No room for more requests until something completes. */
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        /* Collect a batch from the head of the pending list. */
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            /* Leave everything queued; blocked is set after the loop. */
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            /*
             * NOTE(review): `continue` in a do/while jumps to the loop
             * condition.  ret is negative here, so `ret == len` is false and
             * the loop ends: the remaining requests are not resubmitted in
             * this call but stay queued (with blocked set below) until a
             * later submission attempt.  Confirm this is intended.
             */
            continue;
        }

        /* The first @ret requests of the batch were accepted by the kernel. */
        s->io_q.in_flight += ret;
        s->io_q.in_queue  -= ret;
        /* Unlink the accepted prefix, up to and including the last one. */
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
        /* Go again only if the whole batch was accepted and more is queued. */
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    /* While blocked, laio_do_submit() only queues new requests. */
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something just right away if there are
         * still requests in-flight. */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0).  We do not attempt
         * to repeat submission to avoid IO hang.  The reason is simple: s->e
         * is still set and completion callback will be called shortly and all
         * pending requests will be submitted from there.
         */
    }
}

/*
 * Compute the batch size at which queued requests should be submitted.
 *
 * Starts from the AioContext's aio_max_batch (DEFAULT_MAX_BATCH when that is
 * zero) and lowers it to @dev_max_batch and to the number of free event
 * slots, ignoring whichever of those limits is zero.
 */
static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t max_batch = s->aio_context->aio_max_batch;
    unsigned int free_slots = MAX_EVENTS - s->io_q.in_flight;

    if (!max_batch) {
        max_batch = DEFAULT_MAX_BATCH;
    }

    /*
     * Several block devices may share one AIO context; a latency-sensitive
     * device can ask for a smaller batch through `dev_max_batch`.
     */
    max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);

    /* Never batch more requests than there are event slots available. */
    max_batch = MIN_NON_ZERO(free_slots, max_batch);

    return max_batch;
}

/*
 * Plug the Linux AIO queue of the current AioContext.  Each call increases
 * the plug depth by one; laio_io_unplug() decreases it again.
 */
void laio_io_plug(void)
{
    LinuxAioState *s = aio_get_linux_aio(qemu_get_current_aio_context());

    s->io_q.plugged++;
}

/*
 * Undo one laio_io_plug() on the current AioContext's queue and submit the
 * queued requests when either the batch limit for @dev_max_batch has been
 * reached, or the queue is fully unplugged, not blocked and non-empty.
 */
void laio_io_unplug(uint64_t dev_max_batch)
{
    LinuxAioState *s = aio_get_linux_aio(qemu_get_current_aio_context());
    bool batch_reached;
    bool ready_to_flush;

    assert(s->io_q.plugged);
    s->io_q.plugged--;

    /*
     * The batch limit is re-checked here because a different BDS may have
     * queued requests under a larger dev_max_batch, so in_queue can already
     * be above the limit that applies to this device.
     */
    batch_reached = s->io_q.in_queue >= laio_max_batch(s, dev_max_batch);
    ready_to_flush = !s->io_q.plugged && !s->io_q.blocked &&
                     !QSIMPLEQ_EMPTY(&s->io_q.pending);

    if (batch_reached || ready_to_flush) {
        ioq_submit(s);
    }
}

/*
 * Prepare the iocb of @laiocb for the given request @type, queue it, and
 * submit the queue if allowed.
 *
 * Returns 0 once the request has been queued, or -EIO (after printing a
 * message to stderr) if @type is not a supported operation; in that case
 * nothing is queued.
 */
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *cb = &laiocb->iocb;
    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
    case QEMU_AIO_ZONE_APPEND:
        /* Both are prepared as a vectored write at @offset. */
        io_prep_pwritev(cb, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(cb, fd, qiov->iov, qiov->niov, offset);
        break;
    /* Currently Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
        return -EIO;
    }
    /* Completion of this iocb is signalled through the state's notifier. */
    io_set_eventfd(cb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;

    /* A blocked queue only accumulates requests. */
    if (s->io_q.blocked) {
        return 0;
    }

    /* Submit right away unless plugged; when plugged, wait for a full batch. */
    if (!s->io_q.plugged ||
        s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
        ioq_submit(s);
    }

    return 0;
}

/*
 * Issue one request of the given @type on @fd from coroutine context and
 * wait for its result.
 *
 * The request state lives on this coroutine's stack.  If the request has not
 * completed by the time submission returns, the coroutine yields until it is
 * woken by the completion path.  Returns the error from laio_do_submit() if
 * the request could not be queued, otherwise the request's final result.
 */
int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
                                int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = aio_get_linux_aio(qemu_get_current_aio_context());
    struct qemu_laiocb laiocb = {
        .ctx        = s,
        .co         = qemu_coroutine_self(),
        .qiov       = qiov,
        .nbytes     = qiov->size,
        .is_read    = (type == QEMU_AIO_READ),
        .ret        = -EINPROGRESS,
    };
    int ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);

    if (ret < 0) {
        return ret;
    }

    /* Still -EINPROGRESS means the completion has not been processed yet. */
    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    return laiocb.ret;
}

/*
 * Detach @s from @old_context: unregister the event notifier handlers,
 * delete the completion bottom half and clear the AioContext pointer.
 */
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    /* Passing NULL handlers removes the notifier from the context. */
    aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}

/*
 * Attach @s to @new_context: record the context, create the completion
 * bottom half in it and register the event notifier with its read, poll and
 * poll-ready handlers.
 */
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    /* The BH is created before the handlers that schedule it are registered. */
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}

/*
 * Allocate and set up a Linux AIO state: event notifier, kernel AIO context
 * sized for MAX_EVENTS, and an empty submission queue.
 *
 * Returns the new state, or NULL with @errp set if either the notifier or
 * the kernel context could not be created.  Anything acquired before the
 * failure is released again.
 */
LinuxAioState *laio_init(Error **errp)
{
    LinuxAioState *s = g_malloc0(sizeof(*s));
    int rc;

    rc = event_notifier_init(&s->e, false);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        goto fail_free;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        goto fail_notifier;
    }

    ioq_init(&s->io_q);

    return s;

fail_notifier:
    event_notifier_cleanup(&s->e);
fail_free:
    g_free(s);
    return NULL;
}

/*
 * Release a state created by laio_init(): clean up the event notifier,
 * destroy the kernel AIO context and free @s.
 *
 * A failure of io_destroy() is reported on stderr; @s is freed regardless.
 */
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        /* %p expects a void * argument, so convert the pointer explicitly. */
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                        __func__, (void *)&s->ctx);
    }
    g_free(s);
}
Commit	Line	Data
5c6c3a6c CH	1	/*
	2	* Linux native AIO support.
	3	*
	4	* Copyright (C) 2009 IBM, Corp.
	5	* Copyright (C) 2009 Red Hat, Inc.
	6	*
	7	* This work is licensed under the terms of the GNU GPL, version 2 or later.
	8	* See the COPYING file in the top-level directory.
	9	*/
80c71a24	10	#include "qemu/osdep.h"
737e150e	11	#include "block/aio.h"
1de7afc9	12	#include "qemu/queue.h"
2174f12b	13	#include "block/block.h"
9f8540ec	14	#include "block/raw-aio.h"
1de7afc9	15	#include "qemu/event_notifier.h"
2174f12b	16	#include "qemu/coroutine.h"
ed6e2161	17	#include "qapi/error.h"
5c6c3a6c	18
ab50533b EGE	19	/* Only used for assertions. */
	20	#include "qemu/coroutine_int.h"
	21
5c6c3a6c CH	22	#include <libaio.h>
	23
	24	/*
	25	* Queue size (per-device).
	26	*
	27	* XXX: eventually we need to communicate this to the guest and/or make it
	28	* tunable by the guest. If we get more outstanding requests at a time
	29	* than this we will get EAGAIN from io_submit which is communicated to
	30	* the guest as an I/O error.
	31	*/
2558cb8d	32	#define MAX_EVENTS 1024
5c6c3a6c	33
d7ddd0a1 SG	34	/* Maximum number of requests in a batch. (default value) */
	35	#define DEFAULT_MAX_BATCH 32
	36
5c6c3a6c	37	struct qemu_laiocb {
2174f12b	38	Coroutine *co;
dd7f7ed1	39	LinuxAioState *ctx;
5c6c3a6c CH	40	struct iocb iocb;
	41	ssize_t ret;
	42	size_t nbytes;
b161e2e4 KW	43	QEMUIOVector *qiov;
b161e2e4 KW	44	bool is_read;
28b24087	45	QSIMPLEQ_ENTRY(qemu_laiocb) next;
5c6c3a6c CH	46	};
5c6c3a6c CH	47
1b3abdcc	48	typedef struct {
1b3abdcc	49	int plugged;
5e1b34a3 RP	50	unsigned int in_queue;
5e1b34a3 RP	51	unsigned int in_flight;
43f2376e	52	bool blocked;
28b24087	53	QSIMPLEQ_HEAD(, qemu_laiocb) pending;
1b3abdcc ML	54	} LaioQueue;
1b3abdcc ML	55
dd7f7ed1	56	struct LinuxAioState {
0187f5c9 PB	57	AioContext *aio_context;
0187f5c9 PB	58
5c6c3a6c	59	io_context_t ctx;
c90caf25	60	EventNotifier e;
1b3abdcc	61
ab50533b	62	/* No locking required, only accessed from AioContext home thread */
1b3abdcc	63	LaioQueue io_q;
2cdff7f6	64	QEMUBH *completion_bh;
2cdff7f6 SH	65	int event_idx;
2cdff7f6 SH	66	int event_max;
5c6c3a6c CH	67	};
5c6c3a6c CH	68
dd7f7ed1	69	static void ioq_submit(LinuxAioState *s);
28b24087	70
5c6c3a6c CH	71	static inline ssize_t io_event_ret(struct io_event *ev)
	72	{
	73	return (ssize_t)(((uint64_t)ev->res2 << 32) \| ev->res);
	74	}
	75
db0ffc24	76	/*
2b02fd81	77	* Completes an AIO request.
db0ffc24	78	*/
dd7f7ed1	79	static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
db0ffc24 KW	80	{
	81	int ret;
	82
db0ffc24 KW	83	ret = laiocb->ret;
db0ffc24 KW	84	if (ret != -ECANCELED) {
b161e2e4	85	if (ret == laiocb->nbytes) {
db0ffc24	86	ret = 0;
b161e2e4 KW	87	} else if (ret >= 0) {
	88	/* Short reads mean EOF, pad with zeros. */
	89	if (laiocb->is_read) {
3d9b4925 MT	90	qemu_iovec_memset(laiocb->qiov, ret, 0,
3d9b4925 MT	91	laiocb->qiov->size - ret);
b161e2e4	92	} else {
1c42f149	93	ret = -ENOSPC;
b161e2e4 KW	94	}
b161e2e4 KW	95	}
db0ffc24 KW	96	}
db0ffc24 KW	97
2174f12b	98	laiocb->ret = ret;
2b02fd81 JS	99
	100	/*
	101	* If the coroutine is already entered it must be in ioq_submit() and
	102	* will notice laio->ret has been filled in when it eventually runs
	103	* later. Coroutines cannot be entered recursively so avoid doing
	104	* that!
	105	*/
ab50533b	106	assert(laiocb->co->ctx == laiocb->ctx->aio_context);
2b02fd81 JS	107	if (!qemu_coroutine_entered(laiocb->co)) {
2b02fd81 JS	108	aio_co_wake(laiocb->co);
2174f12b	109	}
db0ffc24 KW	110	}
db0ffc24 KW	111
9e909a58 RP	112	/**
	113	* aio_ring buffer which is shared between userspace and kernel.
	114	*
	115	* This copied from linux/fs/aio.c, common header does not exist
	116	* but AIO exists for ages so we assume ABI is stable.
	117	*/
	118	struct aio_ring {
	119	unsigned id; /* kernel internal index number */
	120	unsigned nr; /* number of io_events */
	121	unsigned head; /* Written to by userland or by kernel. */
	122	unsigned tail;
	123
	124	unsigned magic;
	125	unsigned compat_features;
	126	unsigned incompat_features;
	127	unsigned header_length; /* size of aio_ring */
	128
f7795e40	129	struct io_event io_events[];
9e909a58 RP	130	};
	131
	132	/**
	133	* io_getevents_peek:
	134	* @ctx: AIO context
	135	* @events: pointer on events array, output value
	136
	137	* Returns the number of completed events and sets a pointer
	138	* on events array. This function does not update the internal
	139	* ring buffer, only reads head and tail. When @events has been
	140	* processed io_getevents_commit() must be called.
	141	*/
	142	static inline unsigned int io_getevents_peek(io_context_t ctx,
	143	struct io_event **events)
	144	{
	145	struct aio_ring ring = (struct aio_ring )ctx;
	146	unsigned int head = ring->head, tail = ring->tail;
	147	unsigned int nr;
	148
	149	nr = tail >= head ? tail - head : ring->nr - head;
	150	*events = ring->io_events + head;
	151	/* To avoid speculative loads of s->events[i] before observing tail.
	152	Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
	153	smp_rmb();
	154
	155	return nr;
	156	}
	157
	158	/**
	159	* io_getevents_commit:
	160	* @ctx: AIO context
	161	* @nr: the number of events on which head should be advanced
	162	*
	163	* Advances head of a ring buffer.
	164	*/
	165	static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
	166	{
	167	struct aio_ring ring = (struct aio_ring )ctx;
	168
	169	if (nr) {
	170	ring->head = (ring->head + nr) % ring->nr;
	171	}
	172	}
	173
	174	/**
	175	* io_getevents_advance_and_peek:
	176	* @ctx: AIO context
	177	* @events: pointer on events array, output value
	178	* @nr: the number of events on which head should be advanced
	179	*
	180	* Advances head of a ring buffer and returns number of elements left.
	181	*/
	182	static inline unsigned int
	183	io_getevents_advance_and_peek(io_context_t ctx,
	184	struct io_event **events,
	185	unsigned int nr)
	186	{
	187	io_getevents_commit(ctx, nr);
	188	return io_getevents_peek(ctx, events);
	189	}
	190
3407de57 RP	191	/**
	192	* qemu_laio_process_completions:
	193	* @s: AIO state
	194	*
	195	* Fetches completed I/O requests and invokes their callbacks.
2cdff7f6 SH	196	*
	197	* The function is somewhat tricky because it supports nested event loops, for
	198	* example when a request callback invokes aio_poll(). In order to do this,
3407de57 RP	199	* indices are kept in LinuxAioState. Function schedules BH completion so it
	200	* can be called again in a nested event loop. When there are no events left
	201	* to complete the BH is being canceled.
2cdff7f6	202	*/
3407de57	203	static void qemu_laio_process_completions(LinuxAioState *s)
5c6c3a6c	204	{
9e909a58	205	struct io_event *events;
5c6c3a6c	206
2cdff7f6 SH	207	/* Reschedule so nested event loops see currently pending completions */
2cdff7f6 SH	208	qemu_bh_schedule(s->completion_bh);
5c6c3a6c	209
9e909a58 RP	210	while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
	211	s->event_idx))) {
	212	for (s->event_idx = 0; s->event_idx < s->event_max; ) {
	213	struct iocb *iocb = events[s->event_idx].obj;
	214	struct qemu_laiocb *laiocb =
2cdff7f6 SH	215	container_of(iocb, struct qemu_laiocb, iocb);
2cdff7f6 SH	216
9e909a58	217	laiocb->ret = io_event_ret(&events[s->event_idx]);
2cdff7f6	218
9e909a58 RP	219	/* Change counters one-by-one because we can be nested. */
	220	s->io_q.in_flight--;
	221	s->event_idx++;
	222	qemu_laio_process_completion(laiocb);
	223	}
2cdff7f6	224	}
28b24087	225
9e909a58 RP	226	qemu_bh_cancel(s->completion_bh);
	227
	228	/* If we are nested we have to notify the level above that we are done
	229	* by setting event_max to zero, upper level will then jump out of it's
	230	* own `for` loop. If we are the last all counters droped to zero. */
	231	s->event_max = 0;
	232	s->event_idx = 0;
3407de57	233	}
9e909a58	234
3407de57 RP	235	static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
	236	{
	237	qemu_laio_process_completions(s);
1919631e	238
28b24087 PB	239	if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
	240	ioq_submit(s);
	241	}
2cdff7f6 SH	242	}
2cdff7f6 SH	243
3407de57 RP	244	static void qemu_laio_completion_bh(void *opaque)
	245	{
	246	LinuxAioState *s = opaque;
	247
	248	qemu_laio_process_completions_and_submit(s);
	249	}
	250
2cdff7f6 SH	251	static void qemu_laio_completion_cb(EventNotifier *e)
2cdff7f6 SH	252	{
dd7f7ed1	253	LinuxAioState *s = container_of(e, LinuxAioState, e);
2cdff7f6 SH	254
2cdff7f6 SH	255	if (event_notifier_test_and_clear(&s->e)) {
3407de57	256	qemu_laio_process_completions_and_submit(s);
5c6c3a6c CH	257	}
	258	}
	259
ee686975 SH	260	static bool qemu_laio_poll_cb(void *opaque)
	261	{
	262	EventNotifier *e = opaque;
	263	LinuxAioState *s = container_of(e, LinuxAioState, e);
	264	struct io_event *events;
	265
826cc324 SH	266	return io_getevents_peek(s->ctx, &events);
	267	}
	268
	269	static void qemu_laio_poll_ready(EventNotifier *opaque)
	270	{
	271	EventNotifier *e = opaque;
	272	LinuxAioState *s = container_of(e, LinuxAioState, e);
ee686975 SH	273
ee686975 SH	274	qemu_laio_process_completions_and_submit(s);
ee686975 SH	275	}
ee686975 SH	276
1b3abdcc ML	277	static void ioq_init(LaioQueue *io_q)
1b3abdcc ML	278	{
28b24087	279	QSIMPLEQ_INIT(&io_q->pending);
1b3abdcc	280	io_q->plugged = 0;
5e1b34a3 RP	281	io_q->in_queue = 0;
5e1b34a3 RP	282	io_q->in_flight = 0;
43f2376e	283	io_q->blocked = false;
1b3abdcc ML	284	}
1b3abdcc ML	285
dd7f7ed1	286	static void ioq_submit(LinuxAioState *s)
1b3abdcc	287	{
82595da8	288	int ret, len;
28b24087	289	struct qemu_laiocb *aiocb;
5e1b34a3	290	struct iocb *iocbs[MAX_EVENTS];
82595da8	291	QSIMPLEQ_HEAD(, qemu_laiocb) completed;
1b3abdcc	292
43f2376e	293	do {
5e1b34a3 RP	294	if (s->io_q.in_flight >= MAX_EVENTS) {
	295	break;
	296	}
43f2376e PB	297	len = 0;
	298	QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
	299	iocbs[len++] = &aiocb->iocb;
5e1b34a3	300	if (s->io_q.in_flight + len >= MAX_EVENTS) {
43f2376e PB	301	break;
43f2376e PB	302	}
28b24087	303	}
1b3abdcc	304
43f2376e PB	305	ret = io_submit(s->ctx, len, iocbs);
43f2376e PB	306	if (ret == -EAGAIN) {
82595da8	307	break;
43f2376e PB	308	}
43f2376e PB	309	if (ret < 0) {
44713c9e KW	310	/* Fail the first request, retry the rest */
	311	aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
	312	QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
	313	s->io_q.in_queue--;
	314	aiocb->ret = ret;
	315	qemu_laio_process_completion(aiocb);
	316	continue;
43f2376e PB	317	}
43f2376e PB	318
5e1b34a3 RP	319	s->io_q.in_flight += ret;
5e1b34a3 RP	320	s->io_q.in_queue -= ret;
82595da8 PB	321	aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
82595da8 PB	322	QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
43f2376e	323	} while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
5e1b34a3	324	s->io_q.blocked = (s->io_q.in_queue > 0);
0ed93d84 RP	325
	326	if (s->io_q.in_flight) {
	327	/* We can try to complete something just right away if there are
	328	* still requests in-flight. */
	329	qemu_laio_process_completions(s);
	330	/*
	331	* Even we have completed everything (in_flight == 0), the queue can
	332	* have still pended requests (in_queue > 0). We do not attempt to
	333	* repeat submission to avoid IO hang. The reason is simple: s->e is
	334	* still set and completion callback will be called shortly and all
	335	* pended requests will be submitted from there.
	336	*/
	337	}
1b3abdcc ML	338	}
1b3abdcc ML	339
512da211 SG	340	static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
	341	{
	342	uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
	343
	344	/*
	345	* AIO context can be shared between multiple block devices, so
	346	* `dev_max_batch` allows reducing the batch size for latency-sensitive
	347	* devices.
	348	*/
	349	max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);
	350
	351	/* limit the batch with the number of available events */
	352	max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
	353
	354	return max_batch;
	355	}
	356
ab50533b	357	void laio_io_plug(void)
1b3abdcc	358	{
ab50533b EGE	359	AioContext *ctx = qemu_get_current_aio_context();
	360	LinuxAioState *s = aio_get_linux_aio(ctx);
	361
0187f5c9	362	s->io_q.plugged++;
1b3abdcc ML	363	}
1b3abdcc ML	364
ab50533b	365	void laio_io_unplug(uint64_t dev_max_batch)
1b3abdcc	366	{
ab50533b EGE	367	AioContext *ctx = qemu_get_current_aio_context();
	368	LinuxAioState *s = aio_get_linux_aio(ctx);
	369
6b98bd64	370	assert(s->io_q.plugged);
f387cac5 SH	371	s->io_q.plugged--;
f387cac5 SH	372
99b969fb SH	373	/*
	374	* Why max batch checking is performed here:
	375	* Another BDS may have queued requests with a higher dev_max_batch and
	376	* therefore in_queue could now exceed our dev_max_batch. Re-check the max
	377	* batch so we can honor our device's dev_max_batch.
	378	*/
68d79466	379	if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) \|\|
f387cac5	380	(!s->io_q.plugged &&
68d79466	381	!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
de354644	382	ioq_submit(s);
1b3abdcc	383	}
1b3abdcc ML	384	}
1b3abdcc ML	385
2174f12b	386	static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
512da211	387	int type, uint64_t dev_max_batch)
5c6c3a6c	388	{
2174f12b KW	389	LinuxAioState *s = laiocb->ctx;
	390	struct iocb *iocbs = &laiocb->iocb;
	391	QEMUIOVector *qiov = laiocb->qiov;
5c6c3a6c CH	392
	393	switch (type) {
	394	case QEMU_AIO_WRITE:
	395	io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
7d37435b	396	break;
4751d09a SL	397	case QEMU_AIO_ZONE_APPEND:
	398	io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
	399	break;
5c6c3a6c CH	400	case QEMU_AIO_READ:
5c6c3a6c CH	401	io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
7d37435b	402	break;
c30e624d	403	/* Currently Linux kernel does not support other operations */
5c6c3a6c CH	404	default:
	405	fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
	406	__func__, type);
2174f12b	407	return -EIO;
5c6c3a6c	408	}
c90caf25	409	io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
5c6c3a6c	410
28b24087	411	QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
5e1b34a3	412	s->io_q.in_queue++;
43f2376e	413	if (!s->io_q.blocked &&
5e1b34a3	414	(!s->io_q.plugged \|\|
512da211	415	s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
28b24087	416	ioq_submit(s);
1b3abdcc	417	}
5c6c3a6c	418
2174f12b KW	419	return 0;
	420	}
	421
ab50533b EGE	422	int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
ab50533b EGE	423	int type, uint64_t dev_max_batch)
2174f12b	424	{
2174f12b	425	int ret;
ab50533b	426	AioContext *ctx = qemu_get_current_aio_context();
2174f12b KW	427	struct qemu_laiocb laiocb = {
2174f12b KW	428	.co = qemu_coroutine_self(),
9d52aa3c	429	.nbytes = qiov->size,
ab50533b	430	.ctx = aio_get_linux_aio(ctx),
0ed93d84	431	.ret = -EINPROGRESS,
2174f12b KW	432	.is_read = (type == QEMU_AIO_READ),
	433	.qiov = qiov,
	434	};
	435
512da211	436	ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
2174f12b KW	437	if (ret < 0) {
	438	return ret;
	439	}
	440
0ed93d84 RP	441	if (laiocb.ret == -EINPROGRESS) {
	442	qemu_coroutine_yield();
	443	}
2174f12b KW	444	return laiocb.ret;
	445	}
	446
dd7f7ed1	447	void laio_detach_aio_context(LinuxAioState s, AioContext old_context)
c2f3426c	448	{
60f782b6	449	aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
2cdff7f6	450	qemu_bh_delete(s->completion_bh);
1919631e	451	s->aio_context = NULL;
c2f3426c SH	452	}
c2f3426c SH	453
dd7f7ed1	454	void laio_attach_aio_context(LinuxAioState s, AioContext new_context)
c2f3426c	455	{
0187f5c9	456	s->aio_context = new_context;
2cdff7f6	457	s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
60f782b6	458	aio_set_event_notifier(new_context, &s->e,
ee686975	459	qemu_laio_completion_cb,
826cc324 SH	460	qemu_laio_poll_cb,
826cc324 SH	461	qemu_laio_poll_ready);
c2f3426c SH	462	}
c2f3426c SH	463
ed6e2161	464	LinuxAioState laio_init(Error *errp)
5c6c3a6c	465	{
ed6e2161	466	int rc;
dd7f7ed1	467	LinuxAioState *s;
5c6c3a6c	468
7267c094	469	s = g_malloc0(sizeof(*s));
ed6e2161 NA	470	rc = event_notifier_init(&s->e, false);
ed6e2161 NA	471	if (rc < 0) {
7a21bee2	472	error_setg_errno(errp, -rc, "failed to initialize event notifier");
5c6c3a6c	473	goto out_free_state;
c90caf25	474	}
5c6c3a6c	475
ed6e2161 NA	476	rc = io_setup(MAX_EVENTS, &s->ctx);
	477	if (rc < 0) {
	478	error_setg_errno(errp, -rc, "failed to create linux AIO context");
5c6c3a6c	479	goto out_close_efd;
c90caf25	480	}
5c6c3a6c	481
1b3abdcc ML	482	ioq_init(&s->io_q);
1b3abdcc ML	483
5c6c3a6c CH	484	return s;
	485
	486	out_close_efd:
c90caf25	487	event_notifier_cleanup(&s->e);
5c6c3a6c	488	out_free_state:
7267c094	489	g_free(s);
5c6c3a6c CH	490	return NULL;
5c6c3a6c CH	491	}
abd269b7	492
dd7f7ed1	493	void laio_cleanup(LinuxAioState *s)
abd269b7	494	{
abd269b7	495	event_notifier_cleanup(&s->e);
a1abf40d GA	496
	497	if (io_destroy(s->ctx) != 0) {
	498	fprintf(stderr, "%s: destroy AIO context %p failed\n",
	499	__func__, &s->ctx);
	500	}
abd269b7 SH	501	g_free(s);
abd269b7 SH	502	}