/*-
 * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD$
 *
 * Budget Fair Queueing: mixed service/time-domain fair queueing
 * scheduling of disk access, among per-process queues.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include "gs_scheduler.h"

/*
 * Service state of a queue (stored in q_status).  The values are
 * spelled out explicitly; they match the implicit numbering.
 */
enum g_bfq_state {
	G_QUEUE_READY	= 0,	/* Ready to dispatch. */
	G_QUEUE_BUSY	= 1,	/* Waiting for a completion. */
	G_QUEUE_IDLING	= 2	/* Waiting for a new request. */
};

/* Bit flags kept in q_flags. */
enum g_bfq_flags {
	/* At least one request completed in the current budget. */
	G_FLAG_COMPLETED = 1 << 0
};

/* Why the queue under service is being expired (see g_bfq_expire()). */
enum g_bfq_reason {
	G_BUDGET_EXHAUSTED	= 0,	/* Not enough budget for next bio. */
	G_TIME_EXHAUSTED	= 1,	/* Time slice is over. */
	G_TIMED_OUT		= 2	/* Anticipation timer fired. */
};

/* Forward declaration: queues point back to their per-device softc. */
struct g_bfq_softc;

/* Head type of the red-black trees holding struct g_bfq_queue nodes. */
RB_HEAD(vtree, g_bfq_queue);

/*
 * Queue descriptor, containing scheduling state, a queue of pending
 * requests and configuration parameters.
 * A queue that is not under service is linked, through q_node, either
 * into the active tree or into the idle tree of its softc (q_tree says
 * which one), or into no tree at all (q_tree == NULL).
 */
struct g_bfq_queue {
	struct g_bfq_softc *q_sc;	/* Link to parent. */

	enum g_bfq_state q_status;
	unsigned int	q_service;	/* Service received so far. */
	int		q_slice_end;	/* Actual slice end in ticks. */
	enum g_bfq_flags q_flags;	/* Queue flags. */
	struct bio_queue_head q_bioq;	/* Pending requests. */

	/* Scheduling parameters */
	uint64_t	q_finish;	/* Finish time. */
	uint64_t	q_start;	/* Start time. */
	struct vtree	*q_tree;	/* Tree we are enqueued into. */
	RB_ENTRY(g_bfq_queue) q_node;	/* RB tree node. */
	uint64_t	q_minstart;	/* Min. q_start in our subtree. */
	unsigned	q_weight;	/* Weight. */
	unsigned	q_newweight;	/* Target weight after a renice. */
	uint64_t	q_budget;	/* Entitled service for next slice. */

	unsigned int	q_slice_duration; /* Slice size in ticks. */
	unsigned int	q_wait_ticks;	/* Wait time for anticipation. */

	/* Stats to drive the various heuristics. */
	struct g_savg	q_thinktime;	/* Thinktime average. */
	struct g_savg	q_seekdist;	/* Seek distance average. */

	/* Decaying request counter, updated in g_bfq_start(). */
	int		q_bionum;	/* Number of requests. */

	off_t		q_lastoff;	/* Last submitted req. offset. */
	int		q_lastsub;	/* Last submitted req. time. */

	/* Expiration deadline for an empty queue. */
	int		q_expire;
};

/* List of scheduler instances. */
LIST_HEAD(g_scheds, g_bfq_softc);

/*
 * Parameters for feedback on budget (see g_bfq_expire()):
 * G_BFQ_BUDGET_STEP is the amount added to or subtracted from a
 * queue budget, G_BFQ_BUDGET_MIN bounds the decrease.
 */
#define	G_BFQ_BUDGET_STEP	0x00180000
#define	G_BFQ_BUDGET_MIN	0x00200000

/*
 * Bits for fixed point precision in timestamp calculations: service
 * is shifted left by this amount before being divided by a weight.
 */
#define G_BFQ_SERVICE_SHIFT	22

/*
 * Per device descriptor, holding the BFQ trees of the per-process queues
 * accessing the disk, a reference to the geom, and the timer.
 */
struct g_bfq_softc {
	struct g_geom	*sc_geom;

	/*
	 * sc_current is the queue under service (possibly the one we
	 * are anticipating for).
	 * It is set in g_bfq_next(), and cleared in g_bfq_enqueue()
	 * and g_bfq_go_idle(), which are reached from g_bfq_next()
	 * or from the anticipation timeout.
	 * The queue under service is never on a tree, even if it
	 * has requests queued.
	 */
	struct g_bfq_queue *sc_current;
	struct callout	sc_wait;	/* Timer for sc_current. */

	struct vtree	sc_active;	/* Active tree. */
	struct vtree	sc_idle;	/* Idle tree. */
	struct g_bfq_queue *sc_firstidle; /* First idle queue. */
	struct g_bfq_queue *sc_lastidle; /* Last idle queue. */
	uint64_t	sc_vtime;	/* Virtual time. */
	unsigned long 	sc_error;	/* Fractional part of vtime. */
	unsigned	sc_wsum;	/* Weight sum. */

	int		sc_nqueues;	/* Number of queues. */

	/* Statistics */
	int		sc_in_flight;	/* Requests in the driver. */

	LIST_ENTRY(g_bfq_softc) sc_next; /* Link in me.sc_head. */
};

/*
 * Descriptor for bounded values, min and max are constant.
 * The field order matters: the defaults in "me" are given as
 * positional { min, cur, max } initializers.  x_cur is clamped to
 * [x_min, x_max] on every read by get_bounded().
 */
struct x_bound {		
	const int	x_min;
	int		x_cur;
	const int	x_max;
};

/*
 * Parameters, config and stats, shared by all the instances of
 * the scheduler.
 */
struct g_bfq_params {
	int	queues;			/* Total number of queues. */
	int	w_anticipate;		/* Anticipate writes. */
	int	bypass;			/* Bypass scheduling writes. */

	int	units;			/* How many instances. */

	/* sc_head is used for debugging. */
	struct g_scheds	sc_head;	/* First scheduler instance. */

	struct x_bound queue_depth;	/* Max parallel requests. */
	struct x_bound wait_ms;		/* Wait time, milliseconds. */
	struct x_bound quantum_ms;	/* Quantum size, milliseconds. */
	struct x_bound quantum_kb;	/* Quantum size, Kb (1024 bytes). */

	/* statistics */
	int	wait_hit;		/* Success in anticipation. */
	int	wait_miss;		/* Failure in anticipation. */
};

/*
 * Default parameters for the scheduler.  The quantum sizes target
 * an 80MB/s disk; if the hw is faster or slower the minimum of the
 * two will have effect: the clients will still be isolated but
 * the fairness may be limited.  A complete solution would involve
 * the on-line measurement of the actual disk throughput to derive
 * these parameters.  Or we may just choose to ignore service domain
 * fairness and accept what can be achieved with time-only budgets.
 */
static struct g_bfq_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate = 1,
	.queue_depth = { .x_min = 1, .x_cur = 1, .x_max = 50 },
	.wait_ms = { .x_min = 1, .x_cur = 10, .x_max = 30 },
	.quantum_ms = { .x_min = 1, .x_cur = 100, .x_max = 500 },
	.quantum_kb = { .x_min = 16, .x_cur = 8192, .x_max = 65536 },
};

/*
 * Globally visible pointer to the shared parameters; nothing in this
 * file reads it (presumably there to ease debugging -- confirm).
 */
struct g_bfq_params *gs_bfq_me = &me;

/*
 * Sysctl knobs and statistics under kern.geom.sched.bfq.
 * Every backing variable is a plain (signed) int, so the nodes are
 * declared with SYSCTL_INT to match the storage type.  The tunable
 * values are clamped by get_bounded() when they are read.
 */
SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, bfq, CTLFLAG_RW, 0,
    "GEOM_SCHED BFQ stuff");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total BFQ queues");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_INT(_kern_geom_sched_bfq, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");

#ifdef DEBUG_QUEUES
/* print the status of a queue */
static void
gs_bfq_dump_q(struct g_bfq_queue *qp, int index)
{
	int l = 0;
	struct bio *bp;

	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
		l++;
	}
	printf("--- bfq queue %d %p status %d len %d ---\n",
	    index, qp, qp->q_status, l);
}

/*
 * Dump the scheduler status when writing to this sysctl variable:
 * a read (no new value supplied) returns right after the generic
 * integer handler; a write walks all the instances linked in
 * me.sc_head and prints their state on the console.
 *
 * NOTE(review): the inner loop uses G_RR_HASH_SIZE, sc->sc_hash and
 * q_hash, none of which is declared by the structures visible in this
 * file, so this code is unlikely to build with DEBUG_QUEUES defined.
 * It probably needs to walk sc_active and sc_idle instead -- confirm.
 */
static int
gs_bfq_sysctl_status(SYSCTL_HANDLER_ARGS)
{
        int error, val = 0;
	struct g_bfq_softc *sc;

	/* Let the generic handler deal with the user buffer. */
        error = sysctl_handle_int(oidp, &val, 0, req);
        if (error || !req->newptr )
                return (error);

        printf("called %s\n", __FUNCTION__);

	LIST_FOREACH(sc, &me.sc_head, sc_next) {
		int i, tot = 0;
		printf("--- sc %p active %p nqueues %d "
		    "callout %d in_flight %d ---\n",
		    sc, sc->sc_current, sc->sc_nqueues,
		    callout_active(&sc->sc_wait),
		    sc->sc_in_flight);
		/* tot is the running index passed to gs_bfq_dump_q(). */
		for (i = 0; i < G_RR_HASH_SIZE; i++) {
			struct g_bfq_queue *qp;
			LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) {
				gs_bfq_dump_q(qp, tot);
				tot++;
			}
		}
	}
        return (0);
}

/* Debug-only node: writing any integer to it triggers the status dump. */
SYSCTL_PROC(_kern_geom_sched_bfq, OID_AUTO, status,
	CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), gs_bfq_sysctl_status, "I", "status");

#endif	/* DEBUG_QUEUES */

/*
 * Read the current value of a bounded parameter, clamped to its
 * [x_min, x_max] range.  If t_min is non-zero the value is taken as
 * milliseconds and converted to ticks, with a lower bound of t_min
 * ticks on the result.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int val = v->x_cur;

	if (val < v->x_min) {
		val = v->x_min;
	} else if (val > v->x_max) {
		val = v->x_max;
	}

	if (t_min == 0)
		return (val);

	/* Milliseconds to ticks. */
	val = val * hz / 1000;
	return (val < t_min ? t_min : val);
}

/*
 * Look up the queue bp belongs to, delegating the lookup to the
 * generic g_sched classification code.  The caller must handle a
 * NULL return (g_bfq_start() does).
 */
static struct g_bfq_queue *
g_bfq_queue_get(struct g_bfq_softc *sc, struct bio *bp)
{
	struct g_bfq_queue *qp;

	qp = g_sched_get_class(sc->sc_geom, bp);
	return (qp);
}

static int
g_bfq_init_class(void *data, void *priv)
{
	struct g_bfq_softc *sc = data;
	struct g_bfq_queue *qp = priv;

	gs_bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;

	/*
	 * XXX should use the nice value of the issuer, or
	 * anything smarter than this...
	 */
	qp->q_weight = qp->q_newweight = 1;
	me.queues++;

	return (0);
}

/*
 * Drop a reference to qp through the generic g_sched code.
 */
static void
g_bfq_queue_put(struct g_bfq_queue *qp)
{
	struct g_geom *geom = qp->q_sc->sc_geom;

	g_sched_put_class(geom, qp);
}

/*
 * gs_fini_class hook: undo the accounting done by g_bfq_init_class().
 * The queue must have no pending requests; data is not used.
 */
static void
g_bfq_fini_class(void *data, void *priv)
{
	struct g_bfq_queue *qp;

	qp = priv;
	KASSERT(!gs_bioq_first(&qp->q_bioq), ("released nonempty queue"));

	me.queues--;
	qp->q_sc->sc_nqueues--;
}

/*
 * Wraparound-safe "a is after b" test for timestamps: the unsigned
 * difference is reinterpreted as signed, so the result stays
 * meaningful when the counters overflow.
 */
static inline int
g_bfq_gt(uint64_t a, uint64_t b)
{
	int64_t diff;

	diff = (int64_t)(a - b);
	return (diff > 0);
}

/*
 * Tree ordering: compare two queues by finish time, wraparound-safe.
 * Never returns 0, so queues with equal finish times are treated as
 * a > b rather than as duplicates.
 */
static inline int
g_bfq_cmp(struct g_bfq_queue *a, struct g_bfq_queue *b)
{

	if ((int64_t)(a->q_finish - b->q_finish) < 0)
		return (-1);
	return (1);
}

/* Instantiate the red-black tree code for vtree, keyed by g_bfq_cmp(). */
RB_GENERATE_STATIC(vtree, g_bfq_queue, q_node, g_bfq_cmp);

/*
 * Convert an amount of service into a virtual time span for a queue
 * of the given weight, using G_BFQ_SERVICE_SHIFT bits of fixed point
 * precision.  weight must not be zero.
 */
static inline uint64_t
g_bfq_delta(uint64_t service, unsigned weight)
{
	uint64_t scaled;

	scaled = service << G_BFQ_SERVICE_SHIFT;
	return (scaled / weight);
}

/*
 * Advance the virtual time of sc by service, scaled by the weight sum.
 * The remainder of the division is carried over in sc_error so that
 * no service is lost to rounding.  sc_wsum must not be zero.
 */
static inline void
g_bfq_inc_vtime(struct g_bfq_softc *sc, uint64_t service)
{
	uint64_t scaled;

	scaled = (service << G_BFQ_SERVICE_SHIFT) + sc->sc_error;

	sc->sc_vtime += scaled / sc->sc_wsum;
	sc->sc_error = scaled % sc->sc_wsum;
}

/*
 * Pull back the finish time of qp by the part of its budget that was
 * not consumed, then reset the service counter.
 */
static inline void
g_bfq_update_finish(struct g_bfq_queue *qp)
{
	uint64_t unused;

	if (qp->q_budget > qp->q_service) {
		unused = qp->q_budget - qp->q_service;
		qp->q_finish -= g_bfq_delta(unused, qp->q_weight);
	}

	qp->q_service = 0;
}

/*
 * Set the finish time of qp to its start time plus the virtual time
 * needed to receive service at the queue's weight.  The queue must
 * have no service accounted yet.
 */
static inline void
g_bfq_calc_finish(struct g_bfq_queue *qp, uint64_t service)
{
	uint64_t span;

	KASSERT(!qp->q_service, ("q_service not zero"));

	span = g_bfq_delta(service, qp->q_weight);
	qp->q_finish = qp->q_start + span;
}

/*
 * Unlink qp from the tree rooted at root, which must be the tree it
 * is currently on, and mark it as being on no tree.
 */
static inline void
g_bfq_extract(struct vtree *root, struct g_bfq_queue *qp)
{

	KASSERT(qp->q_tree == root, ("Extracting from the wrong tree"));

	RB_REMOVE(vtree, root, qp);
	qp->q_tree = NULL;
}

/*
 * Remove qp from the idle tree, moving the cached first/last idle
 * pointers to the neighbour of qp when qp is one of them.
 */
static void
g_bfq_idle_extract(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{
	struct vtree *idle = &sc->sc_idle;

	KASSERT(qp->q_tree == &sc->sc_idle, ("Not on idle tree"));

	/* Both tests are needed: qp may be the only idle queue. */
	if (sc->sc_firstidle == qp)
		sc->sc_firstidle = RB_NEXT(vtree, idle, qp);
	if (sc->sc_lastidle == qp)
		sc->sc_lastidle = RB_PREV(vtree, idle, qp);

	g_bfq_extract(idle, qp);
}

/*
 * Stop tracking qp in the scheduler: remove its weight from the
 * weight sum, decrease the queue count and release the reference.
 * qp must not be touched after this call.
 */
static void
g_bfq_forget(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{

	sc->sc_wsum -= qp->q_weight;
	sc->sc_nqueues--;

	g_bfq_queue_put(qp);
}

static void
g_bfq_forget_idle(struct g_bfq_softc *sc)
{
	struct g_bfq_queue *firstidle = sc->sc_firstidle;

	if (firstidle && !g_bfq_gt(firstidle->q_finish, sc->sc_vtime)) {
		g_bfq_idle_extract(sc, firstidle);
		g_bfq_forget(sc, firstidle);
	}
}

/*
 * Link qp into the tree rooted at root and record the tree in q_tree.
 */
static void
g_bfq_insert(struct vtree *root, struct g_bfq_queue *qp)
{

	(void)RB_INSERT(vtree, root, qp);
	qp->q_tree = root;
}

static void
g_bfq_idle_insert(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{
	struct g_bfq_queue *firstidle = sc->sc_firstidle;
	struct g_bfq_queue *lastidle = sc->sc_lastidle;

	if (!firstidle || g_bfq_gt(firstidle->q_finish, qp->q_finish))
		sc->sc_firstidle = qp;
	if (!lastidle || g_bfq_gt(qp->q_finish, lastidle->q_finish))
		sc->sc_lastidle = qp;

	g_bfq_insert(&sc->sc_idle, qp);
}

/*
 * Given qp and one of its two children child, update the qp->q_minstart
 * if necessary.
 */
static inline void
g_bfq_update_min(struct g_bfq_queue *qp, struct g_bfq_queue *child)
{

	if (child && g_bfq_gt(qp->q_minstart, child->q_minstart))
		qp->q_minstart = child->q_minstart;
}

/*
 * Recompute qp->q_minstart from qp's own start time and from the
 * q_minstart of its children.  To be called when qp changes position
 * or one of its children has moved; the left and right subtrees are
 * assumed to hold correct q_minstart values.
 */
static inline void
g_bfq_update_active_node(struct g_bfq_queue *qp)
{
	struct g_bfq_queue *right, *left;

	right = RB_RIGHT(qp, q_node);
	left = RB_LEFT(qp, q_node);

	qp->q_minstart = qp->q_start;
	g_bfq_update_min(qp, right);
	g_bfq_update_min(qp, left);
}

/*
 * qp must be the deepest modified node after an update.  This function
 * updates its ts_minstart using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root.  The only nodes that may have changed
 * are those in the path or their siblings.
 */
static void
g_bfq_update_active_tree(struct g_bfq_queue *qp)
{
	struct g_bfq_queue *parent;

up:
	g_bfq_update_active_node(qp);

	parent = RB_PARENT(qp, q_node);
	if (!parent)
		return;

	if (qp == RB_LEFT(parent, q_node) && RB_RIGHT(parent, q_node))
		g_bfq_update_active_node(RB_RIGHT(parent, q_node));
	else if (RB_LEFT(parent, q_node))
		g_bfq_update_active_node(RB_LEFT(parent, q_node));

	qp = parent;
	goto up;
}

/*
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time.
 */
static void
g_bfq_active_insert(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{

	g_bfq_insert(&sc->sc_active, qp);

	if (RB_LEFT(qp, q_node))
		qp = RB_LEFT(qp, q_node);
	else if (RB_RIGHT(qp, q_node))
		qp = RB_RIGHT(qp, q_node);

	g_bfq_update_active_tree(qp);
}

/*
 * First step of an extraction from an rb tree: look for the node that
 * will replace qp and return the deepest node that the following
 * modifications to the tree can touch.  Returns NULL if qp is the
 * last node in the tree.
 */
static struct g_bfq_queue *
g_bfq_find_deepest(struct g_bfq_queue *qp)
{
	struct g_bfq_queue *lchild, *rchild, *succ;

	lchild = RB_LEFT(qp, q_node);
	rchild = RB_RIGHT(qp, q_node);

	/* Leaf: only the parent is affected. */
	if (rchild == NULL && lchild == NULL)
		return (RB_PARENT(qp, q_node));

	/* One child only: it takes the place of qp. */
	if (rchild == NULL)
		return (lchild);
	if (lchild == NULL)
		return (rchild);

	/* Two children: the in-order successor replaces qp. */
	succ = RB_NEXT(vtree, NULL, qp);
	if (RB_RIGHT(succ, q_node) != NULL)
		return (RB_RIGHT(succ, q_node));
	if (RB_PARENT(succ, q_node) != qp)
		return (RB_PARENT(succ, q_node));
	return (succ);
}

static void
g_bfq_active_extract(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{
	struct g_bfq_queue *deepest;

	deepest = g_bfq_find_deepest(qp);
	g_bfq_extract(&sc->sc_active, qp);

	if (deepest)
		g_bfq_update_active_tree(deepest);
}

/*
 * Apply a pending weight change (q_newweight) to qp, keeping the
 * weight sum of the scheduler consistent.
 */
static inline void
g_bfq_update_weight(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{

	if (qp->q_newweight == qp->q_weight)
		return;

	sc->sc_wsum += qp->q_newweight - qp->q_weight;
	qp->q_weight = qp->q_newweight;
}

/*
 * (Re-)insert qp in the active tree.  Depending on where qp comes
 * from (under service, active tree, idle tree or no tree at all) it
 * is first detached and its start time is chosen; then the weight is
 * updated, the finish time is computed from the budget and qp is
 * inserted in the active tree.  If qp was the queue under service,
 * sc_current is cleared.
 */
static void
g_bfq_enqueue(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{

	if (qp == sc->sc_current) {
		KASSERT(!qp->q_tree, ("Current queue is on a tree"));
		/*
		 * If we are requeueing the current entity we have
		 * to take care of not charging to it service it has
		 * not received.
		 */
		g_bfq_update_finish(qp);
		qp->q_start = qp->q_finish;
		sc->sc_current = NULL;
	} else if (qp->q_tree == &sc->sc_active) {
		/* Already active: keep q_start, just reposition. */
		g_bfq_active_extract(sc, qp);
	} else if (qp->q_tree) {
		/*
		 * Must be on the idle tree, g_bfq_idle_extract() will
		 * check for that.  Restart from the later of the
		 * virtual time and the old finish time.
		 */
		g_bfq_idle_extract(sc, qp);
		if (g_bfq_gt(sc->sc_vtime, qp->q_finish))
			qp->q_start = sc->sc_vtime;
		else
			qp->q_start = qp->q_finish;
	} else {
		/*
		 * The finish time of the entity can be invalid, and
		 * it is in the past for sure, otherwise the entity
		 * would have been on the idle tree.
		 * The queue becomes known to the scheduler here:
		 * account for it and take a reference
		 * (dropped in g_bfq_forget()).
		 */
		qp->q_start = sc->sc_vtime;
		sc->sc_nqueues++;
		sc->sc_wsum += qp->q_weight;
		g_sched_priv_ref(qp);
	}

	//printf("ENQ: %d S=%llu F=%llu P=%llu\n", ts_id(qp), qp->q_start,
	//       qp->q_finish, sc->sc_vtime);

	g_bfq_update_weight(sc, qp);
	g_bfq_calc_finish(qp, qp->q_budget);
	g_bfq_active_insert(sc, qp);
}

/*
 * Deactivate qp: park it on the idle tree if its finish time is
 * still in the future with respect to the virtual time, otherwise
 * forget it.  If qp was the queue under service, sc_current is
 * cleared.
 */
static inline void
g_bfq_go_idle(struct g_bfq_softc *sc, struct g_bfq_queue *qp)
{

	if (!g_bfq_gt(qp->q_finish, sc->sc_vtime)) {
		g_bfq_forget(sc, qp);
	} else {
		g_bfq_idle_insert(sc, qp);
	}

	/* Only the pointer value is used here, qp may be gone. */
	if (sc->sc_current == qp)
		sc->sc_current = NULL;
}

/*
 * Search the first schedulable entity, starting from the root of the tree
 * and going on the left every time on this side there is a subtree with at
 * least one eligible (start <= vtime) entity.  The path on the right is
 * followed only if a) the left subtree contains no eligible entity and b)
 * no eligible entity has been found yet.
 */
static struct g_bfq_queue *
g_bfq_first_active(struct g_bfq_softc *sc)
{
	struct g_bfq_queue *qp = RB_ROOT(&sc->sc_active), *next, *first = NULL;

	while (qp != NULL) {
		if (!g_bfq_gt(qp->q_start, sc->sc_vtime))
			first = qp;

		KASSERT(!g_bfq_gt(qp->q_minstart, sc->sc_vtime),
		    ("No eligible entries on this subtree"));

		if (RB_LEFT(qp, q_node)) {
			next = RB_LEFT(qp, q_node);
			if (!g_bfq_gt(next->q_minstart, sc->sc_vtime)) {
				qp = next;
				continue;
			}
		}
		if (first != NULL)
			break;
		qp = RB_RIGHT(qp, q_node);
	}

	return (first);
}

/*
 * If no active queue is eligible, jump the virtual time forward to
 * the smallest start time in the active tree, so that at least one
 * entity becomes eligible.  The active tree must not be empty.
 */
static void
g_bfq_update_vtime(struct g_bfq_softc *sc)
{
	struct g_bfq_queue *root;

	root = RB_ROOT(&sc->sc_active);
	if (!g_bfq_gt(root->q_minstart, sc->sc_vtime))
		return;

	sc->sc_vtime = root->q_minstart;
	sc->sc_error = 0;
	g_bfq_forget_idle(sc);
}

/*
 * Select the next queue to serve and remove it from the active tree.
 * The virtual time is advanced first, if needed, so that at least one
 * active queue is eligible; the eligible queue found by
 * g_bfq_first_active() is then extracted and returned.
 * Returns NULL if the active tree is empty.
 *
 * The assertion that used to open this function referenced an
 * identifier ("extract") that is not declared anywhere, which broke
 * builds with INVARIANTS; it is replaced by a check on the result of
 * the search, which is what the extraction below depends on.
 */
static struct g_bfq_queue *
g_bfq_getnext(struct g_bfq_softc *sc)
{
	struct g_bfq_queue *qp;

	if (RB_EMPTY(&sc->sc_active))
		return (NULL);

	g_bfq_update_vtime(sc);
	qp = g_bfq_first_active(sc);
	KASSERT(qp != NULL, ("No eligible queue on a non-empty active tree"));
	g_bfq_active_extract(sc, qp);

	KASSERT(!g_bfq_gt(qp->q_start, sc->sc_vtime),
	    ("Selected an entity that was not eligible"));

	return (qp);
}

/*
 * Account for served units of service given to the queue under
 * service (which must be qp): charge them to the queue, advance the
 * virtual time and drop an expired idle queue, if any.
 */
static void
g_bfq_served(struct g_bfq_softc *sc, struct g_bfq_queue *qp, uint64_t served)
{
	struct g_bfq_queue *active;

	active = sc->sc_current;

	KASSERT(active == qp, ("Wrong qp"));
	KASSERT(active, ("No current queue"));
	KASSERT(active->q_budget >= active->q_service + served,
	    ("Too much service received"));

	active->q_service += served;

	g_bfq_inc_vtime(sc, served);
	g_bfq_forget_idle(sc);
}

/*
 * Return 1 if the time slice of qp is over, 0 otherwise.  The slice
 * only counts once a request has completed in the current budget
 * (G_FLAG_COMPLETED), since q_slice_end is set at that point.
 */
static inline int
g_bfq_queue_expired(struct g_bfq_queue *qp)
{

	return ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0);
}

/*
 * Decide whether it is worth waiting for a new request from qp after
 * bp, the last request dispatched from it.  Returns 1 to anticipate,
 * 0 otherwise.  Anticipation is skipped:
 * - for writes, when write anticipation is disabled;
 * - when the average think time of the queue exceeds the wait time;
 * - when the average seek distance of the queue exceeds 8192.
 *
 * The command is compared with == rather than tested with &: bio_cmd
 * holds a single command, and a bitwise test would also match any
 * other command whose encoding shares a bit with BIO_WRITE.
 */
static inline int
g_bfq_should_anticipate(struct g_bfq_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && bp->bio_cmd == BIO_WRITE)
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}

static void
g_bfq_expire(struct g_bfq_softc *sc, int reason)
{
	struct g_bfq_queue *qp = sc->sc_current;
	struct bio *first;
	uint64_t new_budget;

	switch (reason) {
	case G_BUDGET_EXHAUSTED:
	case G_TIME_EXHAUSTED:
		new_budget = qmax(qp->q_budget + G_BFQ_BUDGET_STEP,
		    1024 * get_bounded(&me.quantum_kb, 0));
		break;
	case G_TIMED_OUT:
		if (qp->q_budget > G_BFQ_BUDGET_MIN + G_BFQ_BUDGET_STEP) {
			new_budget = qp->q_budget - G_BFQ_BUDGET_STEP;
			break;
		}
	default:
		new_budget = qp->q_budget;
		break;
	}

//	printf("%ld -> %ld\n", (unsigned long)qp->q_budget,
//	    (unsigned long)new_budget);

	first = gs_bioq_first(&qp->q_bioq);
	if (first) {
		qp->q_budget = qmax(new_budget, first->bio_length);
		g_bfq_enqueue(sc, qp);
	} else {
		qp->q_budget = new_budget;
		g_bfq_go_idle(sc, qp);
	}
}

/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued: returns the bio to
 * dispatch, or NULL if nothing can be dispatched now.
 * When force is set (or the scheduler is bypassed) the queue depth
 * limit and the anticipation state are ignored.
 */
static struct bio *
g_bfq_next(void *data, int force)
{
	struct g_bfq_softc *sc = data;
	struct g_bfq_queue *qp;
	struct bio *bp;

	qp = sc->sc_current;
	if (me.bypass == 0 && !force) {
		/* Respect the limit on requests in the driver. */
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		/*
		 * NOTE(review): the reference is dropped here, but
		 * sc_current is only overwritten below by the result
		 * of g_bfq_getnext(), and sc_wsum/sc_nqueues are not
		 * updated as g_bfq_forget() would do.  Confirm that
		 * this is the intended accounting for a forced
		 * dispatch while anticipating.
		 */
		g_bfq_queue_put(qp);
		qp = NULL;
	}

	if (qp == NULL) {
retry:
		qp = g_bfq_getnext(sc);
		sc->sc_current = qp;
		if (qp == NULL)
			return (NULL); /* No queues at all, return. */
		/* Select the new queue for service. */
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = gs_bioq_first(&qp->q_bioq);	/* surely not NULL */
	if (bp->bio_length > qp->q_budget - qp->q_service) {
		/*
		 * This awkward retry loop is there for the following
		 * corner case: a new request is inserted at the head
		 * of the bioq of sc_current, and the queue has not
		 * enough budget to serve it.  g_bfq_expire() requeues
		 * the queue with a budget that covers its first
		 * request, and the scheduling decision is retried.
		 */
		g_bfq_expire(sc, G_BUDGET_EXHAUSTED);
		goto retry;
	}

	/* Dispatch bp and charge its length to the queue. */
	gs_bioq_remove(&qp->q_bioq, bp);
	g_bfq_served(sc, qp, bp->bio_length);

 	if (g_bfq_queue_expired(qp)) {
		g_bfq_expire(sc, G_TIME_EXHAUSTED);
	} else if (gs_bioq_first(&qp->q_bioq)) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_bfq_should_anticipate(qp, bp)) {
			/*
			 * Anticipate: the timer is started in
			 * g_bfq_done(), when the request completes.
			 */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* Do not anticipate. */
			g_bfq_go_idle(sc, qp);
		}
	}

	/*
	 * If sc_current != NULL, its q_status should always
	 * be correct here...
	 */
	sc->sc_in_flight++;

	return (bp);
}

/*
 * Update the think time statistics of qp on a request arrival.
 * Only arrivals for the queue under service are considered; the
 * sample is capped at twice the anticipation wait time, and it is
 * recorded only once q_bionum has grown above 7.
 */
static inline void
g_bfq_update_thinktime(struct g_bfq_queue *qp)
{
	int elapsed, wait;

	elapsed = ticks - qp->q_lastsub;
	wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_current != qp)
		return;

	qp->q_lastsub = ticks;
	if (elapsed > 2 * wait)
		elapsed = 2 * wait;
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, elapsed);
}

/*
 * Update the seek distance statistics of qp with the distance between
 * the end of the previous request and the start of bp, capped at
 * 8192 * 8.  The sample is recorded only once q_bionum has grown
 * above 7; the end offset of bp is always remembered.
 */
static inline void
g_bfq_update_seekdist(struct g_bfq_queue *qp, struct bio *bp)
{
	off_t gap;

	gap = (qp->q_lastoff > bp->bio_offset) ?
	    qp->q_lastoff - bp->bio_offset :
	    bp->bio_offset - qp->q_lastoff;
	if (gap > 8192 * 8)
		gap = 8192 * 8;

	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, gap);
}

/*
 * Insert a new request in qp, activating the queue if necessary.
 * If bp becomes the first request of a queue that is not under
 * service, the budget is raised (if needed) to cover bp and the
 * queue is (re-)enqueued in the active tree.
 */
static void
g_bfq_disksort(struct g_bfq_softc *sc, struct g_bfq_queue *qp, struct bio *bp)
{

	gs_bioq_disksort(&qp->q_bioq, bp);

	/*
	 * Nothing else to do if the front request did not change, or
	 * if qp is under service: its budget cannot be updated now.
	 */
	if (gs_bioq_first(&qp->q_bioq) != bp || sc->sc_current == qp)
		return;

	/* Make room for the first bio and (re-)activate the queue. */
	qp->q_budget = qmax(qp->q_budget, bp->bio_length);
	g_bfq_enqueue(sc, qp);
}

/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client and insert the request
 * in it, activating the queue if needed.  If the request is the one
 * we were anticipating for, stop the anticipation timer and mark the
 * queue ready.  Finally update the per-queue statistics.
 * Returns 0 if the request was queued, -1 if it was not (bypass mode
 * or no queue available); in that case the caller will take care of
 * the request.
 */
static int
g_bfq_start(void *data, struct bio *bp)
{
	struct g_bfq_softc *sc;
	struct g_bfq_queue *qp;
	int was_empty;

	/* Bypass the scheduler. */
	if (me.bypass)
		return (-1);

	sc = data;
	qp = g_bfq_queue_get(sc, bp);
	if (qp == NULL) {
		/* Allocation failed, tell upstream. */
		return (-1);
	}

	was_empty = (gs_bioq_first(&qp->q_bioq) == NULL);

	/* Remember the queue, g_bfq_done() needs it. */
	bp->bio_caller1 = qp;
	g_bfq_disksort(sc, qp, bp);

	if (was_empty && sc->sc_current == qp) {
		/* We were anticipating this request... */
		qp->q_status = G_QUEUE_READY;
		callout_stop(&sc->sc_wait);
	}

	/* Decaying request counter, then the heuristics. */
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);
	g_bfq_update_thinktime(qp);
	g_bfq_update_seekdist(qp, bp);

	return (0);
}

/*
 * Callout executed when a queue times out anticipating a new request:
 * expire the queue under service, turn the hit optimistically counted
 * by g_bfq_done() into a miss, and restart dispatching.
 */
static void
g_bfq_wait_timeout(void *data)
{
	struct g_bfq_softc *sc;
	struct g_geom *gp;

	sc = data;
	gp = sc->sc_geom;

	g_sched_lock(gp);
	/* We can race with other events: is sc_current still there? */
	if (sc->sc_current != NULL) {
		g_bfq_expire(sc, G_TIMED_OUT);
		me.wait_miss++;		/* record the miss */
		me.wait_hit--;
	}
	g_sched_dispatch(gp);
	g_sched_unlock(gp);
}

/*
 * Module glue: allocate descriptor, initialize its fields.
 */
static void *
g_bfq_init(struct g_geom *geom)
{
	struct g_bfq_softc *sc;

	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	sc->sc_geom = geom;

	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
	RB_INIT(&sc->sc_active);
	RB_INIT(&sc->sc_idle);
	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
	me.units++;

	return (sc);
}

static void
g_bfq_hash_unref(void *data)
{
	struct g_bfq_softc *sc = data;
	struct g_bfq_queue *qp, *tmp;

	RB_FOREACH_SAFE(qp, vtree, &sc->sc_idle, tmp) {
		g_bfq_idle_extract(sc, qp);
		g_bfq_forget(sc, qp);
	}

	KASSERT(sc->sc_current == NULL, ("still a queue under service"));
	KASSERT(RB_EMPTY(&sc->sc_active), ("still scheduled queues"));
	KASSERT(RB_EMPTY(&sc->sc_idle), ("still idle queues"));
}

/*
 * Module glue -- drain the callout, unlink the descriptor from the
 * list of instances and free it.  No queue may be under service or
 * on either tree at this point.
 */
static void
g_bfq_fini(void *data)
{
	struct g_bfq_softc *sc;

	sc = data;

	/* Make sure the timeout handler is not running anymore. */
	callout_drain(&sc->sc_wait);

	KASSERT(sc->sc_current == NULL, ("still a queue under service"));
	KASSERT(RB_EMPTY(&sc->sc_active), ("still scheduled queues"));
	KASSERT(RB_EMPTY(&sc->sc_idle), ("still idle queues"));

	me.units--;
	LIST_REMOVE(sc, sc_next);
	free(sc, M_GEOM_SCHED);
}

/*
 * Called when a request terminates.
 * If the request belongs to the queue under service and that queue
 * is waiting for a completion to begin anticipating, start the
 * anticipation timer (and the time slice, on the first completion in
 * the current budget); otherwise try to dispatch more requests.
 * In both cases drop the reference the bio held on its queue.
 */
static void
g_bfq_done(void *data, struct bio *bp)
{
	struct g_bfq_softc *sc;
	struct g_bfq_queue *qp;

	sc = data;
	qp = bp->bio_caller1;
	sc->sc_in_flight--;

	if (qp != sc->sc_current || qp->q_status != G_QUEUE_BUSY) {
		g_sched_dispatch(sc->sc_geom);
	} else {
		if ((qp->q_flags & G_FLAG_COMPLETED) == 0) {
			/* First completion: the slice starts now. */
			qp->q_flags |= G_FLAG_COMPLETED;
			/* in case we want to make the slice adaptive */
			qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
			qp->q_slice_end = ticks + qp->q_slice_duration;
		}

		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
		/* Counted as a hit; the timeout handler takes it back. */
		me.wait_hit++;
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_bfq_wait_timeout, sc);
	}

	/* Release the bio's reference to the queue. */
	g_bfq_queue_put(qp);
}

static void
g_bfq_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{

	if (!indent) {   /* plaintext */
		sbuf_printf(sb, " units %d queues %d",
			me.units, me.queues);
        }
}

/*
 * Descriptor of the BFQ algorithm: its name, the size of the
 * per-queue private data (one struct g_bfq_queue each) and the
 * callbacks implemented in this file.
 */
static struct g_gsched g_bfq = {
	.gs_name = "bfq",
	.gs_priv_size = sizeof(struct g_bfq_queue),
	.gs_init = g_bfq_init,
	.gs_fini = g_bfq_fini,
	.gs_start = g_bfq_start,
	.gs_done = g_bfq_done,
	.gs_next = g_bfq_next,
	.gs_dumpconf = g_bfq_dumpconf,
	.gs_init_class = g_bfq_init_class,
	.gs_fini_class = g_bfq_fini_class,
	.gs_hash_unref = g_bfq_hash_unref,
};

/* Declare the "bfq" scheduler module, backed by the descriptor above. */
DECLARE_GSCHED_MODULE(bfq, &g_bfq);
