android_kernel_samsung_msm8976/block/bfq-sched.c

/*
* BFQ: Hierarchical B-WF2Q+ scheduler.
*
* Based on ideas and code from CFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
*
* Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
*/
#ifdef CONFIG_CGROUP_BFQIO
#define for_each_entity(entity) \
for (; entity != NULL; entity = entity->parent)
#define for_each_entity_safe(entity, parent) \
for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
int extract,
struct bfq_data *bfqd);
static inline void bfq_update_budget(struct bfq_entity *next_in_service)
{
struct bfq_entity *bfqg_entity;
struct bfq_group *bfqg;
struct bfq_sched_data *group_sd;
BUG_ON(next_in_service == NULL);
group_sd = next_in_service->sched_data;
bfqg = container_of(group_sd, struct bfq_group, sched_data);
/*
* bfq_group's my_entity field is not NULL only if the group
* is not the root group. We must not touch the root entity
* as it must never become an in-service entity.
*/
bfqg_entity = bfqg->my_entity;
if (bfqg_entity != NULL)
bfqg_entity->budget = next_in_service->budget;
}
static int bfq_update_next_in_service(struct bfq_sched_data *sd)
{
struct bfq_entity *next_in_service;
if (sd->in_service_entity != NULL)
/* will update/requeue at the end of service */
return 0;
/*
* NOTE: this can be improved in many ways, such as returning
* 1 (and thus propagating upwards the update) only when the
* budget changes, or caching the bfqq that will be scheduled
 * next from this subtree. For now we worry more about
* correctness than about performance...
*/
next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
sd->next_in_service = next_in_service;
if (next_in_service != NULL)
bfq_update_budget(next_in_service);
return 1;
}
static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
struct bfq_entity *entity)
{
BUG_ON(sd->next_in_service != entity);
}
#else
#define for_each_entity(entity) \
for (; entity != NULL; entity = NULL)
#define for_each_entity_safe(entity, parent) \
for (parent = NULL; entity != NULL; entity = parent)
static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
{
return 0;
}
static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
struct bfq_entity *entity)
{
}
static inline void bfq_update_budget(struct bfq_entity *next_in_service)
{
}
#endif
/*
* Shift for timestamp calculations. This actually limits the maximum
* service allowed in one timestamp delta (small shift values increase it),
* the maximum total weight that can be used for the queues in the system
* (big shift values increase it), and the period of virtual time
* wraparounds.
*/
#define WFQ_SERVICE_SHIFT 22
/**
* bfq_gt - compare two timestamps.
* @a: first ts.
* @b: second ts.
*
* Return @a > @b, dealing with wrapping correctly.
*/
static inline int bfq_gt(u64 a, u64 b)
{
return (s64)(a - b) > 0;
}
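/*
 * Example: right after a wraparound, with a == 1 and b == ULLONG_MAX,
 * a - b == 2, so (s64)(a - b) > 0 and bfq_gt() still reports a > b,
 * whereas a plain unsigned comparison would give the wrong answer.
 */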
static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
struct bfq_queue *bfqq = NULL;
BUG_ON(entity == NULL);
if (entity->my_sched_data == NULL)
bfqq = container_of(entity, struct bfq_queue, entity);
return bfqq;
}
/**
* bfq_delta - map service into the virtual time domain.
* @service: amount of service.
* @weight: scale factor (weight of an entity or weight sum).
*/
static inline u64 bfq_delta(unsigned long service,
unsigned long weight)
{
u64 d = (u64)service << WFQ_SERVICE_SHIFT;
do_div(d, weight);
return d;
}
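/*
 * Worked example with made-up values: 1024 sectors of service at
 * weight 4 map to (1024 << 22) / 4 = 1 << 30 units of virtual time,
 * while the same service at weight 8 maps to half that amount; the
 * heavier an entity, the more slowly it consumes virtual time.
 */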
/**
* bfq_calc_finish - assign the finish time to an entity.
* @entity: the entity to act upon.
* @service: the service to be charged to the entity.
*/
static inline void bfq_calc_finish(struct bfq_entity *entity,
unsigned long service)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
BUG_ON(entity->weight == 0);
entity->finish = entity->start +
bfq_delta(service, entity->weight);
if (bfqq != NULL) {
bfq_log_bfqq(bfqq->bfqd, bfqq,
"calc_finish: serv %lu, w %d",
service, entity->weight);
bfq_log_bfqq(bfqq->bfqd, bfqq,
"calc_finish: start %llu, finish %llu, delta %llu",
entity->start, entity->finish,
bfq_delta(service, entity->weight));
}
}
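/*
 * Worked example with made-up values: an entity with start == 1000
 * and weight == 2 that is charged a 512-sector service gets
 * finish = 1000 + (512 << 22) / 2 = 1000 + (1 << 30).
 */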
/**
* bfq_entity_of - get an entity from a node.
* @node: the node field of the entity.
*
 * Convert a node pointer to the corresponding entity. This is used only
* to simplify the logic of some functions and not as the generic
* conversion mechanism because, e.g., in the tree walking functions,
* the check for a %NULL value would be redundant.
*/
static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
struct bfq_entity *entity = NULL;
if (node != NULL)
entity = rb_entry(node, struct bfq_entity, rb_node);
return entity;
}
/**
* bfq_extract - remove an entity from a tree.
* @root: the tree root.
* @entity: the entity to remove.
*/
static inline void bfq_extract(struct rb_root *root,
struct bfq_entity *entity)
{
BUG_ON(entity->tree != root);
entity->tree = NULL;
rb_erase(&entity->rb_node, root);
}
/**
* bfq_idle_extract - extract an entity from the idle tree.
* @st: the service tree of the owning @entity.
* @entity: the entity being removed.
*/
static void bfq_idle_extract(struct bfq_service_tree *st,
struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *next;
BUG_ON(entity->tree != &st->idle);
if (entity == st->first_idle) {
next = rb_next(&entity->rb_node);
st->first_idle = bfq_entity_of(next);
}
if (entity == st->last_idle) {
next = rb_prev(&entity->rb_node);
st->last_idle = bfq_entity_of(next);
}
bfq_extract(&st->idle, entity);
if (bfqq != NULL)
list_del(&bfqq->bfqq_list);
}
/**
* bfq_insert - generic tree insertion.
* @root: tree root.
* @entity: entity to insert.
*
* This is used for the idle and the active tree, since they are both
* ordered by finish time.
*/
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
struct bfq_entity *entry;
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
BUG_ON(entity->tree != NULL);
while (*node != NULL) {
parent = *node;
entry = rb_entry(parent, struct bfq_entity, rb_node);
if (bfq_gt(entry->finish, entity->finish))
node = &parent->rb_left;
else
node = &parent->rb_right;
}
rb_link_node(&entity->rb_node, parent, node);
rb_insert_color(&entity->rb_node, root);
entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
* @entity: the entity to update.
* @node: one of its children.
*
* This function is called when @entity may store an invalid value for
* min_start due to updates to the active tree. The function assumes
* that the subtree rooted at @node (which may be its left or its right
* child) has a valid min_start value.
*/
static inline void bfq_update_min(struct bfq_entity *entity,
struct rb_node *node)
{
struct bfq_entity *child;
if (node != NULL) {
child = rb_entry(node, struct bfq_entity, rb_node);
if (bfq_gt(entity->min_start, child->min_start))
entity->min_start = child->min_start;
}
}
/**
* bfq_update_active_node - recalculate min_start.
* @node: the node to update.
*
 * @node may have changed position or one of its children may have
 * moved; this function updates its min_start value. The left and right subtrees
* are assumed to hold a correct min_start value.
*/
static inline void bfq_update_active_node(struct rb_node *node)
{
struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
entity->min_start = entity->start;
bfq_update_min(entity, node->rb_right);
bfq_update_min(entity, node->rb_left);
}
/**
* bfq_update_active_tree - update min_start for the whole active tree.
* @node: the starting node.
*
* @node must be the deepest modified node after an update. This function
* updates its min_start using the values held by its children, assuming
* that they did not change, and then updates all the nodes that may have
* changed in the path to the root. The only nodes that may have changed
* are the ones in the path or their siblings.
*/
static void bfq_update_active_tree(struct rb_node *node)
{
struct rb_node *parent;
up:
bfq_update_active_node(node);
parent = rb_parent(node);
if (parent == NULL)
return;
if (node == parent->rb_left && parent->rb_right != NULL)
bfq_update_active_node(parent->rb_right);
else if (parent->rb_left != NULL)
bfq_update_active_node(parent->rb_left);
node = parent;
goto up;
}
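/*
 * Illustration: after @node moves, only the min_start values on the
 * path from @node to the root can be stale. Each iteration restores
 * the invariant
 *
 *   node->min_start = min(node->start,
 *                         left->min_start, right->min_start)
 *
 * for the node and for its sibling, and then, on the next iteration,
 * for their parent, walking up until the root is reached.
 */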
static void bfq_weights_tree_add(struct bfq_data *bfqd,
struct bfq_entity *entity,
struct rb_root *root);
static void bfq_weights_tree_remove(struct bfq_data *bfqd,
struct bfq_entity *entity,
struct rb_root *root);
/**
* bfq_active_insert - insert an entity in the active tree of its
* group/device.
* @st: the service tree of the entity.
* @entity: the entity being inserted.
*
 * The active tree is ordered by finish time, but an extra key is kept
 * in each node, containing the minimum value for the start times of
* its children (and the node itself), so it's possible to search for
* the eligible node with the lowest finish time in logarithmic time.
*/
static void bfq_active_insert(struct bfq_service_tree *st,
struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_CGROUP_BFQIO
struct bfq_sched_data *sd = NULL;
struct bfq_group *bfqg = NULL;
struct bfq_data *bfqd = NULL;
#endif
bfq_insert(&st->active, entity);
if (node->rb_left != NULL)
node = node->rb_left;
else if (node->rb_right != NULL)
node = node->rb_right;
bfq_update_active_tree(node);
#ifdef CONFIG_CGROUP_BFQIO
sd = entity->sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
BUG_ON(!bfqg);
bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
if (bfqq != NULL)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_CGROUP_BFQIO
else { /* bfq_group */
BUG_ON(!bfqd);
bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
}
if (bfqg != bfqd->root_group) {
BUG_ON(!bfqg);
BUG_ON(!bfqd);
bfqg->active_entities++;
if (bfqg->active_entities == 2)
bfqd->active_numerous_groups++;
}
#endif
}
/**
* bfq_ioprio_to_weight - calc a weight from an ioprio.
* @ioprio: the ioprio value to convert.
*/
static inline unsigned short bfq_ioprio_to_weight(int ioprio)
{
BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
return IOPRIO_BE_NR - ioprio;
}
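/*
 * With IOPRIO_BE_NR == 8, ioprio 0 (the highest best-effort priority)
 * maps to weight 8 and ioprio 7 to weight 1, so, bandwidth being
 * proportional to weight, an ioprio-0 queue gets eight times the
 * service of an ioprio-7 queue in the same class.
 */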
/**
* bfq_weight_to_ioprio - calc an ioprio from a weight.
* @weight: the weight value to convert.
*
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal to
 * or larger than IOPRIO_BE_NR.
*/
static inline unsigned short bfq_weight_to_ioprio(int weight)
{
BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}
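/*
 * Examples, again with IOPRIO_BE_NR == 8: weight 1 maps back to
 * ioprio 7 and weight 8 to ioprio 0, while any larger weight, having
 * no ioprio equivalent, is clamped to the escape value 0.
 */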
static inline void bfq_get_entity(struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
if (bfqq != NULL) {
atomic_inc(&bfqq->ref);
bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
bfqq, atomic_read(&bfqq->ref));
}
}
/**
* bfq_find_deepest - find the deepest node that an extraction can modify.
* @node: the node being removed.
*
* Do the first step of an extraction in an rb tree, looking for the
* node that will replace @node, and returning the deepest node that
* the following modifications to the tree can touch. If @node is the
* last node in the tree return %NULL.
*/
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
struct rb_node *deepest;
if (node->rb_right == NULL && node->rb_left == NULL)
deepest = rb_parent(node);
else if (node->rb_right == NULL)
deepest = node->rb_left;
else if (node->rb_left == NULL)
deepest = node->rb_right;
else {
deepest = rb_next(node);
if (deepest->rb_right != NULL)
deepest = deepest->rb_right;
else if (rb_parent(deepest) != node)
deepest = rb_parent(deepest);
}
return deepest;
}
/**
* bfq_active_extract - remove an entity from the active tree.
* @st: the service_tree containing the tree.
* @entity: the entity being removed.
*/
static void bfq_active_extract(struct bfq_service_tree *st,
struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node;
#ifdef CONFIG_CGROUP_BFQIO
struct bfq_sched_data *sd = NULL;
struct bfq_group *bfqg = NULL;
struct bfq_data *bfqd = NULL;
#endif
node = bfq_find_deepest(&entity->rb_node);
bfq_extract(&st->active, entity);
if (node != NULL)
bfq_update_active_tree(node);
#ifdef CONFIG_CGROUP_BFQIO
sd = entity->sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
BUG_ON(!bfqg);
bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
if (bfqq != NULL)
list_del(&bfqq->bfqq_list);
#ifdef CONFIG_CGROUP_BFQIO
else { /* bfq_group */
BUG_ON(!bfqd);
bfq_weights_tree_remove(bfqd, entity,
&bfqd->group_weights_tree);
}
if (bfqg != bfqd->root_group) {
BUG_ON(!bfqg);
BUG_ON(!bfqd);
BUG_ON(!bfqg->active_entities);
bfqg->active_entities--;
if (bfqg->active_entities == 1) {
BUG_ON(!bfqd->active_numerous_groups);
bfqd->active_numerous_groups--;
}
}
#endif
}
/**
* bfq_idle_insert - insert an entity into the idle tree.
* @st: the service tree containing the tree.
* @entity: the entity to insert.
*/
static void bfq_idle_insert(struct bfq_service_tree *st,
struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct bfq_entity *first_idle = st->first_idle;
struct bfq_entity *last_idle = st->last_idle;
if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
st->first_idle = entity;
if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
st->last_idle = entity;
bfq_insert(&st->idle, entity);
if (bfqq != NULL)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
* bfq_forget_entity - remove an entity from the wfq trees.
* @st: the service tree.
* @entity: the entity being removed.
*
* Update the device status and forget everything about @entity, putting
* the device reference to it, if it is a queue. Entities belonging to
* groups are not refcounted.
*/
static void bfq_forget_entity(struct bfq_service_tree *st,
struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct bfq_sched_data *sd;
BUG_ON(!entity->on_st);
entity->on_st = 0;
st->wsum -= entity->weight;
if (bfqq != NULL) {
sd = entity->sched_data;
bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
bfqq, atomic_read(&bfqq->ref));
bfq_put_queue(bfqq);
}
}
/**
* bfq_put_idle_entity - release the idle tree ref of an entity.
* @st: service tree for the entity.
* @entity: the entity being released.
*/
static void bfq_put_idle_entity(struct bfq_service_tree *st,
struct bfq_entity *entity)
{
bfq_idle_extract(st, entity);
bfq_forget_entity(st, entity);
}
/**
* bfq_forget_idle - update the idle tree if necessary.
* @st: the service tree to act upon.
*
* To preserve the global O(log N) complexity we only remove one entry here;
* as the idle tree will not grow indefinitely this can be done safely.
*/
static void bfq_forget_idle(struct bfq_service_tree *st)
{
struct bfq_entity *first_idle = st->first_idle;
struct bfq_entity *last_idle = st->last_idle;
if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
!bfq_gt(last_idle->finish, st->vtime)) {
/*
* Forget the whole idle tree, increasing the vtime past
* the last finish time of idle entities.
*/
st->vtime = last_idle->finish;
}
if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
struct bfq_entity *entity)
{
struct bfq_service_tree *new_st = old_st;
if (entity->ioprio_changed) {
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
unsigned short prev_weight, new_weight;
struct bfq_data *bfqd = NULL;
struct rb_root *root;
#ifdef CONFIG_CGROUP_BFQIO
struct bfq_sched_data *sd;
struct bfq_group *bfqg;
#endif
if (bfqq != NULL)
bfqd = bfqq->bfqd;
#ifdef CONFIG_CGROUP_BFQIO
else {
sd = entity->my_sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
BUG_ON(!bfqg);
bfqd = (struct bfq_data *)bfqg->bfqd;
BUG_ON(!bfqd);
}
#endif
BUG_ON(old_st->wsum < entity->weight);
old_st->wsum -= entity->weight;
if (entity->new_weight != entity->orig_weight) {
if (entity->new_weight < BFQ_MIN_WEIGHT ||
entity->new_weight > BFQ_MAX_WEIGHT) {
printk(KERN_CRIT "update_weight_prio: "
"new_weight %d\n",
entity->new_weight);
BUG();
}
entity->orig_weight = entity->new_weight;
entity->ioprio =
bfq_weight_to_ioprio(entity->orig_weight);
}
entity->ioprio_class = entity->new_ioprio_class;
entity->ioprio_changed = 0;
/*
 * NOTE: here we may be changing the weight too early; this
 * will cause unfairness. The correct approach
* would have required additional complexity to defer
* weight changes to the proper time instants (i.e.,
* when entity->finish <= old_st->vtime).
*/
new_st = bfq_entity_service_tree(entity);
prev_weight = entity->weight;
new_weight = entity->orig_weight *
(bfqq != NULL ? bfqq->wr_coeff : 1);
/*
* If the weight of the entity changes, remove the entity
* from its old weight counter (if there is a counter
* associated with the entity), and add it to the counter
* associated with its new weight.
*/
if (prev_weight != new_weight) {
root = bfqq ? &bfqd->queue_weights_tree :
&bfqd->group_weights_tree;
bfq_weights_tree_remove(bfqd, entity, root);
}
entity->weight = new_weight;
/*
* Add the entity to its weights tree only if it is
* not associated with a weight-raised queue.
*/
if (prev_weight != new_weight &&
(bfqq ? bfqq->wr_coeff == 1 : 1))
/* If we get here, root has been initialized. */
bfq_weights_tree_add(bfqd, entity, root);
new_st->wsum += entity->weight;
if (new_st != old_st)
entity->start = new_st->vtime;
}
return new_st;
}
/**
* bfq_bfqq_served - update the scheduler status after selection for
* service.
* @bfqq: the queue being served.
* @served: bytes to transfer.
*
* NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service. For
 * now, we keep it this way to better check consistency.
*/
static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
{
struct bfq_entity *entity = &bfqq->entity;
struct bfq_service_tree *st;
for_each_entity(entity) {
st = bfq_entity_service_tree(entity);
entity->service += served;
BUG_ON(entity->service > entity->budget);
BUG_ON(st->wsum == 0);
st->vtime += bfq_delta(served, st->wsum);
bfq_forget_idle(st);
}
bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
}
/**
* bfq_bfqq_charge_full_budget - set the service to the entity budget.
* @bfqq: the queue that needs a service update.
*
* When it's not possible to be fair in the service domain, because
* a queue is not consuming its budget fast enough (the meaning of
* fast depends on the timeout parameter), we charge it a full
* budget. In this way we should obtain a sort of time-domain
* fairness among all the seeky/slow queues.
*/
static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
bfq_bfqq_served(bfqq, entity->budget - entity->service);
}
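/*
 * Worked example with made-up values: a queue that got a
 * 16384-sector budget but dispatched only 2048 sectors before
 * expiring is charged the remaining 16384 - 2048 = 14336 sectors as
 * well, so its timestamps advance as if the whole budget had been
 * consumed.
 */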
/**
* __bfq_activate_entity - activate an entity.
* @entity: the entity being activated.
*
* Called whenever an entity is activated, i.e., it is not active and one
* of its children receives a new request, or has to be reactivated due to
 * budget exhaustion. It uses the entity's current budget (and the
 * service already received, if @entity is in service) to calculate
 * its timestamps.
*/
static void __bfq_activate_entity(struct bfq_entity *entity)
{
struct bfq_sched_data *sd = entity->sched_data;
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
if (entity == sd->in_service_entity) {
BUG_ON(entity->tree != NULL);
/*
 * If we are requeueing the current entity, we must take
 * care not to charge it for service it has not received.
*/
bfq_calc_finish(entity, entity->service);
entity->start = entity->finish;
sd->in_service_entity = NULL;
} else if (entity->tree == &st->active) {
/*
* Requeueing an entity due to a change of some
* next_in_service entity below it. We reuse the
* old start time.
*/
bfq_active_extract(st, entity);
} else if (entity->tree == &st->idle) {
/*
* Must be on the idle tree, bfq_idle_extract() will
* check for that.
*/
bfq_idle_extract(st, entity);
entity->start = bfq_gt(st->vtime, entity->finish) ?
st->vtime : entity->finish;
} else {
/*
 * The finish time of the entity may be invalid, and
 * it is certainly in the past; otherwise the queue
 * would have been on the idle tree.
*/
entity->start = st->vtime;
st->wsum += entity->weight;
bfq_get_entity(entity);
BUG_ON(entity->on_st);
entity->on_st = 1;
}
st = __bfq_entity_update_weight_prio(st, entity);
bfq_calc_finish(entity, entity->budget);
bfq_active_insert(st, entity);
}
/**
* bfq_activate_entity - activate an entity and its ancestors if necessary.
* @entity: the entity to activate.
*
* Activate @entity and all the entities on the path from it to the root.
*/
static void bfq_activate_entity(struct bfq_entity *entity)
{
struct bfq_sched_data *sd;
for_each_entity(entity) {
__bfq_activate_entity(entity);
sd = entity->sched_data;
if (!bfq_update_next_in_service(sd))
/*
* No need to propagate the activation to the
* upper entities, as they will be updated when
* the in-service entity is rescheduled.
*/
break;
}
}
/**
* __bfq_deactivate_entity - deactivate an entity from its service tree.
* @entity: the entity to deactivate.
* @requeue: if false, the entity will not be put into the idle tree.
*
 * Deactivate an entity, regardless of its previous state. If the
 * entity was not on a service tree, just return; otherwise extract
 * it from whatever scheduler tree it is on. If @requeue is set and
 * the entity's finish time is still in the future, put it on the
 * idle tree; otherwise forget it.
*
* Return %1 if the caller should update the entity hierarchy, i.e.,
* if the entity was in service or if it was the next_in_service for
* its sched_data; return %0 otherwise.
*/
static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
{
struct bfq_sched_data *sd = entity->sched_data;
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
int was_in_service = entity == sd->in_service_entity;
int ret = 0;
if (!entity->on_st)
return 0;
BUG_ON(was_in_service && entity->tree != NULL);
if (was_in_service) {
bfq_calc_finish(entity, entity->service);
sd->in_service_entity = NULL;
} else if (entity->tree == &st->active)
bfq_active_extract(st, entity);
else if (entity->tree == &st->idle)
bfq_idle_extract(st, entity);
else if (entity->tree != NULL)
BUG();
if (was_in_service || sd->next_in_service == entity)
ret = bfq_update_next_in_service(sd);
if (!requeue || !bfq_gt(entity->finish, st->vtime))
bfq_forget_entity(st, entity);
else
bfq_idle_insert(st, entity);
BUG_ON(sd->in_service_entity == entity);
BUG_ON(sd->next_in_service == entity);
return ret;
}
/**
* bfq_deactivate_entity - deactivate an entity.
* @entity: the entity to deactivate.
* @requeue: true if the entity can be put on the idle tree
*/
static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
{
struct bfq_sched_data *sd;
struct bfq_entity *parent;
for_each_entity_safe(entity, parent) {
sd = entity->sched_data;
if (!__bfq_deactivate_entity(entity, requeue))
/*
* The parent entity is still backlogged, and
* we don't need to update it as it is still
* in service.
*/
break;
if (sd->next_in_service != NULL)
/*
* The parent entity is still backlogged and
* the budgets on the path towards the root
* need to be updated.
*/
goto update;
/*
 * If we get here, the parent is no longer backlogged and
* we want to propagate the dequeue upwards.
*/
requeue = 1;
}
return;
update:
entity = parent;
for_each_entity(entity) {
__bfq_activate_entity(entity);
sd = entity->sched_data;
if (!bfq_update_next_in_service(sd))
break;
}
}
/**
* bfq_update_vtime - update vtime if necessary.
* @st: the service tree to act upon.
*
* If necessary update the service tree vtime to have at least one
* eligible entity, skipping to its start time. Assumes that the
* active tree of the device is not empty.
*
* NOTE: this hierarchical implementation updates vtimes quite often,
* we may end up with reactivated processes getting timestamps after a
* vtime skip done because we needed a ->first_active entity on some
* intermediate node.
*/
static void bfq_update_vtime(struct bfq_service_tree *st)
{
struct bfq_entity *entry;
struct rb_node *node = st->active.rb_node;
entry = rb_entry(node, struct bfq_entity, rb_node);
if (bfq_gt(entry->min_start, st->vtime)) {
st->vtime = entry->min_start;
bfq_forget_idle(st);
}
}
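/*
 * Example: if st->vtime == 4000 while the smallest start time in the
 * active tree (the root's min_start) is 5000, no entity is eligible;
 * the vtime is then skipped forward to 5000, making the entity that
 * starts at that instant schedulable.
 */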
/**
* bfq_first_active_entity - find the eligible entity with
* the smallest finish time
* @st: the service tree to select from.
*
 * This function searches for the first schedulable entity, starting from
 * the root of the tree and going left whenever the left subtree contains
 * at least one eligible (start <= vtime) entity. The path on
* the right is followed only if a) the left subtree contains no eligible
* entities and b) no eligible entity has been found yet.
*/
static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
{
struct bfq_entity *entry, *first = NULL;
struct rb_node *node = st->active.rb_node;
while (node != NULL) {
entry = rb_entry(node, struct bfq_entity, rb_node);
left:
if (!bfq_gt(entry->start, st->vtime))
first = entry;
BUG_ON(bfq_gt(entry->min_start, st->vtime));
if (node->rb_left != NULL) {
entry = rb_entry(node->rb_left,
struct bfq_entity, rb_node);
if (!bfq_gt(entry->min_start, st->vtime)) {
node = node->rb_left;
goto left;
}
}
if (first != NULL)
break;
node = node->rb_right;
}
BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
return first;
}
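/*
 * Example run, with nodes annotated "start/min_start" (the tree
 * itself is ordered by finish time) and st->vtime == 10:
 *
 *                 12/3
 *                /    \
 *             3/3      8/8
 *
 * The root is not eligible (start 12 > 10), but its left subtree
 * advertises min_start 3 <= 10, so the search descends left; there
 * the entity is eligible (start 3 <= 10) and, being leftmost, has
 * the smallest finish time among eligible entities, so it is
 * returned.
 */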
/**
* __bfq_lookup_next_entity - return the first eligible entity in @st.
 * @st: the service tree.
 * @force: if %true and the looked-up entity differs from the cached
 * next_in_service, propagate the budget update up the hierarchy.
*
* Update the virtual time in @st and return the first eligible entity
* it contains.
*/
static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
bool force)
{
struct bfq_entity *entity, *new_next_in_service = NULL;
if (RB_EMPTY_ROOT(&st->active))
return NULL;
bfq_update_vtime(st);
entity = bfq_first_active_entity(st);
BUG_ON(bfq_gt(entity->start, st->vtime));
/*
 * If the chosen entity does not match the sched_data's
 * next_in_service and we are forcing service of the IDLE priority
 * class tree, bubble the budget update up the hierarchy.
*/
if (unlikely(force && entity != entity->sched_data->next_in_service)) {
new_next_in_service = entity;
for_each_entity(new_next_in_service)
bfq_update_budget(new_next_in_service);
}
return entity;
}
/**
* bfq_lookup_next_entity - return the first eligible entity in @sd.
* @sd: the sched_data.
 * @extract: if true, the returned entity will also be extracted from @sd.
 * @bfqd: the device data; if not %NULL, it may be used to force service
 * of the idle class when that class has been starved for too long.
 *
 * NOTE: since we cache the next_in_service entity at each level of the
 * hierarchy, the complexity of the lookup can be decreased with
 * absolutely no effort by just returning the cached next_in_service
 * value; we prefer to do full lookups to test the consistency of the
 * data structures.
*/
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
int extract,
struct bfq_data *bfqd)
{
struct bfq_service_tree *st = sd->service_tree;
struct bfq_entity *entity;
int i = 0;
BUG_ON(sd->in_service_entity != NULL);
if (bfqd != NULL &&
jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
true);
if (entity != NULL) {
i = BFQ_IOPRIO_CLASSES - 1;
bfqd->bfq_class_idle_last_service = jiffies;
sd->next_in_service = entity;
}
}
for (; i < BFQ_IOPRIO_CLASSES; i++) {
entity = __bfq_lookup_next_entity(st + i, false);
if (entity != NULL) {
if (extract) {
/*
 * The cached next_in_service entity may have become
 * inconsistent with the entity found by the full
 * lookup; in that case (which used to trigger a BUG
 * in bfq_check_next_in_service()), redo the lookup
 * forcing a budget update.
 */
if (sd->next_in_service != entity) {
entity = __bfq_lookup_next_entity(st + i, true);
}
bfq_check_next_in_service(sd, entity);
bfq_active_extract(st + i, entity);
sd->in_service_entity = entity;
sd->next_in_service = NULL;
}
break;
}
}
return entity;
}
/*
* Get next queue for service.
*/
static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
{
struct bfq_entity *entity = NULL;
struct bfq_sched_data *sd;
struct bfq_queue *bfqq;
BUG_ON(bfqd->in_service_queue != NULL);
if (bfqd->busy_queues == 0)
return NULL;
sd = &bfqd->root_group->sched_data;
for (; sd != NULL; sd = entity->my_sched_data) {
entity = bfq_lookup_next_entity(sd, 1, bfqd);
BUG_ON(entity == NULL);
entity->service = 0;
}
bfqq = bfq_entity_to_bfqq(entity);
BUG_ON(bfqq == NULL);
return bfqq;
}
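/*
 * The descent above stops at a leaf: group entities have a non-NULL
 * my_sched_data to recurse into, while a bfq_queue entity has
 * my_sched_data == NULL. With one level of cgroups, for instance,
 * the walk is: root sched_data -> group entity -> group sched_data
 * -> queue entity, resetting entity->service at every level.
 */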
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
if (bfqd->in_service_bic != NULL) {
put_io_context(bfqd->in_service_bic->icq.ioc);
bfqd->in_service_bic = NULL;
}
bfqd->in_service_queue = NULL;
del_timer(&bfqd->idle_slice_timer);
}
static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
int requeue)
{
struct bfq_entity *entity = &bfqq->entity;
if (bfqq == bfqd->in_service_queue)
__bfq_bfqd_reset_in_service(bfqd);
bfq_deactivate_entity(entity, requeue);
}
static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
bfq_activate_entity(entity);
}
/*
 * Called when the bfqq no longer has requests pending; remove it from
* the service tree.
*/
static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
int requeue)
{
BUG_ON(!bfq_bfqq_busy(bfqq));
BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
bfq_log_bfqq(bfqd, bfqq, "del from busy");
bfq_clear_bfqq_busy(bfqq);
BUG_ON(bfqd->busy_queues == 0);
bfqd->busy_queues--;
if (!bfqq->dispatched) {
bfq_weights_tree_remove(bfqd, &bfqq->entity,
&bfqd->queue_weights_tree);
if (!blk_queue_nonrot(bfqd->queue)) {
BUG_ON(!bfqd->busy_in_flight_queues);
bfqd->busy_in_flight_queues--;
if (bfq_bfqq_constantly_seeky(bfqq)) {
BUG_ON(!bfqd->
const_seeky_busy_in_flight_queues);
bfqd->const_seeky_busy_in_flight_queues--;
}
}
}
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
bfq_deactivate_bfqq(bfqd, bfqq, requeue);
}
/*
* Called when an inactive queue receives a new request.
*/
static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
BUG_ON(bfq_bfqq_busy(bfqq));
BUG_ON(bfqq == bfqd->in_service_queue);
bfq_log_bfqq(bfqd, bfqq, "add to busy");
bfq_activate_bfqq(bfqd, bfqq);
bfq_mark_bfqq_busy(bfqq);
bfqd->busy_queues++;
if (!bfqq->dispatched) {
if (bfqq->wr_coeff == 1)
bfq_weights_tree_add(bfqd, &bfqq->entity,
&bfqd->queue_weights_tree);
if (!blk_queue_nonrot(bfqd->queue)) {
bfqd->busy_in_flight_queues++;
if (bfq_bfqq_constantly_seeky(bfqq))
bfqd->const_seeky_busy_in_flight_queues++;
}
}
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues++;
}