TPPS I/O scheduler, modified to run on the Linux 4.x kernel (tested on kernel 4.0-rc7 under QEMU).
/*
 * TPPS, or Tiny Parallel Proportion disk Scheduler.
 *
 * Based on ideas from Zhu Yanhai <gaoyang.zyh@taobao.com>
 *
 * Copyright (C) 2013 Robin Dong <sanbai@taobao.com>
 */
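/*
 * Usage sketch (editorial addition, not from the original gist): once the
 * module is loaded, the scheduler is selected per device and weighted per
 * blkio cgroup, along these lines:
 *
 *   echo tpps > /sys/block/sda/queue/scheduler
 *   echo 1000 > /sys/fs/cgroup/blkio/grp_a/blkio.tpps.weight
 *
 * Paths assume a v1 blkio hierarchy mounted at /sys/fs/cgroup/blkio and a
 * device named sda; adjust for the target system.
 */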
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

static struct kmem_cache *tpps_pool;

struct tpps_queue {
	/* reference count */
	int ref;
	/* parent tpps_data */
	struct tpps_data *tppd;
	/* tpps_group member */
	struct list_head tppg_node;
	/* sorted list of pending requests */
	struct list_head sort_list;
	struct tpps_group *tppg;
	pid_t pid;
	int online;
	int rq_queued;
};

struct tppg_stats {
	/* total bytes transferred */
	struct blkg_rwstat service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat serviced;
	/* number of IOs merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total sectors transferred */
	struct blkg_stat sectors;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
};

struct tpps_group {
	struct blkg_policy_data pd;

	/* tpps_data member */
	struct list_head tppd_node;
	struct list_head *cur_dispatcher;

	unsigned int weight;
	unsigned int new_weight;
	unsigned int dev_weight;

	unsigned int leaf_weight;
	unsigned int new_leaf_weight;
	unsigned int dev_leaf_weight;

	bool needs_update;

	/*
	 * lists of queues with requests.
	 */
	struct list_head queue_list;
	int nr_tppq;
	int rq_queued;
	int rq_in_driver;

	struct tppg_stats stats;	/* stats for this tppg */
	struct tppg_stats dead_stats;	/* stats pushed from dead children */
};

struct tpps_io_cq {
	struct io_cq icq;		/* must be the first member */
	struct tpps_queue *tppq;
	uint64_t blkcg_serial_nr;	/* the current blkcg ID */
};

struct tpps_data {
	struct request_queue *queue;
	struct tpps_group *root_group;

	/* List of tpps groups being managed on this device */
	struct list_head group_list;

	unsigned int busy_queues;
	int dispatched;
	int rq_in_driver;

	struct work_struct unplug_work;

	/* Number of groups which are on blkcg->blkg_list */
	unsigned int nr_blkcg_linked_grps;

	unsigned int total_weight;
};

static inline struct blkcg_gq *tppg_to_blkg(struct tpps_group *tppg)
{
	return pd_to_blkg(&tppg->pd);
}

#define tpps_log_tppq(tppd, tppq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(tppg_to_blkg((tppq)->tppg), __pbuf, sizeof(__pbuf));	\
	blk_add_trace_msg((tppd)->queue, "tpps%d %s " fmt, (tppq)->pid,	\
			  __pbuf, ##args);				\
} while (0)

#define tpps_log_tppg(tppd, tppg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(tppg_to_blkg(tppg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((tppd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#define tpps_log(tppd, fmt, args...)	\
	blk_add_trace_msg((tppd)->queue, "tpps " fmt, ##args)

static inline struct tpps_io_cq *icq_to_tic(struct io_cq *icq)
{
	/* tic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct tpps_io_cq, icq);
}

#define RQ_TIC(rq)	icq_to_tic((rq)->elv.icq)
#define RQ_TPPQ(rq)	(struct tpps_queue *)((rq)->elv.priv[0])
#define RQ_TPPG(rq)	(struct tpps_group *)((rq)->elv.priv[1])

#define TPPS_WEIGHT_DEFAULT	(500)
#define MIN_DISPATCH_RQ		(8)
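/*
 * Editorial note: tpps_dispatch_requests() below computes its per-round
 * quota as the free request slots, q->nr_requests - rq_in_driver, and a
 * non-forced round is skipped entirely while fewer than MIN_DISPATCH_RQ
 * slots are free, so dispatching proceeds in reasonably sized batches.
 */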
static struct blkcg_policy blkcg_policy_tpps;

static inline struct tpps_group *pd_to_tppg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct tpps_group, pd) : NULL;
}

static inline struct tpps_group *blkg_to_tppg(struct blkcg_gq *blkg)
{
	return pd_to_tppg(blkg_to_pd(blkg, &blkcg_policy_tpps));
}

static inline struct tpps_io_cq *
tpps_tic_lookup(struct tpps_data *tppd, struct io_context *ioc)
{
	if (ioc)
		return icq_to_tic(ioc_lookup_icq(ioc, tppd->queue));
	return NULL;
}

static inline struct tpps_queue *tic_to_tppq(struct tpps_io_cq *tic)
{
	return tic->tppq;
}

static inline void tic_set_tppq(struct tpps_io_cq *tic, struct tpps_queue *tppq)
{
	tic->tppq = tppq;
}

static inline struct tpps_data *tic_to_tppd(struct tpps_io_cq *tic)
{
	return tic->icq.q->elevator->elevator_data;
}

static inline void tppg_get(struct tpps_group *tppg)
{
	return blkg_get(tppg_to_blkg(tppg));
}

static inline void tppg_put(struct tpps_group *tppg)
{
	return blkg_put(tppg_to_blkg(tppg));
}

static inline void tppg_stats_update_io_add(struct tpps_group *tppg,
					    struct tpps_group *curr_tppg, int rw)
{
	blkg_rwstat_add(&tppg->stats.queued, rw, 1);
}

static inline void tppg_stats_update_io_remove(struct tpps_group *tppg, int rw)
{
	blkg_rwstat_add(&tppg->stats.queued, rw, -1);
}

static inline void tppg_stats_update_io_merged(struct tpps_group *tppg, int rw)
{
	blkg_rwstat_add(&tppg->stats.merged, rw, 1);
}

static inline void tppg_stats_update_dispatch(struct tpps_group *tppg,
					      uint64_t bytes, int rw)
{
	blkg_stat_add(&tppg->stats.sectors, bytes >> 9);
	blkg_rwstat_add(&tppg->stats.serviced, rw, 1);
	blkg_rwstat_add(&tppg->stats.service_bytes, rw, bytes);
}

static inline void tppg_stats_update_completion(struct tpps_group *tppg,
			uint64_t start_time, uint64_t io_start_time, int rw)
{
	struct tppg_stats *stats = &tppg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, rw,
				io_start_time - start_time);
}

static void tpps_del_queue(struct tpps_queue *tppq)
{
	struct tpps_data *tppd = tppq->tppd;
	struct tpps_group *tppg = tppq->tppg;

	if (!list_empty(&tppq->tppg_node)) {
		list_del_init(&tppq->tppg_node);
		tpps_log_tppq(tppd, tppq, "del queue\n");
		tppg->cur_dispatcher = NULL;
		tppq->tppg = NULL;
	}

	printk("%p nr_tppq:%d\n", tppg, tppg->nr_tppq);
	BUG_ON(tppg->nr_tppq < 1);
	tppg->nr_tppq--;
	if (!tppg->nr_tppq)
		tppd->total_weight -= tppg->pd.blkg->blkcg->cfq_weight;

	BUG_ON(!tppd->busy_queues);
	tppd->busy_queues--;
}

/*
 * task holds one reference to the queue, dropped when task exits. each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Each tpps queue took a reference on the parent group. Drop it now.
 * queue lock must be held here.
 */
static void tpps_put_queue(struct tpps_queue *tppq)
{
	struct tpps_data *tppd = tppq->tppd;
	struct tpps_group *tppg;

	BUG_ON(tppq->ref <= 0);

	tppq->ref--;
	if (tppq->ref)
		return;

	tpps_log_tppq(tppd, tppq, "put_queue");
	BUG_ON(!list_empty(&tppq->sort_list));
	tppg = tppq->tppg;

	tpps_del_queue(tppq);
	kmem_cache_free(tpps_pool, tppq);
	tppg_put(tppg);
}
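/*
 * Editorial summary of the reference counting: tpps_set_request() takes one
 * reference for the io_cq the first time a task touches the queue (via
 * tpps_get_queue()) plus one per request; tpps_put_request() drops the
 * per-request reference and tpps_exit_icq() drops the io_cq one. The queue
 * is freed in tpps_put_queue() above once the count reaches zero, which in
 * turn releases the queue's reference on its parent group.
 */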
static void tpps_init_tppq(struct tpps_data *tppd, struct tpps_queue *tppq,
			   pid_t pid)
{
	INIT_LIST_HEAD(&tppq->tppg_node);
	INIT_LIST_HEAD(&tppq->sort_list);
	tppq->ref = 0;
	tppq->tppd = tppd;
	tppq->pid = pid;
}

static void tpps_link_tppq_tppg(struct tpps_queue *tppq,
				struct tpps_group *tppg)
{
	tppq->tppg = tppg;
	/* tppq reference on tppg */
	tppg_get(tppg);
}

static struct tpps_group *tpps_lookup_create_tppg(struct tpps_data *tppd,
						  struct blkcg *blkcg)
{
	struct request_queue *q = tppd->queue;
	struct tpps_group *tppg = NULL;

	/* avoid lookup for the common case where there's no blkcg */
	if (blkcg == &blkcg_root) {
		tppg = tppd->root_group;
	} else {
		struct blkcg_gq *blkg;

		blkg = blkg_lookup_create(blkcg, q);
		if (!IS_ERR(blkg))
			tppg = blkg_to_tppg(blkg);
	}

	return tppg;
}

static struct tpps_queue *
tpps_find_alloc_queue(struct tpps_data *tppd, struct tpps_io_cq *tic,
		      struct bio *bio, gfp_t gfp_mask)
{
	struct tpps_queue *tppq, *new_tppq = NULL;
	struct tpps_group *tppg;
	struct blkcg *blkcg;

retry:
	rcu_read_lock();

	blkcg = bio_blkcg(bio);
	tppg = tpps_lookup_create_tppg(tppd, blkcg);
	tppq = tic_to_tppq(tic);

	if (!tppq) {
		if (new_tppq) {
			tppq = new_tppq;
			new_tppq = NULL;
		} else if (gfp_mask & __GFP_WAIT) {
			rcu_read_unlock();
			spin_unlock_irq(tppd->queue->queue_lock);
			new_tppq = kmem_cache_alloc_node(tpps_pool,
							 gfp_mask | __GFP_ZERO,
							 tppd->queue->node);
			spin_lock_irq(tppd->queue->queue_lock);
			if (new_tppq)
				goto retry;
		} else
			tppq = kmem_cache_alloc_node(tpps_pool,
						     gfp_mask | __GFP_ZERO,
						     tppd->queue->node);

		if (tppq) {
			tpps_init_tppq(tppd, tppq, current->pid);
			tpps_link_tppq_tppg(tppq, tppg);
			tpps_log_tppq(tppd, tppq, "alloced");
		}
	}

	if (new_tppq)
		kmem_cache_free(tpps_pool, new_tppq);

	rcu_read_unlock();
	return tppq;
}
static struct tpps_queue *
tpps_get_queue(struct tpps_data *tppd, struct tpps_io_cq *tic, struct bio *bio,
	       gfp_t gfp_mask)
{
	struct tpps_queue *tppq;

	tppq = tpps_find_alloc_queue(tppd, tic, bio, gfp_mask);
	/* allocation can fail; let the caller bail out instead of oopsing */
	if (tppq)
		tppq->ref++;
	return tppq;
}
/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void tpps_schedule_dispatch(struct tpps_data *tppd)
{
	if (tppd->busy_queues) {
		tpps_log(tppd, "schedule dispatch");
		kblockd_schedule_work(&tppd->unplug_work);
	}
}

static void check_blkcg_changed(struct tpps_io_cq *tic, struct bio *bio)
{
	struct tpps_data *tppd = tic_to_tppd(tic);
	struct tpps_queue *tppq;
	uint64_t serial_nr;

	rcu_read_lock();
	serial_nr = bio_blkcg(bio)->css.serial_nr;
	rcu_read_unlock();

	/*
	 * Check whether blkcg has changed.  The condition may trigger
	 * spuriously on a newly created tic but there's no harm.
	 */
	if (unlikely(!tppd) || likely(tic->blkcg_serial_nr == serial_nr))
		return;

	tppq = tic_to_tppq(tic);
	if (tppq) {
		/*
		 * Drop reference to sync queue. A new sync queue will be
		 * assigned in new group upon arrival of a fresh request.
		 */
		tpps_log_tppq(tppd, tppq, "changed cgroup");
		tic_set_tppq(tic, NULL);
		tpps_put_queue(tppq);
	}

	tic->blkcg_serial_nr = serial_nr;
}
static int
tpps_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
		 gfp_t gfp_mask)
{
	struct tpps_data *tppd = q->elevator->elevator_data;
	struct tpps_io_cq *tic = icq_to_tic(rq->elv.icq);
	struct tpps_queue *tppq;

	might_sleep_if(gfp_mask & __GFP_WAIT);

	spin_lock_irq(q->queue_lock);

	check_blkcg_changed(tic, bio);
	tppq = tic_to_tppq(tic);
	if (!tppq) {
		tppq = tpps_get_queue(tppd, tic, bio, gfp_mask);
		if (!tppq) {
			/* queue allocation failed; report failure */
			spin_unlock_irq(q->queue_lock);
			return 1;
		}
		tic_set_tppq(tic, tppq);
	}

	tppq->ref++;
	tppg_get(tppq->tppg);
	rq->elv.priv[0] = tppq;
	rq->elv.priv[1] = tppq->tppg;
	spin_unlock_irq(q->queue_lock);
	return 0;
}
/*
 * queue lock held here
 */
static void tpps_put_request(struct request *rq)
{
	struct tpps_queue *tppq = RQ_TPPQ(rq);

	if (tppq) {
		WARN_ON(tppq->tppg != RQ_TPPG(rq));

		/* Put down rq reference on tppg */
		tppg_put(RQ_TPPG(rq));
		rq->elv.priv[0] = NULL;
		rq->elv.priv[1] = NULL;

		tpps_put_queue(tppq);
	}
}

static void
tpps_update_group_weight(struct tpps_group *tppg)
{
	if (tppg->needs_update) {
		tppg->weight = tppg->new_weight;
		tppg->needs_update = false;
	}
}

static void tpps_add_queue(struct tpps_data *tppd, struct tpps_queue *tppq)
{
	struct tpps_group *tppg;

	if (!tppq->online) {
		tppq->online = 1;
		tppg = tppq->tppg;
		tpps_log_tppq(tppd, tppq, "add queue");
		tppg->nr_tppq++;
		tppd->busy_queues++;
		list_add(&tppq->tppg_node, &tppg->queue_list);
		printk("add tppq %p to %p\n", tppq, tppg);
		tpps_update_group_weight(tppg);
		if (tppg->nr_tppq <= 1) {
			tppd->total_weight += tppg->pd.blkg->blkcg->cfq_weight;
			list_add(&tppg->tppd_node, &tppd->group_list);
			printk("twt:%u, wt:%u %u %d %p\n",
			       tppd->total_weight, tppg->weight,
			       tppg->pd.blkg->blkcg->cfq_weight,
			       tppg->nr_tppq, tppg);
		}
	}
}

static void tpps_insert_request(struct request_queue *q, struct request *rq)
{
	struct tpps_data *tppd = q->elevator->elevator_data;
	struct tpps_queue *tppq = RQ_TPPQ(rq);

	tpps_log_tppq(tppd, tppq, "insert_request");

	list_add_tail(&rq->queuelist, &tppq->sort_list);
	tppq->rq_queued++;
	tppq->tppg->rq_queued++;
	tppd->dispatched++;
	tpps_add_queue(tppd, tppq);
	tppg_stats_update_io_add(RQ_TPPG(rq), tppq->tppg, rq->cmd_flags);
}

static void tpps_remove_request(struct request *rq)
{
	struct tpps_queue *tppq = RQ_TPPQ(rq);

	list_del_init(&rq->queuelist);
	tppq->rq_queued--;
	tppq->tppg->rq_queued--;
	tppg_stats_update_io_remove(RQ_TPPG(rq), rq->cmd_flags);
}

/*
 * Move request from internal lists to the request queue dispatch list.
 */
static int tpps_dispatch_insert(struct request_queue *q,
				struct tpps_queue *tppq)
{
	struct list_head *next = tppq->sort_list.next;
	struct request *rq;

	if (next == &tppq->sort_list)
		return 0;

	rq = rq_entry_fifo(next);
	tpps_remove_request(rq);
	elv_dispatch_sort(q, rq);
	tppg_stats_update_dispatch(tppq->tppg, blk_rq_bytes(rq), rq->cmd_flags);
	return 1;
}
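/*
 * Editorial note: tpps_insert_request() adds at the tail of sort_list and
 * the function above always pops the head, so each tppq is served strictly
 * FIFO; elv_dispatch_sort() then places the request sector-sorted on the
 * device's dispatch list.
 */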
static int tpps_dispatch_requests_nr(struct tpps_data *tppd,
				     struct tpps_queue *tppq, int count)
{
	int cnt = 0, ret;

	if (!tppq->rq_queued)
		return cnt;

	do {
		ret = tpps_dispatch_insert(tppd->queue, tppq);
		if (ret) {
			cnt++;
			tppd->dispatched--;
		}
	} while (ret && cnt < count);

	return cnt;
}

static int tpps_dispatch_requests(struct request_queue *q, int force)
{
	struct tpps_data *tppd = q->elevator->elevator_data;
	struct tpps_group *tppg, *group_n;
	struct tpps_queue *tppq;
	struct list_head *next;
	int count = 0, total = 0, ret;
	int quota, grp_quota;

	if (!tppd->total_weight)
		return 0;

	quota = q->nr_requests - tppd->rq_in_driver;
	if (quota < MIN_DISPATCH_RQ && !force)
		return 0;

	list_for_each_entry_safe(tppg, group_n, &tppd->group_list, tppd_node) {
		if (!tppg->nr_tppq)
			continue;
		grp_quota = (quota * tppg->pd.blkg->blkcg->cfq_weight /
			     tppd->total_weight) - tppg->rq_in_driver;
		tpps_log_tppg(tppd, tppg,
			      "nr:%d, wt:%u total_wt:%u in_driver:%d %d quota:%d grp_quota:%d",
			      tppg->nr_tppq, tppg->pd.blkg->blkcg->cfq_weight,
			      tppd->total_weight, tppg->rq_in_driver,
			      tppg->rq_queued, quota, grp_quota);
		if (grp_quota <= 0 && !force)
			continue;
		BUG_ON(tppg->queue_list.next == &tppg->queue_list);
		if (!tppg->cur_dispatcher)
			tppg->cur_dispatcher = tppg->queue_list.next;
		next = tppg->cur_dispatcher;
		count = 0;
		do {
			tppq = list_entry(next, struct tpps_queue, tppg_node);
			tpps_log_tppq(tppd, tppq, "tppq: %d\n", tppq->rq_queued);
			if (force)
				ret = tpps_dispatch_requests_nr(tppd, tppq, -1);
			else
				ret = tpps_dispatch_requests_nr(tppd, tppq, 1);
			count += ret;
			total += ret;
			next = next->next;
			if (next == &tppg->queue_list)
				next = tppg->queue_list.next;
			if (count >= grp_quota && !force) {
				tppg->cur_dispatcher = next;
				break;
			}
			BUG_ON(tppg->cur_dispatcher == &tppg->queue_list);
		} while (next != tppg->cur_dispatcher);
	}

	return total > 0;
}
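/*
 * Worked example (illustrative numbers, not from the original source): with
 * q->nr_requests = 128, no requests in the driver and two busy groups whose
 * blkcg cfq_weights are 500 and 1000 (total_weight = 1500), one round grants
 *
 *   grp_quota(A) = 128 *  500 / 1500 = 42 requests
 *   grp_quota(B) = 128 * 1000 / 1500 = 85 requests
 *
 * each reduced by the group's requests already in the driver. Queues within
 * a group are then walked round-robin via cur_dispatcher, one request per
 * queue per pass, until the group quota is consumed.
 */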
static void tpps_kick_queue(struct work_struct *work)
{
	struct tpps_data *tppd =
		container_of(work, struct tpps_data, unplug_work);
	struct request_queue *q = tppd->queue;

	spin_lock_irq(q->queue_lock);
	__blk_run_queue(q);
	spin_unlock_irq(q->queue_lock);
}

static void tpps_init_tppg_base(struct tpps_group *tppg)
{
	INIT_LIST_HEAD(&tppg->tppd_node);
	INIT_LIST_HEAD(&tppg->queue_list);
	tppg->cur_dispatcher = NULL;
}

static int tpps_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct tpps_data *tppd;
	struct tpps_group *tppg;
	int ret;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	tppd = kzalloc_node(sizeof(*tppd), GFP_KERNEL, q->node);
	if (!tppd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = tppd;

	tppd->queue = q;
	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	INIT_LIST_HEAD(&tppd->group_list);

	ret = blkcg_activate_policy(q, &blkcg_policy_tpps);
	if (ret)
		goto out_free;

	/* Init root group */
	tppd->root_group = blkg_to_tppg(q->root_blkg);
	tppg = tppd->root_group;
	tpps_init_tppg_base(tppg);

	/* Give preference to root group over other groups */
	tppg->weight = 2 * TPPS_WEIGHT_DEFAULT;
	tppg->leaf_weight = 2 * TPPS_WEIGHT_DEFAULT;

	INIT_WORK(&tppd->unplug_work, tpps_kick_queue);
	return 0;

out_free:
	kfree(tppd);
	kobject_put(&eq->kobj);
	return ret;
}
static void tpps_exit_queue(struct elevator_queue *e)
{
	struct tpps_data *tppd = e->elevator_data;
	struct request_queue *q = tppd->queue;

	cancel_work_sync(&tppd->unplug_work);
	blkcg_deactivate_policy(q, &blkcg_policy_tpps);
	/*
	 * root_group is policy data embedded in q->root_blkg and is already
	 * freed by blkcg_deactivate_policy() above; freeing it again here
	 * would be a double free.
	 */
	kfree(tppd);
}
static void tpps_activate_request(struct request_queue *q, struct request *rq)
{
	struct tpps_queue *tppq = RQ_TPPQ(rq);
	struct tpps_data *tppd = q->elevator->elevator_data;

	tppd->rq_in_driver++;
	tppq->tppg->rq_in_driver++;
	tpps_log_tppq(tppd, RQ_TPPQ(rq), "activate rq, drv=%d",
		      tppd->rq_in_driver);
}

static void tpps_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct tpps_queue *tppq = RQ_TPPQ(rq);
	struct tpps_data *tppd = q->elevator->elevator_data;

	WARN_ON(!tppd->rq_in_driver);
	tppd->rq_in_driver--;
	tppq->tppg->rq_in_driver--;
	tpps_log_tppq(tppd, RQ_TPPQ(rq), "deactivate rq, drv=%d",
		      tppd->rq_in_driver);
}

static void tpps_completed_request(struct request_queue *q, struct request *rq)
{
	struct tpps_queue *tppq = RQ_TPPQ(rq);
	struct tpps_data *tppd = tppq->tppd;

	WARN_ON(!tppq);
	WARN_ON(tppq->tppg != RQ_TPPG(rq));

	tpps_log_tppq(tppd, tppq, "complete rqnoidle %d",
		      !!(rq->cmd_flags & REQ_NOIDLE));

	WARN_ON(!tppd->rq_in_driver);
	tppd->rq_in_driver--;
	tppq->tppg->rq_in_driver--;
	tppg_stats_update_completion(tppq->tppg, rq_start_time_ns(rq),
				     rq_io_start_time_ns(rq), rq->cmd_flags);

	if (!tppd->rq_in_driver)
		tpps_schedule_dispatch(tppd);
}

static void
tpps_merged_request(struct request_queue *q, struct request *rq, int type)
{
	if (type == ELEVATOR_FRONT_MERGE) {
		struct tpps_queue *tppq = RQ_TPPQ(rq);

		list_del_init(&rq->queuelist);
		tppq->rq_queued--;
		tppg_stats_update_io_remove(RQ_TPPG(rq), rq->cmd_flags);
		list_add_tail(&rq->queuelist, &tppq->sort_list);
		tppq->rq_queued++;
		tppg_stats_update_io_add(RQ_TPPG(rq), tppq->tppg, rq->cmd_flags);
	}
}

static void
tpps_merged_requests(struct request_queue *q, struct request *rq,
		     struct request *next)
{
	tpps_remove_request(next);
	tppg_stats_update_io_merged(RQ_TPPG(rq), rq->cmd_flags);
}

static void tpps_init_icq(struct io_cq *icq)
{
}

static void tpps_exit_icq(struct io_cq *icq)
{
	struct tpps_io_cq *tic = icq_to_tic(icq);

	if (tic->tppq) {
		tpps_put_queue(tic->tppq);
		tic->tppq = NULL;
	}
}

static struct elevator_type iosched_tpps = {
	.ops = {
		.elevator_merged_fn =		tpps_merged_request,
		.elevator_merge_req_fn =	tpps_merged_requests,
		.elevator_dispatch_fn =		tpps_dispatch_requests,
		.elevator_add_req_fn =		tpps_insert_request,
		.elevator_activate_req_fn =	tpps_activate_request,
		.elevator_deactivate_req_fn =	tpps_deactivate_request,
		.elevator_completed_req_fn =	tpps_completed_request,
		.elevator_init_icq_fn =		tpps_init_icq,
		.elevator_exit_icq_fn =		tpps_exit_icq,
		.elevator_set_req_fn =		tpps_set_request,
		.elevator_put_req_fn =		tpps_put_request,
		.elevator_init_fn =		tpps_init_queue,
		.elevator_exit_fn =		tpps_exit_queue,
	},
	.icq_size	= sizeof(struct tpps_io_cq),
	.icq_align	= __alignof__(struct tpps_io_cq),
	.elevator_name	= "tpps",
	.elevator_owner	= THIS_MODULE,
};
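/*
 * Build sketch (assumption: built in-tree, since "blk.h" and "blk-cgroup.h"
 * are private block/ headers). A Makefile line and Kconfig entry mirroring
 * the other I/O schedulers would be needed; names here are illustrative:
 *
 *   obj-$(CONFIG_IOSCHED_TPPS) += tpps-iosched.o
 */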
static u64 tppg_prfill_weight_device(struct seq_file *sf,
				     struct blkg_policy_data *pd, int off)
{
	struct tpps_group *tppg = pd_to_tppg(pd);

	if (!tppg->dev_weight)
		return 0;
	return __blkg_prfill_u64(sf, pd, tppg->dev_weight);
}

static int tppg_print_weight_device(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  tppg_prfill_weight_device, &blkcg_policy_tpps,
			  0, false);
	return 0;
}

static u64 tppg_prfill_leaf_weight_device(struct seq_file *sf,
					  struct blkg_policy_data *pd, int off)
{
	struct tpps_group *tppg = pd_to_tppg(pd);

	if (!tppg->dev_leaf_weight)
		return 0;
	return __blkg_prfill_u64(sf, pd, tppg->dev_leaf_weight);
}

static int tppg_print_leaf_weight_device(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  tppg_prfill_leaf_weight_device, &blkcg_policy_tpps,
			  0, false);
	return 0;
}

static int tppg_print_weight(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
	return 0;
}

static int tppg_print_leaf_weight(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
	return 0;
}
static ssize_t __tppg_set_weight_device(struct kernfs_open_file *of,
					char *buf, size_t nbytes, loff_t off,
					bool is_leaf_weight)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	struct tpps_group *tppg;
	int ret;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_tpps, buf, &ctx);
	if (ret)
		return ret;

	ret = -EINVAL;
	tppg = blkg_to_tppg(ctx.blkg);
	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
		if (!is_leaf_weight) {
			tppg->dev_weight = ctx.v;
			tppg->new_weight = ctx.v ?: blkcg->cfq_weight;
		} else {
			tppg->dev_leaf_weight = ctx.v;
			tppg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
		}
		ret = 0;
	}

	blkg_conf_finish(&ctx);
	/* a kernfs write must consume the buffer on success */
	return ret ?: nbytes;
}
static ssize_t tppg_set_weight_device(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return __tppg_set_weight_device(of, buf, nbytes, off, false);
}

static ssize_t tppg_set_leaf_weight_device(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	return __tppg_set_weight_device(of, buf, nbytes, off, true);
}
static int __tpps_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
			     u64 val, bool is_leaf_weight)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;

	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
		return -EINVAL;

	spin_lock_irq(&blkcg->lock);

	if (!is_leaf_weight)
		blkcg->cfq_weight = val;
	else
		blkcg->cfq_leaf_weight = val;

	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		struct tpps_group *tppg = blkg_to_tppg(blkg);

		if (!tppg)
			continue;

		if (!is_leaf_weight) {
			if (!tppg->dev_weight)
				tppg->new_weight = blkcg->cfq_weight;
		} else {
			if (!tppg->dev_leaf_weight)
				tppg->new_leaf_weight = blkcg->cfq_leaf_weight;
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}
static int tpps_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
			   u64 val)
{
	return __tpps_set_weight(css, cft, val, false);
}

static int tpps_set_leaf_weight(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 val)
{
	return __tpps_set_weight(css, cft, val, true);
}
/* offset delta from tppg->stats to tppg->dead_stats */
static const int dead_stats_off_delta = offsetof(struct tpps_group, dead_stats) -
					offsetof(struct tpps_group, stats);

/* to be used by recursive prfill, sums live and dead rwstats recursively */
static struct blkg_rwstat tppg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
						       int off)
{
	struct blkg_rwstat a, b;

	a = blkg_rwstat_recursive_sum(pd, off);
	b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
	blkg_rwstat_merge(&a, &b);
	return a;
}

/* to be used by recursive prfill, sums live and dead stats recursively */
static u64 tppg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
{
	u64 sum = 0;

	sum += blkg_stat_recursive_sum(pd, off);
	sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
	return sum;
}

static int tppg_print_stat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &blkcg_policy_tpps, seq_cft(sf)->private, false);
	return 0;
}

static int tppg_print_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
			  &blkcg_policy_tpps, seq_cft(sf)->private, true);
	return 0;
}

static u64 tppg_prfill_stat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 sum = tppg_stat_pd_recursive_sum(pd, off);

	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 tppg_prfill_rwstat_recursive(struct seq_file *sf,
					struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat sum = tppg_rwstat_pd_recursive_sum(pd, off);

	return __blkg_prfill_rwstat(sf, pd, &sum);
}

static int tppg_print_stat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  tppg_prfill_stat_recursive, &blkcg_policy_tpps,
			  seq_cft(sf)->private, false);
	return 0;
}

static int tppg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  tppg_prfill_rwstat_recursive, &blkcg_policy_tpps,
			  seq_cft(sf)->private, true);
	return 0;
}
static struct cftype tpps_blkcg_files[] = {
	/* on root, weight is mapped to leaf_weight */
	{
		.name = "tpps.weight_device",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = tppg_print_leaf_weight_device,
		.write = tppg_set_leaf_weight_device,
		.max_write_len = 256,
	},
	{
		.name = "tpps.weight",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = tppg_print_leaf_weight,
		.write_u64 = tpps_set_leaf_weight,
	},
	/* no such mapping necessary for !roots */
	{
		.name = "tpps.weight_device",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = tppg_print_weight_device,
		.write = tppg_set_weight_device,
		.max_write_len = 256,
	},
	{
		.name = "tpps.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = tppg_print_weight,
		.write_u64 = tpps_set_weight,
	},
	{
		.name = "tpps.leaf_weight_device",
		.seq_show = tppg_print_leaf_weight_device,
		.write = tppg_set_leaf_weight_device,
		.max_write_len = 256,
	},
	{
		.name = "tpps.leaf_weight",
		.seq_show = tppg_print_leaf_weight,
		.write_u64 = tpps_set_leaf_weight,
	},
	/* statistics, covers only the tasks in the tppg */
	{
		.name = "tpps.time",
		.private = offsetof(struct tpps_group, stats.time),
		.seq_show = tppg_print_stat,
	},
	{
		.name = "tpps.sectors",
		.private = offsetof(struct tpps_group, stats.sectors),
		.seq_show = tppg_print_stat,
	},
	{
		.name = "tpps.io_service_bytes",
		.private = offsetof(struct tpps_group, stats.service_bytes),
		.seq_show = tppg_print_rwstat,
	},
	{
		.name = "tpps.io_serviced",
		.private = offsetof(struct tpps_group, stats.serviced),
		.seq_show = tppg_print_rwstat,
	},
	{
		.name = "tpps.io_service_time",
		.private = offsetof(struct tpps_group, stats.service_time),
		.seq_show = tppg_print_rwstat,
	},
	{
		.name = "tpps.io_wait_time",
		.private = offsetof(struct tpps_group, stats.wait_time),
		.seq_show = tppg_print_rwstat,
	},
	{
		.name = "tpps.io_merged",
		.private = offsetof(struct tpps_group, stats.merged),
		.seq_show = tppg_print_rwstat,
	},
	{
		.name = "tpps.io_queued",
		.private = offsetof(struct tpps_group, stats.queued),
		.seq_show = tppg_print_rwstat,
	},
	/* the same statistics which cover the tppg and its descendants */
	{
		.name = "tpps.time_recursive",
		.private = offsetof(struct tpps_group, stats.time),
		.seq_show = tppg_print_stat_recursive,
	},
	{
		.name = "tpps.sectors_recursive",
		.private = offsetof(struct tpps_group, stats.sectors),
		.seq_show = tppg_print_stat_recursive,
	},
	{
		.name = "tpps.io_service_bytes_recursive",
		.private = offsetof(struct tpps_group, stats.service_bytes),
		.seq_show = tppg_print_rwstat_recursive,
	},
	{
		.name = "tpps.io_serviced_recursive",
		.private = offsetof(struct tpps_group, stats.serviced),
		.seq_show = tppg_print_rwstat_recursive,
	},
	{
		.name = "tpps.io_service_time_recursive",
		.private = offsetof(struct tpps_group, stats.service_time),
		.seq_show = tppg_print_rwstat_recursive,
	},
	{
		.name = "tpps.io_wait_time_recursive",
		.private = offsetof(struct tpps_group, stats.wait_time),
		.seq_show = tppg_print_rwstat_recursive,
	},
	{
		.name = "tpps.io_merged_recursive",
		.private = offsetof(struct tpps_group, stats.merged),
		.seq_show = tppg_print_rwstat_recursive,
	},
	{
		.name = "tpps.io_queued_recursive",
		.private = offsetof(struct tpps_group, stats.queued),
		.seq_show = tppg_print_rwstat_recursive,
	},
	{ }	/* terminate */
};
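/*
 * Example (editorial, v1 blkio hierarchy assumed): the cftype table above
 * exposes per-cgroup knobs and statistics, e.g.
 *
 *   echo 800 > /sys/fs/cgroup/blkio/grp_a/blkio.tpps.weight
 *   echo "8:0 600" > /sys/fs/cgroup/blkio/grp_a/blkio.tpps.weight_device
 *   cat /sys/fs/cgroup/blkio/grp_a/blkio.tpps.io_serviced
 *
 * weight_device takes "major:minor weight" and overrides tpps.weight for
 * that device only (parsed by blkg_conf_prep() in
 * __tppg_set_weight_device() above).
 */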
static void tpps_pd_init(struct blkcg_gq *blkg)
{
	struct tpps_group *tppg = blkg_to_tppg(blkg);

	tpps_init_tppg_base(tppg);
	tppg->weight = blkg->blkcg->cfq_weight;
	tppg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
}

static inline struct tpps_group *tppg_parent(struct tpps_group *tppg)
{
	struct blkcg_gq *pblkg = tppg_to_blkg(tppg)->parent;

	return pblkg ? blkg_to_tppg(pblkg) : NULL;
}
static void tppg_stats_reset(struct tppg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->service_bytes);
	blkg_rwstat_reset(&stats->serviced);
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
	/*
	 * Unlike CFQ, struct tppg_stats carries no CONFIG_DEBUG_BLK_CGROUP
	 * counters (unaccounted_time, dequeue, etc.), so there is nothing
	 * further to reset here.
	 */
}
/* @to += @from */
static void tppg_stats_merge(struct tppg_stats *to, struct tppg_stats *from)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
	blkg_rwstat_merge(&to->serviced, &from->serviced);
	blkg_rwstat_merge(&to->merged, &from->merged);
	blkg_rwstat_merge(&to->service_time, &from->service_time);
	blkg_rwstat_merge(&to->wait_time, &from->wait_time);
	blkg_stat_merge(&to->time, &from->time);
	/* as in tppg_stats_reset(), there are no debug counters to merge */
}
static void tppg_stats_xfer_dead(struct tpps_group *tppg)
{
	struct tpps_group *parent = tppg_parent(tppg);

	lockdep_assert_held(tppg_to_blkg(tppg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	tppg_stats_merge(&parent->dead_stats, &tppg->stats);
	tppg_stats_merge(&parent->dead_stats, &tppg->dead_stats);
	tppg_stats_reset(&tppg->stats);
	tppg_stats_reset(&tppg->dead_stats);
}

static void tpps_pd_offline(struct blkcg_gq *blkg)
{
	struct tpps_group *tppg = blkg_to_tppg(blkg);

	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
	 * that they don't get lost.  If IOs complete after this point, the
	 * stats for them will be lost.  Oh well...
	 */
	tppg_stats_xfer_dead(tppg);

	if (!list_empty(&tppg->tppd_node))
		list_del_init(&tppg->tppd_node);
	/* BUG_ON(!list_empty(&tppg->queue_list)); */
}

static void tpps_pd_reset_stats(struct blkcg_gq *blkg)
{
	struct tpps_group *tppg = blkg_to_tppg(blkg);

	tppg_stats_reset(&tppg->stats);
	tppg_stats_reset(&tppg->dead_stats);
}

static struct blkcg_policy blkcg_policy_tpps = {
	.pd_size		= sizeof(struct tpps_group),
	.cftypes		= tpps_blkcg_files,
	.pd_init_fn		= tpps_pd_init,
	.pd_offline_fn		= tpps_pd_offline,
	.pd_reset_stats_fn	= tpps_pd_reset_stats,
};
/* MEMO: mirrors the __init setup in cfq-iosched.c */
static int __init tpps_init(void)
{
	int ret;

	ret = blkcg_policy_register(&blkcg_policy_tpps);
	if (ret)
		return ret;

	ret = -ENOMEM;
	tpps_pool = KMEM_CACHE(tpps_queue, 0);
	if (!tpps_pool)
		goto err_pol_unreg;

	ret = elv_register(&iosched_tpps);
	if (ret)
		goto err_free_pool;

	return 0;

err_free_pool:
	kmem_cache_destroy(tpps_pool);
err_pol_unreg:
	blkcg_policy_unregister(&blkcg_policy_tpps);
	return ret;
}

static void __exit tpps_exit(void)
{
	blkcg_policy_unregister(&blkcg_policy_tpps);
	elv_unregister(&iosched_tpps);
	kmem_cache_destroy(tpps_pool);
}

module_init(tpps_init);
module_exit(tpps_exit);

MODULE_AUTHOR("Robin Dong");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Tiny Parallel Proportion io Scheduler");