	blkcg: fix ref count issue with bio_blkcg using task_css
The accessor function bio_blkcg either returns the blkcg associated with the
bio or finds one in the current context. This can cause an issue when trying
to associate a bio with a blkcg. Particularly, it's the third case that is
problematic:

	return css_to_blkcg(task_css(current, io_cgrp_id));

As the above may race against task migration and the cgroup exiting, it is
not always ok to take a reference on the blkcg returned from bio_blkcg.

This patch adds association ahead of calling bio_blkcg rather than after.
This makes association a required and explicit step along the code paths for
calling bio_blkcg. blk_get_rl is modified as well to get a reference to the
blkcg it may use, and blk_put_rl will always put the reference back.
Association is also moved above the bio_blkcg call in blk-iolatency to ensure
it will not return NULL.

BFQ and CFQ rely on this flaw, but due to the complexity, I do not want to
address this in this series. I've created a private version of the function
with notes describing the flaw and warning not to use it. Hopefully soon,
that code can be cleaned up.

Signed-off-by: Dennis Zhou <dennisszhou@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
commit 27e6fa996c
parent 9ff01255a0
6 changed files with 108 additions and 17 deletions
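Before the diff, a minimal kernel-style sketch of the race described above and
of the css_tryget() retry pattern the patch introduces in blkcg_get_css().
This is illustrative only; racy_get_current_blkcg and safe_get_current_blkcg
are hypothetical helpers, not part of the patch, though the primitives they
use (task_css, css_get, css_tryget, cpu_relax) are real:

	/* Illustrative sketch only -- not part of the patch. */
	#include <linux/cgroup.h>
	#include <linux/blk-cgroup.h>

	/*
	 * Hypothetical helper showing the broken pattern: task_css() may
	 * return a css whose refcount has already hit zero because current
	 * migrated away and the old cgroup is dying, so a bare css_get()
	 * can "resurrect" a dead css.
	 */
	static struct blkcg *racy_get_current_blkcg(void)
	{
		struct cgroup_subsys_state *css;

		rcu_read_lock();
		css = task_css(current, io_cgrp_id);
		css_get(css);		/* BUG: refcount may already be 0 */
		rcu_read_unlock();
		return css_to_blkcg(css);
	}

	/*
	 * Hypothetical helper showing the safe pattern used by the new
	 * blkcg_get_css(): retry css_tryget() until task_css() observes
	 * the post-migration css.
	 */
	static struct blkcg *safe_get_current_blkcg(void)
	{
		struct cgroup_subsys_state *css;

		rcu_read_lock();
		while (true) {
			css = task_css(current, io_cgrp_id);
			if (likely(css_tryget(css)))
				break;
			cpu_relax();	/* wait for migration to propagate */
		}
		rcu_read_unlock();
		return css_to_blkcg(css);
	}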
block/bfq-cgroup.c
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 	uint64_t serial_nr;
 
 	rcu_read_lock();
-	serial_nr = bio_blkcg(bio)->css.serial_nr;
+	serial_nr = __bio_blkcg(bio)->css.serial_nr;
 
 	/*
 	 * Check whether blkcg has changed.  The condition may trigger
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
 		goto out;
 
-	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
 	/*
 	 * Update blkg_path for bfq_log_* functions. We cache this
 	 * path, and update it here, for the following
block/bfq-iosched.c
@@ -4359,7 +4359,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 
 	rcu_read_lock();
 
-	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+	bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
 	if (!bfqg) {
 		bfqq = &bfqd->oom_bfqq;
 		goto out;
block/bio.c
@@ -1988,13 +1988,19 @@ int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
  *
  * This function takes an extra reference of @blkcg_css which will be put
  * when @bio is released.  The caller must own @bio and is responsible for
- * synchronizing calls to this function.
+ * synchronizing calls to this function.  If @blkcg_css is NULL, a call to
+ * blkcg_get_css finds the current css from the kthread or task.
  */
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
 {
 	if (unlikely(bio->bi_css))
 		return -EBUSY;
-	css_get(blkcg_css);
+
+	if (blkcg_css)
+		css_get(blkcg_css);
+	else
+		blkcg_css = blkcg_get_css();
+
 	bio->bi_css = blkcg_css;
 	return 0;
 }
block/blk-iolatency.c
@@ -401,8 +401,8 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
 		return;
 
 	rcu_read_lock();
+	bio_associate_blkcg(bio, NULL);
 	blkcg = bio_blkcg(bio);
-	bio_associate_blkcg(bio, &blkcg->css);
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
 		if (!lock)
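The two hunks above establish the new calling convention: association happens
explicitly, before bio_blkcg() is read. A hedged sketch of a caller following
that convention (illustrative only; throttle_example is hypothetical, though
bio_associate_blkcg, bio_blkcg, and blkg_lookup are the functions this patch
touches):

	/* Illustrative sketch of the post-patch convention -- not from the patch. */
	static void throttle_example(struct request_queue *q, struct bio *bio)
	{
		struct blkcg *blkcg;
		struct blkcg_gq *blkg;

		rcu_read_lock();
		/*
		 * Associate first; NULL means "resolve and reference the
		 * current kthread/task css via blkcg_get_css()".
		 */
		bio_associate_blkcg(bio, NULL);
		/*
		 * bio_blkcg() now only reads bio->bi_css and returns NULL
		 * when no association was done, so the call above is
		 * mandatory here.
		 */
		blkcg = bio_blkcg(bio);
		blkg = blkg_lookup(blkcg, q);
		/* ... use blkg for throttling, as blk-iolatency does ... */
		rcu_read_unlock();
	}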
block/cfq-iosched.c
@@ -3753,7 +3753,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	uint64_t serial_nr;
 
 	rcu_read_lock();
-	serial_nr = bio_blkcg(bio)->css.serial_nr;
+	serial_nr = __bio_blkcg(bio)->css.serial_nr;
 	rcu_read_unlock();
 
 	/*
@@ -3818,7 +3818,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 	struct cfq_group *cfqg;
 
 	rcu_read_lock();
-	cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+	cfqg = cfq_lookup_cfqg(cfqd, __bio_blkcg(bio));
 	if (!cfqg) {
 		cfqq = &cfqd->oom_cfqq;
 		goto out;
include/linux/blk-cgroup.h
@@ -230,22 +230,100 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		   char *input, struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
+/**
+ * blkcg_css - find the current css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This may return a dying css, so it is up to the caller to use tryget logic
+ * to confirm it is alive and well.
+ */
+static inline struct cgroup_subsys_state *blkcg_css(void)
+{
+	struct cgroup_subsys_state *css;
+
+	css = kthread_blkcg();
+	if (css)
+		return css;
+	return task_css(current, io_cgrp_id);
+}
+
+/**
+ * blkcg_get_css - find and get a reference to the css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This takes a reference on the blkcg which will need to be managed by the
+ * caller.
+ */
+static inline struct cgroup_subsys_state *blkcg_get_css(void)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+
+	css = kthread_blkcg();
+	if (css) {
+		css_get(css);
+	} else {
+		/*
+		 * This is a bit complicated.  It is possible task_css is seeing
+		 * an old css pointer here.  This is caused by the current
+		 * thread migrating away from this cgroup and this cgroup dying.
+		 * css_tryget() will fail when trying to take a ref on a cgroup
+		 * whose ref count has hit 0.
+		 *
+		 * Therefore, if it does fail, this means current must have
+		 * been swapped away already and this is waiting for it to
+		 * propagate on the polling cpu.  Hence the use of cpu_relax().
+		 */
+		while (true) {
+			css = task_css(current, io_cgrp_id);
+			if (likely(css_tryget(css)))
+				break;
+			cpu_relax();
+		}
+	}
+
+	rcu_read_unlock();
+
+	return css;
+}
 
 static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct blkcg, css) : NULL;
 }
 
-static inline struct blkcg *bio_blkcg(struct bio *bio)
+/**
+ * __bio_blkcg - internal version of bio_blkcg for bfq and cfq
+ *
+ * DO NOT USE.
+ * There is a flaw using this version of the function.  In particular, this was
+ * used in a broken paradigm where association was called on the given css.  It
+ * is possible though that the returned css from task_css() is in the process
+ * of dying due to migration of the current task.  So it is improper to assume
+ * *_get() is going to succeed.  Both BFQ and CFQ rely on this logic and will
+ * take additional work to handle more gracefully.
+ */
+static inline struct blkcg *__bio_blkcg(struct bio *bio)
 {
-	struct cgroup_subsys_state *css;
-
 	if (bio && bio->bi_css)
 		return css_to_blkcg(bio->bi_css);
-	css = kthread_blkcg();
-	if (css)
-		return css_to_blkcg(css);
-	return css_to_blkcg(task_css(current, io_cgrp_id));
+	return css_to_blkcg(blkcg_css());
+}
+
+/**
+ * bio_blkcg - grab the blkcg associated with a bio
+ * @bio: target bio
+ *
+ * This returns the blkcg associated with a bio, NULL if not associated.
+ * Callers are expected to either handle NULL or know association has been
+ * done prior to calling this.
+ */
+static inline struct blkcg *bio_blkcg(struct bio *bio)
+{
+	if (bio && bio->bi_css)
+		return css_to_blkcg(bio->bi_css);
+	return NULL;
 }
 
 static inline bool blk_cgroup_congested(void)
@@ -534,6 +612,10 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
 	rcu_read_lock();
 
 	blkcg = bio_blkcg(bio);
+	if (blkcg)
+		css_get(&blkcg->css);
+	else
+		blkcg = css_to_blkcg(blkcg_get_css());
 
 	/* bypass blkg lookup and use @q->root_rl directly for root */
 	if (blkcg == &blkcg_root)
@@ -565,6 +647,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
  */
 static inline void blk_put_rl(struct request_list *rl)
 {
+	/* an additional ref is always taken for rl */
+	css_put(&rl->blkg->blkcg->css);
 	if (rl->blkg->blkcg != &blkcg_root)
 		blkg_put(rl->blkg);
 }
@@ -805,10 +889,10 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	bool throtl = false;
 
 	rcu_read_lock();
-	blkcg = bio_blkcg(bio);
 
 	/* associate blkcg if bio hasn't attached one */
-	bio_associate_blkcg(bio, &blkcg->css);
+	bio_associate_blkcg(bio, NULL);
+	blkcg = bio_blkcg(bio);
 
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
@@ -930,6 +1014,7 @@ static inline int blkcg_activate_policy(struct request_queue *q,
 static inline void blkcg_deactivate_policy(struct request_queue *q,
 					   const struct blkcg_policy *pol) { }
 
+static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
 static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
 
 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
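One consequence of the blk_get_rl()/blk_put_rl() change worth spelling out:
the get side now always holds a blkcg css reference, whether it came from the
bio's association or from blkcg_get_css(), and the put side drops it
unconditionally. A hedged sketch of the pairing (illustrative only;
request_list_pairing_example is hypothetical):

	/*
	 * Illustrative pairing -- not from the patch.  blk_get_rl() now
	 * always acquires a blkcg css reference (from the bio's association
	 * if present, otherwise via blkcg_get_css()), and blk_put_rl()
	 * always drops it, so every get/put pair stays balanced regardless
	 * of how the blkcg was found.
	 */
	static void request_list_pairing_example(struct request_queue *q,
						 struct bio *bio)
	{
		struct request_list *rl;

		rl = blk_get_rl(q, bio);	/* takes a blkcg css ref */
		/* ... allocate a request from rl ... */
		blk_put_rl(rl);			/* always puts the ref back */
	}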