mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	crush: add chooseleaf_vary_r tunable
The current crush_choose_firstn code will re-use the same 'r' value for the recursive call. That means that if we are hitting a collision or rejection for some reason (say, an OSD that is marked out) and need to retry, we will keep making the same (bad) choice in that recursive selection. Introduce a tunable that fixes that behavior by incorporating the parent 'r' value into the recursive starting point, so that a different path will be taken in subsequent placement attempts. Note that this was done from the get-go for the new crush_choose_indep algorithm. This was exposed by a user who was seeing PGs stuck in active+remapped after reweight-by-utilization because the up set mapped to a single OSD. Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a. Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
This commit is contained in:
		
							parent
							
								
									6ed1002f36
								
							
						
					
					
						commit
						e2b149cc4b
					
				
					 2 changed files with 30 additions and 6 deletions
				
			
		| 
						 | 
				
			
			@ -173,6 +173,12 @@ struct crush_map {
 | 
			
		|||
	 * apply to a collision: in that case we will retry as we used
 | 
			
		||||
	 * to. */
 | 
			
		||||
	__u32 chooseleaf_descend_once;
 | 
			
		||||
 | 
			
		||||
	/* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
 | 
			
		||||
	 * bits.  a value of 1 is best for new clusters.  for legacy clusters
 | 
			
		||||
	 * that want to limit reshuffling, a value of 3 or 4 will make the
 | 
			
		||||
	 * mappings line up a bit better with previous mappings. */
 | 
			
		||||
	__u8 chooseleaf_vary_r;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map,
 | 
			
		|||
 * @local_retries: localized retries
 | 
			
		||||
 * @local_fallback_retries: localized fallback retries
 | 
			
		||||
 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
 | 
			
		||||
 * @vary_r: pass r to recursive calls
 | 
			
		||||
 * @out2: second output vector for leaf items (if @recurse_to_leaf)
 | 
			
		||||
 * @parent_r: r value passed from the parent
 | 
			
		||||
 */
 | 
			
		||||
static int crush_choose_firstn(const struct crush_map *map,
 | 
			
		||||
			       struct crush_bucket *bucket,
 | 
			
		||||
| 
						 | 
				
			
			@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map,
 | 
			
		|||
			       unsigned int local_retries,
 | 
			
		||||
			       unsigned int local_fallback_retries,
 | 
			
		||||
			       int recurse_to_leaf,
 | 
			
		||||
			       int *out2)
 | 
			
		||||
			       unsigned int vary_r,
 | 
			
		||||
			       int *out2,
 | 
			
		||||
			       int parent_r)
 | 
			
		||||
{
 | 
			
		||||
	int rep;
 | 
			
		||||
	unsigned int ftotal, flocal;
 | 
			
		||||
| 
						 | 
				
			
			@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
 | 
			
		|||
	int itemtype;
 | 
			
		||||
	int collide, reject;
 | 
			
		||||
 | 
			
		||||
	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
 | 
			
		||||
		bucket->id, x, outpos, numrep);
 | 
			
		||||
	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
 | 
			
		||||
		recurse_to_leaf ? "_LEAF" : "",
 | 
			
		||||
		bucket->id, x, outpos, numrep,
 | 
			
		||||
		tries, recurse_tries, local_retries, local_fallback_retries,
 | 
			
		||||
		parent_r);
 | 
			
		||||
 | 
			
		||||
	for (rep = outpos; rep < numrep; rep++) {
 | 
			
		||||
		/* keep trying until we get a non-out, non-colliding item */
 | 
			
		||||
| 
						 | 
				
			
			@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 | 
			
		|||
			do {
 | 
			
		||||
				collide = 0;
 | 
			
		||||
				retry_bucket = 0;
 | 
			
		||||
				r = rep;
 | 
			
		||||
				r = rep + parent_r;
 | 
			
		||||
				/* r' = r + f_total */
 | 
			
		||||
				r += ftotal;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map,
 | 
			
		|||
				reject = 0;
 | 
			
		||||
				if (!collide && recurse_to_leaf) {
 | 
			
		||||
					if (item < 0) {
 | 
			
		||||
						int sub_r;
 | 
			
		||||
						if (vary_r)
 | 
			
		||||
							sub_r = r >> (vary_r-1);
 | 
			
		||||
						else
 | 
			
		||||
							sub_r = 0;
 | 
			
		||||
						if (crush_choose_firstn(map,
 | 
			
		||||
							 map->buckets[-1-item],
 | 
			
		||||
							 weight, weight_max,
 | 
			
		||||
| 
						 | 
				
			
			@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map,
 | 
			
		|||
							 local_retries,
 | 
			
		||||
							 local_fallback_retries,
 | 
			
		||||
							 0,
 | 
			
		||||
							 NULL) <= outpos)
 | 
			
		||||
							 vary_r,
 | 
			
		||||
							 NULL,
 | 
			
		||||
							 sub_r) <= outpos)
 | 
			
		||||
							/* didn't get leaf */
 | 
			
		||||
							reject = 1;
 | 
			
		||||
					} else {
 | 
			
		||||
| 
						 | 
				
			
			@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map,
 | 
			
		|||
	int choose_local_retries = map->choose_local_tries;
 | 
			
		||||
	int choose_local_fallback_retries = map->choose_local_fallback_tries;
 | 
			
		||||
 | 
			
		||||
	int vary_r = map->chooseleaf_vary_r;
 | 
			
		||||
 | 
			
		||||
	if ((__u32)ruleno >= map->max_rules) {
 | 
			
		||||
		dprintk(" bad ruleno %d\n", ruleno);
 | 
			
		||||
		return 0;
 | 
			
		||||
| 
						 | 
				
			
			@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map,
 | 
			
		|||
						choose_local_retries,
 | 
			
		||||
						choose_local_fallback_retries,
 | 
			
		||||
						recurse_to_leaf,
 | 
			
		||||
						c+osize);
 | 
			
		||||
						vary_r,
 | 
			
		||||
						c+osize,
 | 
			
		||||
						0);
 | 
			
		||||
				} else {
 | 
			
		||||
					crush_choose_indep(
 | 
			
		||||
						map,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue