mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	eventpoll: Set epoll timeout if it's in the future
Avoid an edge case where epoll_wait arms a timer and calls schedule()
even if the timer will expire immediately.
For example: if the user has specified an epoll busy poll usecs value that
is equal to or larger than the epoll_wait/epoll_pwait2 timeout, it is
unnecessary to call schedule_hrtimeout_range; the busy poll usecs have
consumed the entire timeout duration so it is unnecessary to induce
scheduling latency by calling schedule() (via schedule_hrtimeout_range).
This can be measured using a simple bpftrace script:
tracepoint:sched:sched_switch
/ args->prev_pid == $1 /
{
  print(kstack());
  print(ustack());
}
Before this patch is applied:
  Testing an epoll_wait app with busy poll usecs set to 1000, and
  epoll_wait timeout set to 1ms using the script above shows:
     __traceiter_sched_switch+69
     __schedule+1495
     schedule+32
     schedule_hrtimeout_range+159
     do_epoll_wait+1424
     __x64_sys_epoll_wait+97
     do_syscall_64+95
     entry_SYSCALL_64_after_hwframe+118
     epoll_wait+82
  This is unexpected: the busy poll usecs should have consumed the
  entire timeout, so there should be no reason to arm a timer.
After this patch is applied: the same test scenario does not generate a
call to schedule() in the above edge case. If the busy poll usecs are
reduced (for example usecs: 100, epoll_wait timeout 1ms) the timer is
armed as expected.
Fixes: bf3b9f6372 ("epoll: Add busy poll support to epoll with socket fds.")
Signed-off-by: Joe Damato <jdamato@fastly.com>
Link: https://lore.kernel.org/20250416185826.26375-1-jdamato@fastly.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
			
			
This commit is contained in:
		
							parent
							
								
									a681b7c17d
								
							
						
					
					
						commit
						0a65bc27bd
					
				
					 1 changed file with 9 additions and 1 deletion
				
			
		| 
						 | 
					@ -1996,6 +1996,14 @@ static int ep_try_send_events(struct eventpoll *ep,
 | 
				
			||||||
	return res;
 | 
						return res;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int ep_schedule_timeout(ktime_t *to)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						if (to)
 | 
				
			||||||
 | 
							return ktime_after(*to, ktime_get());
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
 | 
							return 1;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 | 
					 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 | 
				
			||||||
 *           event buffer.
 | 
					 *           event buffer.
 | 
				
			||||||
| 
						 | 
					@ -2103,7 +2111,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		write_unlock_irq(&ep->lock);
 | 
							write_unlock_irq(&ep->lock);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if (!eavail)
 | 
							if (!eavail && ep_schedule_timeout(to))
 | 
				
			||||||
			timed_out = !schedule_hrtimeout_range(to, slack,
 | 
								timed_out = !schedule_hrtimeout_range(to, slack,
 | 
				
			||||||
							      HRTIMER_MODE_ABS);
 | 
												      HRTIMER_MODE_ABS);
 | 
				
			||||||
		__set_current_state(TASK_RUNNING);
 | 
							__set_current_state(TASK_RUNNING);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue