	x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue
AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET with
SS == 0 results in an invalid usermode state in which SS is apparently
equal to __USER_DS but causes #SS if used.
Work around the issue by setting SS to __KERNEL_DS in __switch_to, thus
ensuring that SYSRET never happens with SS set to NULL.
This was exposed by a recent vDSO cleanup.
Fixes: e7d6eefaaa ("x86/vdso32/syscall.S: Do not load __USER32_DS to %ss")
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
			
			
This commit is contained in:
    parent 1190944f4b
    commit 61f01dd941

5 changed files with 48 additions and 0 deletions
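For orientation before the per-file hunks, here is a minimal C sketch of the core of the workaround as it appears in the __switch_to hunk below: read SS first (reads are cheap), and rewrite it with __KERNEL_DS only when it holds something else. The function name fix_ss_for_sysret() is hypothetical, and 0x18 is assumed as the usual x86_64 value of __KERNEL_DS purely to keep the sketch self-contained; the real change uses the kernel's savesegment()/loadsegment() helpers and the __KERNEL_DS constant.

/*
 * Hypothetical sketch (kernel context only): keep SS loaded with a real
 * selector so that a later SYSRET never runs with SS == NULL on affected
 * AMD CPUs.  Mirrors the __switch_to hunk in process_64.c below.
 */
static inline void fix_ss_for_sysret(void)
{
	unsigned short ss_sel;
	const unsigned short kernel_ds = 0x18;	/* assumed __KERNEL_DS value */

	/* Reading SS is much faster than writing it, so check before loading. */
	asm volatile("mov %%ss, %0" : "=r" (ss_sel));
	if (ss_sel != kernel_ds)
		asm volatile("mov %0, %%ss" : : "r" (kernel_ds) : "memory");
}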
				
			
arch/x86/ia32/ia32entry.S
@@ -427,6 +427,13 @@ sysretl_from_sys_call:
 	 * cs and ss are loaded from MSRs.
 	 * (Note: 32bit->32bit SYSRET is different: since r11
 	 * does not exist, it merely sets eflags.IF=1).
+	 *
+	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+	 * descriptor is not reinitialized.  This means that we must
+	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
+	 * exit the kernel, and re-enter using an interrupt vector.  (All
+	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
+	 * from happening by reloading SS in __switch_to.
 	 */
 	USERGS_SYSRET32
 
arch/x86/include/asm/cpufeature.h
@@ -265,6 +265,7 @@
 #define X86_BUG_11AP		X86_BUG(5) /* Bad local APIC aka 11AP */
 #define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS	X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
arch/x86/kernel/cpu/amd.c
@@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
 		if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
 			set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+	/* AMD CPUs don't reset SS attributes on SYSRET */
+	set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
 }
 
 #ifdef CONFIG_X86_32
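The bug bit set above is later tested with the standard cpufeature helpers (the fast-path check appears in the process_64.c hunk below as static_cpu_has_bug()). A small, hedged usage sketch; the helper name and the printed message are illustrative only, not part of this commit:

#include <linux/printk.h>
#include <asm/cpufeature.h>

/* Illustrative helper, not part of this commit. */
static void report_sysret_ss_quirk(void)
{
	/* boot_cpu_has_bug() tests the bug bit on the boot CPU's capability map. */
	if (boot_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS))
		pr_info("SYSRET leaves SS attributes stale; SS is reloaded in __switch_to\n");
}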
arch/x86/kernel/entry_64.S
@@ -295,6 +295,15 @@ system_call_fastpath:
 	 * rflags from r11 (but RF and VM bits are forced to 0),
 	 * cs and ss are loaded from MSRs.
 	 * Restoration of rflags re-enables interrupts.
+	 *
+	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+	 * descriptor is not reinitialized.  This means that we should
+	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
+	 * exit the kernel, and re-enter using an interrupt vector.  (All
+	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
+	 * from happening by reloading SS in __switch_to.  (Actually
+	 * detecting the failure in 64-bit userspace is tricky but can be
+	 * done.)
 	 */
 	USERGS_SYSRET64
 
arch/x86/kernel/process_64.c
@@ -419,6 +419,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 		__switch_to_xtra(prev_p, next_p, tss);
 
+	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+		/*
+		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
+		 * does not update the cached descriptor.  As a result, if we
+		 * do SYSRET while SS is NULL, we'll end up in user mode with
+		 * SS apparently equal to __USER_DS but actually unusable.
+		 *
+		 * The straightforward workaround would be to fix it up just
+		 * before SYSRET, but that would slow down the system call
+		 * fast paths.  Instead, we ensure that SS is never NULL in
+		 * system call context.  We do this by replacing NULL SS
+		 * selectors at every context switch.  SYSCALL sets up a valid
+		 * SS, so the only way to get NULL is to re-enter the kernel
+		 * from CPL 3 through an interrupt.  Since that can't happen
+		 * in the same task as a running syscall, we are guaranteed to
+		 * context switch between every interrupt vector entry and a
+		 * subsequent SYSRET.
+		 *
+		 * We read SS first because SS reads are much faster than
+		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
+		 * it previously had a different non-NULL value.
+		 */
+		unsigned short ss_sel;
+		savesegment(ss, ss_sel);
+		if (ss_sel != __KERNEL_DS)
+			loadsegment(ss, __KERNEL_DS);
+	}
+
 	return prev_p;
 }
 
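The hunk above leans on the kernel's savesegment() and loadsegment() helpers. As a rough, SS-only sketch of what they boil down to (the real macros are generic over the segment register, and loadsegment() additionally carries exception-table fixup for selector loads that fault, omitted here):

/*
 * Simplified, SS-only sketch of the helpers used above.  Not the kernel's
 * actual definitions; the real loadsegment() also handles faulting loads.
 */
#define sketch_savesegment_ss(sel) \
	asm volatile("mov %%ss, %0" : "=r" (sel))

#define sketch_loadsegment_ss(sel) \
	asm volatile("mov %0, %%ss" : : "r" (sel) : "memory")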