	x86/mm, tracing: Fix CR2 corruption
Despite the current efforts to read CR2 before tracing happens, there still
exist a number of possible holes:
  idtentry page_fault             do_page_fault           has_error_code=1
    call error_entry
      TRACE_IRQS_OFF
        call trace_hardirqs_off*
          #PF // modifies CR2
      CALL_enter_from_user_mode
        __context_tracking_exit()
          trace_user_exit(0)
            #PF // modifies CR2
    call do_page_fault
      address = read_cr2(); /* whoopsie */
Similar holes exist in the i386 entry path.
Fix it by pulling the CR2 read into the entry code, before any of that
stuff gets a chance to run and ruin things.
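
The shape of the fix, as a self-contained userspace analogy (hypothetical
illustration only, not kernel code: the global `cr2` stands in for the CR2
register, `tracing_hook()` for the TRACE_IRQS_OFF / context-tracking callbacks
that may themselves fault and overwrite it):

	#include <stdio.h>

	static unsigned long cr2;		/* stand-in for the CR2 register */

	static void tracing_hook(void)
	{
		cr2 = 0xdeadbeef;		/* a nested "fault" clobbers CR2 */
	}

	/* Old shape: the handler reads CR2 only after the hooks have run. */
	static void handle_fault_old(unsigned long fault_addr)
	{
		cr2 = fault_addr;		/* hardware latches the address */
		tracing_hook();			/* may fault and rewrite CR2 */
		printf("old: handling address %#lx\n", cr2);	/* whoopsie */
	}

	static void do_fault(unsigned long address)
	{
		printf("new: handling address %#lx\n", address);
	}

	/* New shape: the "entry stub" latches CR2 first and passes it along. */
	static void handle_fault_new(unsigned long fault_addr)
	{
		unsigned long address;

		cr2 = fault_addr;		/* hardware latches the address */
		address = cr2;			/* entry stub: GET_CR2_INTO */
		tracing_hook();			/* can no longer ruin things */
		do_fault(address);
	}

	int main(void)
	{
		handle_fault_old(0x1000);
		handle_fault_new(0x1000);
		return 0;
	}

In the actual patch below, that latching is done in assembly (GET_CR2_INTO)
before TRACE_IRQS_OFF and CALL_enter_from_user_mode run, and the faulting
address is handed to do_page_fault() as a third argument.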
Reported-by: He Zhe <zhe.he@windriver.com>
Reported-by: Eiichi Tsukata <devel@etsukata.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Cc: bp@alien8.de
Cc: rostedt@goodmis.org
Cc: torvalds@linux-foundation.org
Cc: hpa@zytor.com
Cc: dave.hansen@linux.intel.com
Cc: jgross@suse.com
Cc: joel@joelfernandes.org
Link: https://lkml.kernel.org/r/20190711114336.116812491@infradead.org
Debugged-by: Steven Rostedt <rostedt@goodmis.org>
			
			
parent 4234653e88
commit a0d14b8909

7 changed files with 59 additions and 51 deletions
@@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
 
 ENTRY(page_fault)
 	ASM_CLAC
-	pushl	$do_page_fault
-	ALIGN
-	jmp common_exception
+	pushl	$0; /* %gs's slot on the stack */
+
+	SAVE_ALL switch_stacks=1 skip_gs=1
+
+	ENCODE_FRAME_POINTER
+	UNWIND_ESPFIX_STACK
+
+	/* fixup %gs */
+	GS_TO_REG %ecx
+	REG_TO_PTGS %ecx
+	SET_KERNEL_GS %ecx
+
+	GET_CR2_INTO(%ecx)			# might clobber %eax
+
+	/* fixup orig %eax */
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
+
+	TRACE_IRQS_OFF
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_page_fault
+	jmp	ret_from_exception
 END(page_fault)
 
 common_exception:
@@ -864,7 +864,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
-.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0
+.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
 
 	.if \paranoid
 	call	paranoid_entry
@@ -874,12 +874,21 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 	.endif
 	UNWIND_HINT_REGS
 
-	.if \paranoid
+	.if \read_cr2
+	GET_CR2_INTO(%rdx);			/* can clobber %rax */
+	.endif
+
 	.if \shift_ist != -1
 	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
 	.else
 	TRACE_IRQS_OFF
 	.endif
-	.endif
+
+	.if \paranoid == 0
+	testb	$3, CS(%rsp)
+	jz	.Lfrom_kernel_no_context_tracking_\@
+	CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
+	.endif
 
 	movq	%rsp, %rdi			/* pt_regs pointer */
@@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
  *			fresh stack.  (This is for #DB, which has a nasty habit
  *			of recursing.)
  * @create_gap:		create a 6-word stack gap when coming from kernel mode.
+ * @read_cr2:		load CR2 into the 3rd argument; done before calling any C code
  *
  * idtentry generates an IDT stub that sets up a usable kernel context,
  * creates struct pt_regs, and calls @do_sym.  The stub has the following
@@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -985,7 +995,7 @@ ENTRY(\sym)
 .Lfrom_usermode_no_gap_\@:
 	.endif
 
-	idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset
+	idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
 
 	.if \paranoid == 1
 	/*
@@ -994,7 +1004,7 @@ ENTRY(\sym)
 	 * run in real process context if user_mode(regs).
 	 */
 .Lfrom_usermode_switch_stack_\@:
-	idtentry_part \do_sym, \has_error_code, paranoid=0
+	idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0
 	.endif
 
 _ASM_NOKPROBE(\sym)
@@ -1006,7 +1016,7 @@ idtentry overflow			do_overflow			has_error_code=0
 idtentry bounds				do_bounds			has_error_code=0
 idtentry invalid_op			do_invalid_op			has_error_code=0
 idtentry device_not_available		do_device_not_available		has_error_code=0
-idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
+idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2 read_cr2=1
 idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
 idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
 idtentry segment_not_present		do_segment_not_present		has_error_code=1
@@ -1179,10 +1189,10 @@ idtentry xenint3		do_int3			has_error_code=0
 #endif
 
 idtentry general_protection	do_general_protection	has_error_code=1
-idtentry page_fault		do_page_fault		has_error_code=1
+idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
 
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault	do_async_page_fault	has_error_code=1
+idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
 #endif
 
 #ifdef CONFIG_X86_MCE
@@ -1281,18 +1291,9 @@ ENTRY(error_entry)
 	movq	%rax, %rsp			/* switch stack */
 	ENCODE_FRAME_POINTER
 	pushq	%r12
-
-	/*
-	 * We need to tell lockdep that IRQs are off.  We can't do this until
-	 * we fix gsbase, and we should do it before enter_from_user_mode
-	 * (which can take locks).
-	 */
-	TRACE_IRQS_OFF
-	CALL_enter_from_user_mode
 	ret
 
 .Lerror_entry_done:
-	TRACE_IRQS_OFF
 	ret
 
 	/*
@@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code);
+void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
@@ -74,14 +74,14 @@ dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
 dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
 dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code);
 #ifdef CONFIG_X86_64
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code);
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long address);
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
 asmlinkage __visible notrace
 struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
 void __init trap_init(void);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code);
+dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
 dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
 dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
 	enum ctx_state prev_state;
 
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
-		do_page_fault(regs, error_code);
+		do_page_fault(regs, error_code, address);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
 		prev_state = exception_enter();
-		kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
 		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
 		rcu_irq_enter();
-		kvm_async_pf_task_wake((u32)read_cr2());
+		kvm_async_pf_task_wake((u32)address);
 		rcu_irq_exit();
 		break;
 	}
@@ -313,13 +313,10 @@ __visible void __noreturn handle_stack_overflow(const char *message,
 
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
 {
 	static const char str[] = "double fault";
 	struct task_struct *tsk = current;
-#ifdef CONFIG_VMAP_STACK
-	unsigned long cr2;
-#endif
 
 #ifdef CONFIG_X86_ESPFIX64
 	extern unsigned char native_irq_return_iret[];
@@ -415,7 +412,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	 * stack even if the actual trigger for the double fault was
 	 * something else.
 	 */
-	cr2 = read_cr2();
 	if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
 		handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
 #endif
@@ -1507,9 +1507,8 @@ void do_user_addr_fault(struct pt_regs *regs,
 NOKPROBE_SYMBOL(do_user_addr_fault);
 
 /*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * Explicitly marked noinline such that the function tracer sees this as the
+ * page_fault entry point.
  */
 static noinline void
 __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
@@ -1528,33 +1527,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 }
 NOKPROBE_SYMBOL(__do_page_fault);
 
-static nokprobe_inline void
-trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
-			 unsigned long error_code)
+static __always_inline void
+trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
+			 unsigned long address)
 {
+	if (!trace_pagefault_enabled())
+		return;
+
 	if (user_mode(regs))
 		trace_page_fault_user(address, regs, error_code);
 	else
 		trace_page_fault_kernel(address, regs, error_code);
 }
 
-/*
- * We must have this function blacklisted from kprobes, tagged with notrace
- * and call read_cr2() before calling anything else. To avoid calling any
- * kind of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contains all sorts of tracepoints.
- */
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void
+do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-	unsigned long address = read_cr2(); /* Get the faulting address */
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
-	if (trace_pagefault_enabled())
-		trace_page_fault_entries(address, regs, error_code);
-
+	trace_page_fault_entries(regs, error_code, address);
 	__do_page_fault(regs, error_code, address);
 	exception_exit(prev_state);
 }