riscv: __asm_copy_to-from_user: Optimize unaligned memory access and pipeline stall

This patch dramatically reduces CPU usage in kernel space, especially for applications which make system calls with large buffer sizes, such as network applications. The main reason is that every unaligned memory access raises an exception and switches between S-mode and M-mode, causing large overhead. First, copy byte by byte until the destination memory address reaches the first word-aligned boundary. This is the preparation for the bulk aligned word copy. The destination address is now aligned, but oftentimes the source address is not on an aligned boundary. To reduce unaligned memory accesses, the code reads the data from the source at aligned boundaries, which leaves the data at an offset, and then combines it with the data from the next iteration, fixing the offset by shifting before writing to the destination. The majority of the copy speed improvement comes from this shift copy. In the lucky situation that both the source and destination addresses are on an aligned boundary, perform loads and stores of register size to copy the data. The loop is unrolled, because without unrolling a store that immediately uses the register written by the preceding load would stall the pipeline and reduce the speed. Finally, copy the remainder one byte at a time. Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com> Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
2021-06-23 21:40:39 +09:00 · 2021-06-23 21:40:39 +09:00 · ca6eaaa210
commit ca6eaaa210
parent 31da94c25a
1 changed files with 146 additions and 35 deletions
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@ -19,50 +19,161 @@ ENTRY(__asm_copy_from_user)
 	li t6, SR_SUM
 	csrs CSR_STATUS, t6
-	add a3, a1, a2
+	/* Save for return value */
-	/* Use word-oriented copy only if low-order bits match */
+	mv	t5, a2
 	andi t0, a0, SZREG-1
 	andi t1, a1, SZREG-1
 	bne t0, t1, 2f
 	addi t0, a1, SZREG-1
 	andi t1, a3, ~(SZREG-1)
 	andi t0, t0, ~(SZREG-1)
 	/*
-	 * a3: terminal address of source region
+	 * Register allocation for code below:
-	 * t0: lowest XLEN-aligned address in source
+	 * a0 - start of uncopied dst
-	 * t1: highest XLEN-aligned address in source
+	 * a1 - start of uncopied src
 	 * a2 - size
 	 * t0 - end of uncopied dst
 	 */
-	bgeu t0, t1, 2f
+	add	t0, a0, a2
-	bltu a1, t0, 4f
+	bgtu	a0, t0, 5f
 	/*
 	 * Use byte copy only if too small.
 	 */
 	li	a3, 8*SZREG /* size must be larger than size in word_copy */
 	bltu	a2, a3, .Lbyte_copy_tail
 	/*
 	 * Copy first bytes until dst is aligned to a word boundary.
 	 * a0 - start of dst
 	 * t1 - start of aligned dst
 	 */
 	addi	t1, a0, SZREG-1
 	andi	t1, t1, ~(SZREG-1)
 	/* dst is already aligned, skip */
 	beq	a0, t1, .Lskip_first_bytes
 1:
-	fixup REG_L, t2, (a1), 10f
+	/* a5 - one byte for copying data */
-	fixup REG_S, t2, (a0), 10f
+	fixup lb      a5, 0(a1), 10f
-	addi a1, a1, SZREG
+	addi	a1, a1, 1	/* src */
-	addi a0, a0, SZREG
+	fixup sb      a5, 0(a0), 10f
-	bltu a1, t1, 1b
+	addi	a0, a0, 1	/* dst */
 	bltu	a0, t1, 1b	/* t1 - start of aligned dst */
 .Lskip_first_bytes:
 	/*
 	 * Now dst is aligned.
 	 * Use shift-copy if src is misaligned.
 	 * Use word-copy if both src and dst are aligned, because
 	 * shift-copy cannot be used when no shifting is required.
 	 */
 	/* a1 - start of src */
 	andi	a3, a1, SZREG-1
 	bnez	a3, .Lshift_copy
 .Lword_copy:
        /*
 	 * Both src and dst are aligned, unrolled word copy
 	 *
 	 * a0 - start of aligned dst
 	 * a1 - start of aligned src
 	 * a3 - a1 & mask:(SZREG-1)
 	 * t0 - end of aligned dst
 	 */
 	addi	t0, t0, -(8*SZREG-1) /* not to over run */
 2:
-	bltu a1, a3, 5f
+	fixup REG_L   a4,        0(a1), 10f
 	fixup REG_L   a5,    SZREG(a1), 10f
 	fixup REG_L   a6,  2*SZREG(a1), 10f
 	fixup REG_L   a7,  3*SZREG(a1), 10f
 	fixup REG_L   t1,  4*SZREG(a1), 10f
 	fixup REG_L   t2,  5*SZREG(a1), 10f
 	fixup REG_L   t3,  6*SZREG(a1), 10f
 	fixup REG_L   t4,  7*SZREG(a1), 10f
 	fixup REG_S   a4,        0(a0), 10f
 	fixup REG_S   a5,    SZREG(a0), 10f
 	fixup REG_S   a6,  2*SZREG(a0), 10f
 	fixup REG_S   a7,  3*SZREG(a0), 10f
 	fixup REG_S   t1,  4*SZREG(a0), 10f
 	fixup REG_S   t2,  5*SZREG(a0), 10f
 	fixup REG_S   t3,  6*SZREG(a0), 10f
 	fixup REG_S   t4,  7*SZREG(a0), 10f
 	addi	a0, a0, 8*SZREG
 	addi	a1, a1, 8*SZREG
 	bltu	a0, t0, 2b
 	addi	t0, t0, 8*SZREG-1 /* revert to original value */
 	j	.Lbyte_copy_tail
 .Lshift_copy:
 	/*
 	 * Word copy with shifting.
 	 * For misaligned copy we still perform aligned word copy, but
 	 * we need to use the value fetched from the previous iteration and
 	 * do some shifts.
 	 * This is safe because reading less than a word size.
 	 *
 	 * a0 - start of aligned dst
 	 * a1 - start of src
 	 * a3 - a1 & mask:(SZREG-1)
 	 * t0 - end of uncopied dst
 	 * t1 - end of aligned dst
 	 */
 	/* calculating aligned word boundary for dst */
 	andi	t1, t0, ~(SZREG-1)
 	/* Converting unaligned src to aligned src */
 	andi	a1, a1, ~(SZREG-1)
 	/*
 	 * Calculate shifts
 	 * t3 - prev shift
 	 * t4 - current shift
 	 */
 	slli	t3, a3, LGREG
 	li	a5, SZREG*8
 	sub	t4, a5, t3
 	/* Load the first word to combine with second word */
 	fixup REG_L   a5, 0(a1), 10f
 3:
 	/* Main shifting copy
 	 *
 	 * a0 - start of aligned dst
 	 * a1 - start of aligned src
 	 * t1 - end of aligned dst
 	 */
 	/* At least one iteration will be executed */
 	srl	a4, a5, t3
 	fixup REG_L   a5, SZREG(a1), 10f
 	addi	a1, a1, SZREG
 	sll	a2, a5, t4
 	or	a2, a2, a4
 	fixup REG_S   a2, 0(a0), 10f
 	addi	a0, a0, SZREG
 	bltu	a0, t1, 3b
 	/* Revert src to original unaligned value  */
 	add	a1, a1, a3
 .Lbyte_copy_tail:
 	/*
 	 * Byte copy anything left.
 	 *
 	 * a0 - start of remaining dst
 	 * a1 - start of remaining src
 	 * t0 - end of remaining dst
 	 */
 	bgeu	a0, t0, 5f
 4:
 	fixup lb      a5, 0(a1), 10f
 	addi	a1, a1, 1	/* src */
 	fixup sb      a5, 0(a0), 10f
 	addi	a0, a0, 1	/* dst */
 	bltu	a0, t0, 4b	/* t0 - end of dst */
 5:
 	/* Disable access to user memory */
 	csrc CSR_STATUS, t6
 	li	a0, 0
 	ret
 4: /* Edge case: unalignment */
 	fixup lbu, t2, (a1), 10f
 	fixup sb, t2, (a0), 10f
 	addi a1, a1, 1
 	addi a0, a0, 1
 	bltu a1, t0, 4b
 	j 1b
 5: /* Edge case: remainder */
 	fixup lbu, t2, (a1), 10f
 	fixup sb, t2, (a0), 10f
 	addi a1, a1, 1
 	addi a0, a0, 1
 	bltu a1, a3, 5b
 	j 3b
 ENDPROC(__asm_copy_to_user)
 ENDPROC(__asm_copy_from_user)
 EXPORT_SYMBOL(__asm_copy_to_user)
@ -117,7 +228,7 @@ EXPORT_SYMBOL(__clear_user)
 10:
 	/* Disable access to user memory */
 	csrs CSR_STATUS, t6
-	mv a0, a2
+	mv a0, t5
 	ret
 11:
 	csrs CSR_STATUS, t6