/*
 * Accelerated poly_hash implementation with ARMv8 PMULL instructions.
 *
 * Based on ghash-ce-core.S.
 *
 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
 * Copyright (C) 2017 Google, Inc. <ebiggers@google.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	KEY	.req	v0
	KEY2	.req	v1
	T1	.req	v2
	T2	.req	v3
	GSTAR	.req	v4
	XL	.req	v5
	XM	.req	v6
	XH	.req	v7

	.text
	.arch		armv8-a+crypto

	/* 16-byte aligned (2**4 = 16); not required, but might as well */
	.align		4
.Lgstar:
	.quad		0x87, 0x87
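
	/* 0x87 is binary 10000111, i.e. the coefficient vector of
	 * x^7 + x^2 + x + 1.  It is replicated into both 64-bit lanes so the
	 * reduction below can use it with pmull or pmull2 as convenient. */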

/*
 * void pmull_poly_hash_update(le128 *digest, const le128 *key,
 *			       const u8 *src, unsigned int blocks,
 *			       unsigned int partial);
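 *
 * Per 16-byte block, this computes digest = (digest + block) * key, where
 * addition is XOR and multiplication is carried out in GF(2^128) modulo
 * g(x) = x^128 + x^7 + x^2 + x + 1.
 *
 * x0 = digest, x1 = key, x2 = src, w3 = number of full blocks, and
 * w4 ('partial') = nonzero if a pending partial block, assumed to have
 * already been XOR'ed into the digest by the caller, must be multiplied
 * through first.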
 */
ENTRY(pmull_poly_hash_update)

	/* Load digest into XL */
	ld1		{XL.16b}, [x0]

	/* Load key into KEY */
	ld1		{KEY.16b}, [x1]

	/* Load g*(x) = g(x) + x^128 = x^7 + x^2 + x + 1 into both halves of
	 * GSTAR */
	adr		x1, .Lgstar
	ld1		{GSTAR.2d}, [x1]

	/* Set KEY2 to (KEY[1]+KEY[0]):(KEY[1]+KEY[0]).  This is needed for
	 * Karatsuba multiplication. */
	ext		KEY2.16b, KEY.16b, KEY.16b, #8
	eor		KEY2.16b, KEY2.16b, KEY.16b
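
	/* The ext swaps the 64-bit halves of KEY, so the eor leaves
	 * KEY[1]+KEY[0] in both lanes of KEY2; the pmull below can then take
	 * (b1+b0) from lane 0. */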

	/* If 'partial' is nonzero, then we're finishing a pending block and
	 * should go right to the multiplication. */
	cbnz		w4, 1f

0:
	/* Add the next block from 'src' to the digest */
	ld1		{T1.16b}, [x2], #16
	eor		XL.16b, XL.16b, T1.16b
	sub		w3, w3, #1

1:
	/*
	 * Multiply the current 128-bit digest (a1:a0, in XL) by the 128-bit key
	 * (b1:b0, in KEY) using Karatsuba multiplication.
	 */
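
	/* Karatsuba uses three 64x64 => 128-bit carryless multiplications
	 * instead of the schoolbook four, via the GF(2) identity:
	 *
	 *	(a1*x^64 + a0)(b1*x^64 + b0)
	 *		= a1*b1*x^128
	 *		  + ((a1+a0)(b1+b0) + a1*b1 + a0*b0)*x^64
	 *		  + a0*b0
	 */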

	/* T1 = (a1+a0):(a1+a0) */
	ext		T1.16b, XL.16b, XL.16b, #8
	eor		T1.16b, T1.16b, XL.16b

	/* XH = a1 * b1 */
	pmull2		XH.1q, XL.2d, KEY.2d

	/* XL = a0 * b0 */
	pmull		XL.1q, XL.1d, KEY.1d

	/* XM = (a1+a0) * (b1+b0) */
	pmull		XM.1q, T1.1d, KEY2.1d

	/* XM += (XH[0]:XL[1]) + XL + XH */
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b
	eor		XM.16b, XM.16b, T2.16b
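
	/* XM now holds the middle 128 bits of the product: the Karatsuba
	 * middle term (a1+a0)(b1+b0) + a1*b1 + a0*b0 = a1*b0 + a0*b1, with
	 * the neighboring words XL[1] and XH[0] folded in. */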

	/*
	 * Now the 256-bit product is in XH[1]:XM:XL[0].  It represents a
	 * polynomial over GF(2) with degree as large as 255.  We need to
	 * compute its remainder modulo g(x) = x^128+x^7+x^2+x+1.  For this it
	 * is sufficient to compute the remainder of the high half 'c(x)x^128'
	 * and add it to the low half.  To reduce the high half we use the
	 * Barrett
	 * reduction method.  The basic idea is that we can express the
	 * remainder p(x) as g(x)q(x) mod x^128, where q(x) = (c(x)x^128)/g(x).
	 * As detailed in [1], to avoid having to divide by g(x) at runtime the
	 * following equivalent expression can be derived:
	 *
	 *	p(x) = [ g*(x)((c(x)q+(x))/x^128) ] mod x^128
	 *
	 * where g*(x) = x^128+g(x) = x^7+x^2+x+1, and q+(x) = x^256/g(x) = g(x)
	 * in this case.  This is also equivalent to:
	 *
	 *	p(x) = [ g*(x)((c(x)(x^128 + g*(x)))/x^128) ] mod x^128
	 *	     = [ g*(x)(c(x) + (c(x)g*(x))/x^128) ] mod x^128
	 *
	 * Since deg g*(x) < 64:
	 *
	 *	p(x) = [ g*(x)(c(x) + ((c(x)/x^64)g*(x))/x^64) ] mod x^128
	 *	     = [ g*(x)((c(x)/x^64)x^64 + (c(x) mod x^64) +
	 *				((c(x)/x^64)g*(x))/x^64) ] mod x^128
	 *
	 * Letting t(x) = g*(x)(c(x)/x^64):
	 *
	 *	p(x) = [ t(x)x^64 + g*(x)((c(x) mod x^64) + t(x)/x^64) ] mod x^128
	 *
	 * Therefore, to do the reduction we only need to issue two 64-bit =>
	 * 128-bit carryless multiplications: g*(x) times c(x)/x^64, and g*(x)
	 * times ((c(x) mod x^64) + t(x)/x^64).  (Multiplication by x^64 doesn't
	 * count since it is simply a shift or move.)
	 *
	 * An alternate reduction method, also based on Barrett reduction and
	 * described in [1], uses only shifts and XORs --- no multiplications.
	 * However, the method with multiplications requires fewer instructions
	 * and is faster on processors with fast carryless multiplication.
	 *
	 * [1] "Intel Carry-Less Multiplication Instruction and its Usage for
	 * Computing the GCM Mode",
	 * https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
	 */
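
	/*
	 * A rough C sketch of this reduction (illustrative only: u128 and
	 * clmul(), a 64x64 => 128-bit carryless multiply, are hypothetical
	 * stand-ins, not real kernel helpers):
	 *
	 *	u128 t  = clmul(0x87, (u64)(c >> 64));           // t(x)
	 *	u128 t2 = clmul(0x87, (u64)c ^ (u64)(t >> 64));
	 *	u128 p  = (t << 64) ^ t2;    // p(x), implicitly mod x^128
	 *	lo128  ^= p;                 // fold into the low product half
	 */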

	/* 256-bit product is XH[1]:XM:XL[0], so c(x) is XH[1]:XM[1] */

	/* T1 = t(x) = g*(x)(c(x)/x^64) */
	pmull2		T1.1q, GSTAR.2d, XH.2d
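	/* (pmull2 multiplies the high 64-bit lanes: GSTAR.d[1] = 0x87 and
	 * XH.d[1] = c(x)/x^64.) */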

	/* T2 = g*(x)((c(x) mod x^64) + t(x)/x^64) */
	eor		T2.16b, XM.16b, T1.16b
	pmull2		T2.1q, GSTAR.2d, T2.2d

	/* Make XL[0] be the low half of the 128-bit result by adding the low 64
	 * bits of the T2 term to what was already there.  The 't(x)x^64' term
	 * makes no difference, so skip it. */
	eor		XL.16b, XL.16b, T2.16b

	/* Make XL[1] be the high half of the 128-bit result by adding the high
	 * 64 bits of the 't(x)x^64' and T2 terms to what was already in XM[0],
	 * then moving XM[0] to XL[1]. */
	eor		XM.16b, XM.16b, T1.16b
	ext		T2.16b, T2.16b, T2.16b, #8
	eor		XM.16b, XM.16b, T2.16b
	mov		XL.d[1], XM.d[0]

	/* If more blocks remain, then loop back to process the next block;
	 * else, store the digest and return. */
	cbnz		w3, 0b
	st1		{XL.16b}, [x0]
	ret
ENDPROC(pmull_poly_hash_update)