[thirdparty/kernel/stable.git] / crypto / gf128mul.c

/* gf128mul.c - GF(2^128) multiplication functions
 *
 * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
 * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
 *
 * Based on Dr Brian Gladman's (GPL'd) work published at
 * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
 * See the original copyright notice below.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 This file provides fast multiplication in GF(2^128) as required by several
 cryptographic authentication modes
*/

#include <crypto/gf128mul.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

#define gf128mul_dat(q) { \
	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
}

/*
 * Given a value i in 0..255 as the byte overflow when a field element
 * in GF(2^128) is multiplied by x^8, the following macro returns the
 * 16-bit value that must be XOR-ed into the low-degree end of the
 * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
 *
 * There are two versions of the macro, and hence two tables: one for
 * the "be" convention where the highest-order bit is the coefficient of
 * the highest-degree polynomial term, and one for the "le" convention
 * where the highest-order bit is the coefficient of the lowest-degree
 * polynomial term.  In both cases the values are stored in CPU byte
 * endianness such that the coefficients are ordered consistently across
 * bytes, i.e. in the "be" table bits 15..0 of the stored value
 * correspond to the coefficients of x^15..x^0, and in the "le" table
 * bits 15..0 correspond to the coefficients of x^0..x^15.
 *
 * Therefore, provided that the appropriate byte endianness conversions
 * are done by the multiplication functions (and these must be in place
 * anyway to support both little endian and big endian CPUs), the "be"
 * table can be used for multiplications of both "bbe" and "ble"
 * elements, and the "le" table can be used for multiplications of both
 * "lle" and "lbe" elements.
 */

#define xda_be(i) ( \
	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
)

#define xda_le(i) ( \
	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
)

static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);

/*
 * The following functions multiply a field element by x^8 in
 * the polynomial field representation.  They use 64-bit word operations
 * to gain speed but compensate for machine endianness and hence work
 * correctly on both styles of machine.
 */

static void gf128mul_x8_lle(be128 *x)
{
	u64 a = be64_to_cpu(x->a);
	u64 b = be64_to_cpu(x->b);
	u64 _tt = gf128mul_table_le[b & 0xff];

	x->b = cpu_to_be64((b >> 8) | (a << 56));
	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
}

static void gf128mul_x8_bbe(be128 *x)
{
	u64 a = be64_to_cpu(x->a);
	u64 b = be64_to_cpu(x->b);
	u64 _tt = gf128mul_table_be[a >> 56];

	x->a = cpu_to_be64((a << 8) | (b >> 56));
	x->b = cpu_to_be64((b << 8) ^ _tt);
}

void gf128mul_x8_ble(le128 *r, const le128 *x)
{
	u64 a = le64_to_cpu(x->a);
	u64 b = le64_to_cpu(x->b);
	u64 _tt = gf128mul_table_be[a >> 56];

	r->a = cpu_to_le64((a << 8) | (b >> 56));
	r->b = cpu_to_le64((b << 8) ^ _tt);
}
EXPORT_SYMBOL(gf128mul_x8_ble);

void gf128mul_lle(be128 *r, const be128 *b)
{
	be128 p[8];
	int i;

	p[0] = *r;
	for (i = 0; i < 7; ++i)
		gf128mul_x_lle(&p[i + 1], &p[i]);

	memset(r, 0, sizeof(*r));
	for (i = 0;;) {
		u8 ch = ((u8 *)b)[15 - i];

		if (ch & 0x80)
			be128_xor(r, r, &p[0]);
		if (ch & 0x40)
			be128_xor(r, r, &p[1]);
		if (ch & 0x20)
			be128_xor(r, r, &p[2]);
		if (ch & 0x10)
			be128_xor(r, r, &p[3]);
		if (ch & 0x08)
			be128_xor(r, r, &p[4]);
		if (ch & 0x04)
			be128_xor(r, r, &p[5]);
		if (ch & 0x02)
			be128_xor(r, r, &p[6]);
		if (ch & 0x01)
			be128_xor(r, r, &p[7]);

		if (++i >= 16)
			break;

		gf128mul_x8_lle(r);
	}
}
EXPORT_SYMBOL(gf128mul_lle);

void gf128mul_bbe(be128 *r, const be128 *b)
{
	be128 p[8];
	int i;

	p[0] = *r;
	for (i = 0; i < 7; ++i)
		gf128mul_x_bbe(&p[i + 1], &p[i]);

	memset(r, 0, sizeof(*r));
	for (i = 0;;) {
		u8 ch = ((u8 *)b)[i];

		if (ch & 0x80)
			be128_xor(r, r, &p[7]);
		if (ch & 0x40)
			be128_xor(r, r, &p[6]);
		if (ch & 0x20)
			be128_xor(r, r, &p[5]);
		if (ch & 0x10)
			be128_xor(r, r, &p[4]);
		if (ch & 0x08)
			be128_xor(r, r, &p[3]);
		if (ch & 0x04)
			be128_xor(r, r, &p[2]);
		if (ch & 0x02)
			be128_xor(r, r, &p[1]);
		if (ch & 0x01)
			be128_xor(r, r, &p[0]);

		if (++i >= 16)
			break;

		gf128mul_x8_bbe(r);
	}
}
EXPORT_SYMBOL(gf128mul_bbe);

/*      This version uses 64k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in
    the buffer's lowest byte, we can construct a table of
    the 256 16 byte values that result from the 256 values
    of this byte.  This requires 4096 bytes. But we also
    need tables for each of the 16 higher bytes in the
    buffer as well, which makes 64 kbytes in total.
*/
/* additional explanation
 * t[0][BYTE] contains g*BYTE
 * t[1][BYTE] contains g*x^8*BYTE
 *  ..
 * t[15][BYTE] contains g*x^120*BYTE */
struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
{
	struct gf128mul_64k *t;
	int i, j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	for (i = 0; i < 16; i++) {
		t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
		if (!t->t[i]) {
			gf128mul_free_64k(t);
			t = NULL;
			goto out;
		}
	}

	t->t[0]->t[1] = *g;
	for (j = 1; j <= 64; j <<= 1)
		gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);

	for (i = 0;;) {
		for (j = 2; j < 256; j += j)
			for (k = 1; k < j; ++k)
				be128_xor(&t->t[i]->t[j + k],
					  &t->t[i]->t[j], &t->t[i]->t[k]);

		if (++i >= 16)
			break;

		for (j = 128; j > 0; j >>= 1) {
			t->t[i]->t[j] = t->t[i - 1]->t[j];
			gf128mul_x8_bbe(&t->t[i]->t[j]);
		}
	}

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_64k_bbe);

void gf128mul_free_64k(struct gf128mul_64k *t)
{
	int i;

	for (i = 0; i < 16; i++)
		kzfree(t->t[i]);
	kzfree(t);
}
EXPORT_SYMBOL(gf128mul_free_64k);

void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t)
{
	u8 *ap = (u8 *)a;
	be128 r[1];
	int i;

	*r = t->t[0]->t[ap[15]];
	for (i = 1; i < 16; ++i)
		be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
	*a = *r;
}
EXPORT_SYMBOL(gf128mul_64k_bbe);

/*      This version uses 4k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in a
    single byte, we can construct a table of the 256 16 byte
    values that result from the 256 values of this byte.
    This requires 4096 bytes. If we take the highest byte in
    the buffer and use this table to get the result, we then
    have to multiply by x^120 to get the final value. For the
    next highest byte the result has to be multiplied by x^112
    and so on. But we can do this by accumulating the result
    in an accumulator starting with the result for the top
    byte.  We repeatedly multiply the accumulator value by
    x^8 and then add in (i.e. xor) the 16 bytes of the next
    lower byte in the buffer, stopping when we reach the
    lowest byte. This requires a 4096 byte table.
*/
struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[128] = *g;
	for (j = 64; j > 0; j >>= 1)
		gf128mul_x_lle(&t->t[j], &t->t[j+j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_lle);

struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[1] = *g;
	for (j = 1; j <= 64; j <<= 1)
		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_bbe);

void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
{
	u8 *ap = (u8 *)a;
	be128 r[1];
	int i = 15;

	*r = t->t[ap[15]];
	while (i--) {
		gf128mul_x8_lle(r);
		be128_xor(r, r, &t->t[ap[i]]);
	}
	*a = *r;
}
EXPORT_SYMBOL(gf128mul_4k_lle);

void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t)
{
	u8 *ap = (u8 *)a;
	be128 r[1];
	int i = 0;

	*r = t->t[ap[0]];
	while (++i < 16) {
		gf128mul_x8_bbe(r);
		be128_xor(r, r, &t->t[ap[i]]);
	}
	*a = *r;
}
EXPORT_SYMBOL(gf128mul_4k_bbe);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
Commit	Line	Data
c494e070 RS	1	/* gf128mul.c - GF(2^128) multiplication functions
	2	*
	3	* Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
	4	* Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
	5	*
	6	* Based on Dr Brian Gladman's (GPL'd) work published at
8c882f64	7	* http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
c494e070 RS	8	* See the original copyright notice below.
	9	*
	10	* This program is free software; you can redistribute it and/or modify it
	11	* under the terms of the GNU General Public License as published by the Free
	12	* Software Foundation; either version 2 of the License, or (at your option)
	13	* any later version.
	14	*/
	15
	16	/*
	17	---------------------------------------------------------------------------
	18	Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
	19
	20	LICENSE TERMS
	21
	22	The free distribution and use of this software in both source and binary
	23	form is allowed (with or without changes) provided that:
	24
	25	1. distributions of this source code include the above copyright
	26	notice, this list of conditions and the following disclaimer;
	27
	28	2. distributions in binary form include the above copyright
	29	notice, this list of conditions and the following disclaimer
	30	in the documentation and/or other associated materials;
	31
	32	3. the copyright holder's name is not used to endorse products
	33	built using this software without specific written permission.
	34
	35	ALTERNATIVELY, provided that this notice is retained in full, this product
	36	may be distributed under the terms of the GNU General Public License (GPL),
	37	in which case the provisions of the GPL apply INSTEAD OF those given above.
	38
	39	DISCLAIMER
	40
	41	This software is provided 'as is' with no explicit or implied warranties
	42	in respect of its properties, including, but not limited to, correctness
	43	and/or fitness for purpose.
	44	---------------------------------------------------------------------------
	45	Issue 31/01/2006
	46
63be5b53	47	This file provides fast multiplication in GF(2^128) as required by several
c494e070 RS	48	cryptographic authentication modes
	49	*/
	50
	51	#include <crypto/gf128mul.h>
	52	#include <linux/kernel.h>
	53	#include <linux/module.h>
	54	#include <linux/slab.h>
	55
	56	#define gf128mul_dat(q) { \
	57	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	58	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	59	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	60	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	61	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	62	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	63	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	64	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	65	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	66	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	67	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	68	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	69	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	70	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	71	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	72	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	73	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	74	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	75	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	76	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	77	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	78	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	79	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	80	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	81	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	82	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	83	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	84	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	85	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	86	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	87	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	88	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
	89	}
	90
f33fd647 EB	91	/*
	92	* Given a value i in 0..255 as the byte overflow when a field element
	93	* in GF(2^128) is multiplied by x^8, the following macro returns the
	94	* 16-bit value that must be XOR-ed into the low-degree end of the
	95	* product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
	96	*
	97	* There are two versions of the macro, and hence two tables: one for
	98	* the "be" convention where the highest-order bit is the coefficient of
	99	* the highest-degree polynomial term, and one for the "le" convention
	100	* where the highest-order bit is the coefficient of the lowest-degree
	101	* polynomial term. In both cases the values are stored in CPU byte
	102	* endianness such that the coefficients are ordered consistently across
	103	* bytes, i.e. in the "be" table bits 15..0 of the stored value
	104	* correspond to the coefficients of x^15..x^0, and in the "le" table
	105	* bits 15..0 correspond to the coefficients of x^0..x^15.
	106	*
	107	* Therefore, provided that the appropriate byte endianness conversions
	108	* are done by the multiplication functions (and these must be in place
	109	* anyway to support both little endian and big endian CPUs), the "be"
	110	* table can be used for multiplications of both "bbe" and "ble"
	111	* elements, and the "le" table can be used for multiplications of both
	112	* "lle" and "lbe" elements.
	113	*/
c494e070	114
f33fd647	115	#define xda_be(i) ( \
2416e4fa EB	116	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
	117	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
	118	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
	119	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
c494e070 RS	120	)
c494e070 RS	121
f33fd647	122	#define xda_le(i) ( \
2416e4fa EB	123	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
	124	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
	125	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
	126	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
c494e070 RS	127	)
c494e070 RS	128
f33fd647 EB	129	static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
f33fd647 EB	130	static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
c494e070	131
63be5b53	132	/*
acb9b159	133	* The following functions multiply a field element by x^8 in
63be5b53 EB	134	* the polynomial field representation. They use 64-bit word operations
63be5b53 EB	135	* to gain speed but compensate for machine endianness and hence work
c494e070 RS	136	* correctly on both styles of machine.
	137	*/
	138
c494e070 RS	139	static void gf128mul_x8_lle(be128 *x)
	140	{
	141	u64 a = be64_to_cpu(x->a);
	142	u64 b = be64_to_cpu(x->b);
f33fd647	143	u64 _tt = gf128mul_table_le[b & 0xff];
c494e070 RS	144
	145	x->b = cpu_to_be64((b >> 8) \| (a << 56));
	146	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
	147	}
	148
	149	static void gf128mul_x8_bbe(be128 *x)
	150	{
	151	u64 a = be64_to_cpu(x->a);
	152	u64 b = be64_to_cpu(x->b);
f33fd647	153	u64 _tt = gf128mul_table_be[a >> 56];
c494e070 RS	154
	155	x->a = cpu_to_be64((a << 8) \| (b >> 56));
	156	x->b = cpu_to_be64((b << 8) ^ _tt);
	157	}
	158
acfc5878 HJ	159	void gf128mul_x8_ble(le128 r, const le128 x)
	160	{
	161	u64 a = le64_to_cpu(x->a);
	162	u64 b = le64_to_cpu(x->b);
acfc5878 HJ	163	u64 _tt = gf128mul_table_be[a >> 56];
	164
	165	r->a = cpu_to_le64((a << 8) \| (b >> 56));
	166	r->b = cpu_to_le64((b << 8) ^ _tt);
	167	}
	168	EXPORT_SYMBOL(gf128mul_x8_ble);
	169
c494e070 RS	170	void gf128mul_lle(be128 r, const be128 b)
	171	{
	172	be128 p[8];
	173	int i;
	174
	175	p[0] = *r;
	176	for (i = 0; i < 7; ++i)
	177	gf128mul_x_lle(&p[i + 1], &p[i]);
	178
62542663	179	memset(r, 0, sizeof(*r));
c494e070 RS	180	for (i = 0;;) {
	181	u8 ch = ((u8 *)b)[15 - i];
	182
	183	if (ch & 0x80)
	184	be128_xor(r, r, &p[0]);
	185	if (ch & 0x40)
	186	be128_xor(r, r, &p[1]);
	187	if (ch & 0x20)
	188	be128_xor(r, r, &p[2]);
	189	if (ch & 0x10)
	190	be128_xor(r, r, &p[3]);
	191	if (ch & 0x08)
	192	be128_xor(r, r, &p[4]);
	193	if (ch & 0x04)
	194	be128_xor(r, r, &p[5]);
	195	if (ch & 0x02)
	196	be128_xor(r, r, &p[6]);
	197	if (ch & 0x01)
	198	be128_xor(r, r, &p[7]);
	199
	200	if (++i >= 16)
	201	break;
	202
	203	gf128mul_x8_lle(r);
	204	}
	205	}
	206	EXPORT_SYMBOL(gf128mul_lle);
	207
	208	void gf128mul_bbe(be128 r, const be128 b)
	209	{
	210	be128 p[8];
	211	int i;
	212
	213	p[0] = *r;
	214	for (i = 0; i < 7; ++i)
	215	gf128mul_x_bbe(&p[i + 1], &p[i]);
	216
62542663	217	memset(r, 0, sizeof(*r));
c494e070 RS	218	for (i = 0;;) {
	219	u8 ch = ((u8 *)b)[i];
	220
	221	if (ch & 0x80)
	222	be128_xor(r, r, &p[7]);
	223	if (ch & 0x40)
	224	be128_xor(r, r, &p[6]);
	225	if (ch & 0x20)
	226	be128_xor(r, r, &p[5]);
	227	if (ch & 0x10)
	228	be128_xor(r, r, &p[4]);
	229	if (ch & 0x08)
	230	be128_xor(r, r, &p[3]);
	231	if (ch & 0x04)
	232	be128_xor(r, r, &p[2]);
	233	if (ch & 0x02)
	234	be128_xor(r, r, &p[1]);
	235	if (ch & 0x01)
	236	be128_xor(r, r, &p[0]);
	237
	238	if (++i >= 16)
	239	break;
	240
	241	gf128mul_x8_bbe(r);
	242	}
	243	}
	244	EXPORT_SYMBOL(gf128mul_bbe);
	245
	246	/* This version uses 64k bytes of table space.
	247	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	248	value in GF(2^128). If we consider a GF(2^128) value in
c494e070 RS	249	the buffer's lowest byte, we can construct a table of
	250	the 256 16 byte values that result from the 256 values
	251	of this byte. This requires 4096 bytes. But we also
	252	need tables for each of the 16 higher bytes in the
	253	buffer as well, which makes 64 kbytes in total.
	254	*/
	255	/* additional explanation
	256	* t[0][BYTE] contains g*BYTE
	257	* t[1][BYTE] contains gx^8BYTE
	258	* ..
	259	* t[15][BYTE] contains gx^120BYTE */
c494e070 RS	260	struct gf128mul_64k gf128mul_init_64k_bbe(const be128 g)
	261	{
	262	struct gf128mul_64k *t;
	263	int i, j, k;
	264
	265	t = kzalloc(sizeof(*t), GFP_KERNEL);
	266	if (!t)
	267	goto out;
	268
	269	for (i = 0; i < 16; i++) {
	270	t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
	271	if (!t->t[i]) {
	272	gf128mul_free_64k(t);
	273	t = NULL;
	274	goto out;
	275	}
	276	}
	277
	278	t->t[0]->t[1] = *g;
	279	for (j = 1; j <= 64; j <<= 1)
	280	gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
	281
	282	for (i = 0;;) {
	283	for (j = 2; j < 256; j += j)
	284	for (k = 1; k < j; ++k)
	285	be128_xor(&t->t[i]->t[j + k],
	286	&t->t[i]->t[j], &t->t[i]->t[k]);
	287
	288	if (++i >= 16)
	289	break;
	290
	291	for (j = 128; j > 0; j >>= 1) {
	292	t->t[i]->t[j] = t->t[i - 1]->t[j];
	293	gf128mul_x8_bbe(&t->t[i]->t[j]);
	294	}
	295	}
	296
	297	out:
	298	return t;
	299	}
	300	EXPORT_SYMBOL(gf128mul_init_64k_bbe);
	301
	302	void gf128mul_free_64k(struct gf128mul_64k *t)
	303	{
	304	int i;
	305
	306	for (i = 0; i < 16; i++)
75aa0a7c AC	307	kzfree(t->t[i]);
75aa0a7c AC	308	kzfree(t);
c494e070 RS	309	}
	310	EXPORT_SYMBOL(gf128mul_free_64k);
	311
3ea996dd	312	void gf128mul_64k_bbe(be128 a, const struct gf128mul_64k t)
c494e070 RS	313	{
	314	u8 ap = (u8 )a;
	315	be128 r[1];
	316	int i;
	317
	318	*r = t->t[0]->t[ap[15]];
	319	for (i = 1; i < 16; ++i)
	320	be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
	321	a = r;
	322	}
	323	EXPORT_SYMBOL(gf128mul_64k_bbe);
	324
	325	/* This version uses 4k bytes of table space.
	326	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	327	value in GF(2^128). If we consider a GF(2^128) value in a
c494e070 RS	328	single byte, we can construct a table of the 256 16 byte
	329	values that result from the 256 values of this byte.
	330	This requires 4096 bytes. If we take the highest byte in
	331	the buffer and use this table to get the result, we then
	332	have to multiply by x^120 to get the final value. For the
	333	next highest byte the result has to be multiplied by x^112
	334	and so on. But we can do this by accumulating the result
	335	in an accumulator starting with the result for the top
	336	byte. We repeatedly multiply the accumulator value by
	337	x^8 and then add in (i.e. xor) the 16 bytes of the next
	338	lower byte in the buffer, stopping when we reach the
	339	lowest byte. This requires a 4096 byte table.
	340	*/
	341	struct gf128mul_4k gf128mul_init_4k_lle(const be128 g)
	342	{
	343	struct gf128mul_4k *t;
	344	int j, k;
	345
	346	t = kzalloc(sizeof(*t), GFP_KERNEL);
	347	if (!t)
	348	goto out;
	349
	350	t->t[128] = *g;
	351	for (j = 64; j > 0; j >>= 1)
	352	gf128mul_x_lle(&t->t[j], &t->t[j+j]);
	353
	354	for (j = 2; j < 256; j += j)
	355	for (k = 1; k < j; ++k)
	356	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	357
	358	out:
	359	return t;
	360	}
	361	EXPORT_SYMBOL(gf128mul_init_4k_lle);
	362
	363	struct gf128mul_4k gf128mul_init_4k_bbe(const be128 g)
	364	{
	365	struct gf128mul_4k *t;
	366	int j, k;
	367
	368	t = kzalloc(sizeof(*t), GFP_KERNEL);
	369	if (!t)
	370	goto out;
	371
	372	t->t[1] = *g;
	373	for (j = 1; j <= 64; j <<= 1)
	374	gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
	375
	376	for (j = 2; j < 256; j += j)
	377	for (k = 1; k < j; ++k)
	378	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	379
	380	out:
	381	return t;
	382	}
	383	EXPORT_SYMBOL(gf128mul_init_4k_bbe);
	384
3ea996dd	385	void gf128mul_4k_lle(be128 a, const struct gf128mul_4k t)
c494e070 RS	386	{
	387	u8 ap = (u8 )a;
	388	be128 r[1];
	389	int i = 15;
	390
	391	*r = t->t[ap[15]];
	392	while (i--) {
	393	gf128mul_x8_lle(r);
	394	be128_xor(r, r, &t->t[ap[i]]);
	395	}
	396	a = r;
	397	}
	398	EXPORT_SYMBOL(gf128mul_4k_lle);
	399
3ea996dd	400	void gf128mul_4k_bbe(be128 a, const struct gf128mul_4k t)
c494e070 RS	401	{
	402	u8 ap = (u8 )a;
	403	be128 r[1];
	404	int i = 0;
	405
	406	*r = t->t[ap[0]];
	407	while (++i < 16) {
	408	gf128mul_x8_bbe(r);
	409	be128_xor(r, r, &t->t[ap[i]]);
	410	}
	411	a = r;
	412	}
	413	EXPORT_SYMBOL(gf128mul_4k_bbe);
	414
	415	MODULE_LICENSE("GPL");
	416	MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");