[thirdparty/nettle.git] / serpent-encrypt.c

/* serpent-encrypt.c

   The serpent block cipher.

   For more details on this algorithm, see the Serpent website at
   http://www.cl.cam.ac.uk/~rja14/serpent.html

   Copyright (C) 2011  Niels Möller
   Copyright (C) 2010, 2011  Simon Josefsson
   Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
*/

/* This file is derived from cipher/serpent.c in Libgcrypt v1.4.6.
   The adaption to Nettle was made by Simon Josefsson on 2010-12-07
   with final touches on 2011-05-30.  Changes include replacing
   libgcrypt with nettle in the license template, renaming
   serpent_context to serpent_ctx, renaming u32 to uint32_t, removing
   libgcrypt stubs and selftests, modifying entry function prototypes,
   using FOR_BLOCKS to iterate through data in encrypt/decrypt, using
   LE_READ_UINT32 and LE_WRITE_UINT32 to access data in
   encrypt/decrypt, and running indent on the code. */

#if HAVE_CONFIG_H
#include "config.h"
#endif

#include <assert.h>
#include <limits.h>

#include "serpent.h"

#include "macros.h"
#include "serpent-internal.h"

/* These are the S-Boxes of Serpent.  They are copied from Serpents
   reference implementation (the optimized one, contained in
   `floppy2') and are therefore:

     Copyright (C) 1998 Ross Anderson, Eli Biham, Lars Knudsen.

  To quote the Serpent homepage
  (http://www.cl.cam.ac.uk/~rja14/serpent.html):

  "Serpent is now completely in the public domain, and we impose no
   restrictions on its use.  This was announced on the 21st August at
   the First AES Candidate Conference. The optimised implementations
   in the submission package are now under the GNU PUBLIC LICENSE
   (GPL), although some comments in the code still say otherwise. You
   are welcome to use Serpent for any application."  */

/* S0:  3  8 15  1 10  6  5 11 14 13  4  2  7  0  9 12 */
/* Could easily let y0, y1 overlap with x0, x1, and possibly also x2 and y2 */
#define SBOX0(x0, x1, x2, x3, y0, y1, y2, y3)	\
  do {							\
    y3  = x1 ^ x2;					\
    y0  = x0 | x3;					\
    y1  = x0 ^ x1;					\
    y3 ^= y0;						\
    y2  = x2 | y3;					\
    x0 ^= x3;						\
    y2 &= x3;						\
    x3 ^= x2;						\
    x2 |= x1;						\
    y0  = y1 & x2;					\
    y2 ^= y0;						\
    y0 &= y2;						\
    y0 ^= x2;						\
    x1 &= x0;						\
    y0 ^= x0;						\
    y0  = ~ y0;						\
    y1  = y0 ^ x1;					\
    y1 ^= x3;						\
  } while (0)

/* FIXME: Arrange for some overlap between inputs and outputs? */
/* S1: 15 12  2  7  9  0  5 10  1 11 14  8  6 13  3  4 */
/* Original single-assignment form:
   
     t01 = x0  | x3;
     t02 = x2  ^ x3;
     t03 =     ~ x1;
     t04 = x0  ^ x2;
     t05 = x0  | t03;
     t06 = x3  & t04;
     t07 = t01 & t02;
     t08 = x1  | t06;
     y2  = t02 ^ t05;
     t10 = t07 ^ t08;
     t11 = t01 ^ t10;
     t12 = y2  ^ t11;
     t13 = x1  & x3;
     y3  =     ~ t10;
     y1  = t13 ^ t12;
     t16 = t10 | y1;
     t17 = t05 & t16;
     y0  = x2  ^ t17;
*/
#define SBOX1(x0, x1, x2, x3, y0, y1, y2, y3)		\
  do {							\
    y1  = x0 | x3;					\
    y2  = x2 ^ x3;					\
    y0  = ~ x1;						\
    y3  = x0 ^ x2;					\
    y0 |= x0;						\
    y3 &= x3;						\
    x0 = y1 & y2;					\
    y3 |= x1;						\
    y2 ^= y0;						\
    y3 ^= x0;						\
    x0  = y1 ^ y3;					\
    x0 ^= y2;						\
    y1  = x1 & x3;					\
    y1 ^= x0;						\
    x3  = y1 | y3;					\
    y3  = ~ y3;						\
    y0 &= x3;						\
    y0 ^= x2;						\
  } while (0)

/* FIXME: Arrange for some overlap between inputs and outputs? */
/* S2:  8  6  7  9  3 12 10 15 13  1 14  4  0 11  5  2 */
#define SBOX2(x0, x1, x2, x3, y0, y1, y2, y3)	\
  do {							\
    y2  = x0 | x2;					\
    y1  = x0 ^ x1;					\
    y3  = x3 ^ y2;					\
    y0  = y1 ^ y3;					\
    x3 |= x0;						\
    x2 ^= y0;						\
    x0  = x1 ^ x2;					\
    x2 |= x1;						\
    x0 &= y2;						\
    y3 ^= x2;						\
    y1 |= y3;						\
    y1 ^= x0;						\
    y2  = y3 ^ y1;					\
    y2 ^= x1;						\
    y3  = ~ y3;						\
    y2 ^= x3;						\
  } while (0)

/* S3:  0 15 11  8 12  9  6  3 13  1  2  4 10  7  5 14 */
/* Original single-assignment form:
   
     t01 = x0  ^ x2;
     t02 = x0  | x3;
     t03 = x0  & x3;
     t04 = t01 & t02;
     t05 = x1  | t03;
     t06 = x0  & x1;
     t07 = x3  ^ t04;
     t08 = x2  | t06;
     t09 = x1  ^ t07;
     t10 = x3  & t05;
     t11 = t02 ^ t10;
     y3  = t08 ^ t09;
     t13 = x3  | y3;
     t14 = x0  | t07;
     t15 = x1  & t13;
     y2  = t08 ^ t11;
     y0  = t14 ^ t15;
     y1  = t05 ^ t04;
*/
#define SBOX3(x0, x1, x2, x3, y0, y1, y2, y3)	\
  do {							\
    y1  = x0 ^ x2;					\
    y0  = x0 | x3;					\
    y3  = x0 & x3;					\
    y1 &= y0;						\
    y3 |= x1;						\
    y2  = x0 & x1;					\
    y2 |= x2;						\
    x2  = x3 ^ y1;					\
    y1 ^= y3;						\
    x0 |= x2;						\
    x2 ^= x1;						\
    y3 &= x3;						\
    y0 ^= y3;						\
    y3  = y2 ^ x2;					\
    y2 ^= y0;						\
    x3 |= y3;						\
    x1 &= x3;						\
    y0  = x0 ^ x1;					\
  } while (0)


/* S4:  1 15  8  3 12  0 11  6  2  5  4 10  9 14  7 13 */
/* Original single-assignment form:
    t01 = x0  | x1;
    t02 = x1  | x2;
    t03 = x0  ^ t02;
    t04 = x1  ^ x3;
    t05 = x3  | t03;
    t06 = x3  & t01;
    y3  = t03 ^ t06;
    t08 = y3  & t04;
    t09 = t04 & t05;
    t10 = x2  ^ t06;
    t11 = x1  & x2;
    t12 = t04 ^ t08;
    t13 = t11 | t03;
    t14 = t10 ^ t09;
    t15 = x0  & t05;
    t16 = t11 | t12;
    y2  = t13 ^ t08;
    y1  = t15 ^ t16;
    y0  =     ~ t14;
*/
#define SBOX4(x0, x1, x2, x3, y0, y1, y2, y3)	\
  do {							\
    y3  = x0 | x1;					\
    y2  = x1 | x2;					\
    y2 ^= x0;						\
    y3 &= x3;						\
    y0  = x1 ^ x3;					\
    x3 |= y2;						\
    x0 &= x3;						\
    x1 &= x2;						\
    x2 ^= y3;						\
    y3 ^= y2;						\
    y2 |= x1;						\
    y1  = y3 & y0;					\
    y2 ^= y1;						\
    y1 ^= y0;						\
    y1 |= x1;						\
    y1 ^= x0;						\
    y0 &= x3;						\
    y0 ^= x2;						\
    y0  = ~y0;						\
  } while (0)

/* S5: 15  5  2 11  4 10  9 12  0  3 14  8 13  6  7  1 */
/* Original single-assignment form:
    t01 = x1  ^ x3;
    t02 = x1  | x3;
    t03 = x0  & t01;
    t04 = x2  ^ t02;
    t05 = t03 ^ t04;
    y0  =     ~ t05;
    t07 = x0  ^ t01;
    t08 = x3  | y0;
    t09 = x1  | t05;
    t10 = x3  ^ t08;
    t11 = x1  | t07;
    t12 = t03 | y0;
    t13 = t07 | t10;
    t14 = t01 ^ t11;
    y2  = t09 ^ t13;
    y1  = t07 ^ t08;
    y3  = t12 ^ t14;
*/
#define SBOX5(x0, x1, x2, x3, y0, y1, y2, y3)	\
  do {							\
    y0  = x1 | x3;					\
    y0 ^= x2;						\
    x2  = x1 ^ x3;					\
    y2  = x0 ^ x2;					\
    x0 &= x2;						\
    y0 ^= x0;						\
    y3  = x1 | y2;					\
    x1 |= y0;						\
    y0  = ~y0;						\
    x0 |= y0;						\
    y3 ^= x2;						\
    y3 ^= x0;						\
    y1  = x3 | y0;					\
    x3 ^= y1;						\
    y1 ^= y2;						\
    y2 |= x3;						\
    y2 ^= x1;						\
  } while (0)

/* S6:  7  2 12  5  8  4  6 11 14  9  1 15 13  3 10  0 */
/* Original single-assignment form:
    t01 = x0  & x3;
    t02 = x1  ^ x2;
    t03 = x0  ^ x3;
    t04 = t01 ^ t02;
    t05 = x1  | x2;
    y1  =     ~ t04;
    t07 = t03 & t05;
    t08 = x1  & y1;
    t09 = x0  | x2;
    t10 = t07 ^ t08;
    t11 = x1  | x3;
    t12 = x2  ^ t11;
    t13 = t09 ^ t10;
    y2  =     ~ t13;
    t15 = y1  & t03;
    y3  = t12 ^ t07;
    t17 = x0  ^ x1;
    t18 = y2  ^ t15;
    y0  = t17 ^ t18;
*/
#define SBOX6(x0, x1, x2, x3, y0, y1, y2, y3)	\
  do {							\
    y0  = x0 ^ x3;					\
    y1  = x0 & x3;					\
    y2  = x0 | x2;					\
    x3 |= x1;						\
    x3 ^= x2;						\
    x0 ^= x1;						\
    y3  = x1 | x2;					\
    x2 ^= x1;						\
    y3 &= y0;						\
    y1 ^= x2;						\
    y1  = ~y1;						\
    y0 &= y1;						\
    x1 &= y1;						\
    x1 ^= y3;						\
    y3 ^= x3;						\
    y2 ^= x1;						\
    y2  = ~y2;						\
    y0 ^= y2;						\
    y0 ^= x0;						\
  } while (0)

/* S7:  1 13 15  0 14  8  2 11  7  4 12 10  9  3  5  6 */
/* Original single-assignment form:
    t01 = x0  & x2;
    t02 =     ~ x3;
    t03 = x0  & t02;
    t04 = x1  | t01;
    t05 = x0  & x1;
    t06 = x2  ^ t04;
    y3  = t03 ^ t06;
    t08 = x2  | y3;
    t09 = x3  | t05;
    t10 = x0  ^ t08;
    t11 = t04 & y3;
    y1  = t09 ^ t10;
    t13 = x1  ^ y1;
    t14 = t01 ^ y1;
    t15 = x2  ^ t05;
    t16 = t11 | t13;
    t17 = t02 | t14;
    y0  = t15 ^ t17;
    y2  = x0  ^ t16;
*/
/* It appears impossible to do this with only 8 registers. We
   recompute t02, and t04 (if we have spare registers, hopefully the
   compiler can recognize them as common subexpressions). */
#define SBOX7(x0, x1, x2, x3, y0, y1, y2, y3)	\
  do {							\
    y0  = x0 & x2;					\
    y3  = x1 | y0; /* t04 */				\
    y3 ^= x2;						\
    y1  = ~x3;     /* t02 */				\
    y1 &= x0;						\
    y3 ^= y1;						\
    y1  = x2 | y3;					\
    y1 ^= x0;						\
    y2  = x0 & x1;					\
    x2 ^= y2;						\
    y2 |= x3;						\
    y1 ^= y2;						\
    y2  = x1 | y0; /* t04 */				\
    y2 &= y3;						\
    x1 ^= y1;						\
    y2 |= x1;						\
    y2 ^= x0;						\
    y0 ^= y1;						\
    x3  = ~x3;     /* t02 */				\
    y0 |= x3;						\
    y0 ^= x2;						\
  } while (0)

/* In-place linear transformation.  */
#define LINEAR_TRANSFORMATION(x0,x1,x2,x3)		 \
  do {                                                   \
    x0 = ROTL32 (13, x0);                    \
    x2 = ROTL32 (3, x2);                     \
    x1 = x1 ^ x0 ^ x2;        \
    x3 = x3 ^ x2 ^ (x0 << 3); \
    x1 = ROTL32 (1, x1);                     \
    x3 = ROTL32 (7, x3);                     \
    x0 = x0 ^ x1 ^ x3;        \
    x2 = x2 ^ x3 ^ (x1 << 7); \
    x0 = ROTL32 (5, x0);                     \
    x2 = ROTL32 (22, x2);                    \
  } while (0)

/* Round inputs are x0,x1,x2,x3 (destroyed), and round outputs are
   y0,y1,y2,y3. */
#define ROUND(which, subkey, x0,x1,x2,x3, y0,y1,y2,y3) \
  do {						       \
    KEYXOR(x0,x1,x2,x3, subkey);		       \
    SBOX##which(x0,x1,x2,x3, y0,y1,y2,y3);	       \
    LINEAR_TRANSFORMATION(y0,y1,y2,y3);		       \
  } while (0)

#if HAVE_NATIVE_64_BIT

#define LINEAR_TRANSFORMATION64(x0,x1,x2,x3)		 \
  do {                                                   \
    x0 = DROTL32 (13, x0);                    \
    x2 = DROTL32 (3, x2);                     \
    x1 = x1 ^ x0 ^ x2;        \
    x3 = x3 ^ x2 ^ DRSHIFT32(3, x0);	    \
    x1 = DROTL32 (1, x1);                     \
    x3 = DROTL32 (7, x3);                     \
    x0 = x0 ^ x1 ^ x3;        \
    x2 = x2 ^ x3 ^ DRSHIFT32(7, x1);	    \
    x0 = DROTL32 (5, x0);                     \
    x2 = DROTL32 (22, x2);                    \
  } while (0)

#define ROUND64(which, subkey, x0,x1,x2,x3, y0,y1,y2,y3) \
  do {						       \
    KEYXOR64(x0,x1,x2,x3, subkey);		       \
    SBOX##which(x0,x1,x2,x3, y0,y1,y2,y3);	       \
    LINEAR_TRANSFORMATION64(y0,y1,y2,y3);		       \
  } while (0)

#endif /* HAVE_NATIVE_64_BIT */

void
serpent_encrypt (const struct serpent_ctx *ctx,
		 size_t length, uint8_t * dst, const uint8_t * src)
{
  assert( !(length % SERPENT_BLOCK_SIZE));
  
#if HAVE_NATIVE_64_BIT
  if (length & SERPENT_BLOCK_SIZE)
#else
  while (length >= SERPENT_BLOCK_SIZE)
#endif
    {
      uint32_t x0,x1,x2,x3, y0,y1,y2,y3;
      unsigned k;

      x0 = LE_READ_UINT32 (src);
      x1 = LE_READ_UINT32 (src + 4);
      x2 = LE_READ_UINT32 (src + 8);
      x3 = LE_READ_UINT32 (src + 12);

      for (k = 0; ; k += 8)
	{
	  ROUND (0, ctx->keys[k+0], x0,x1,x2,x3, y0,y1,y2,y3);
	  ROUND (1, ctx->keys[k+1], y0,y1,y2,y3, x0,x1,x2,x3);
	  ROUND (2, ctx->keys[k+2], x0,x1,x2,x3, y0,y1,y2,y3);
	  ROUND (3, ctx->keys[k+3], y0,y1,y2,y3, x0,x1,x2,x3);
	  ROUND (4, ctx->keys[k+4], x0,x1,x2,x3, y0,y1,y2,y3);
	  ROUND (5, ctx->keys[k+5], y0,y1,y2,y3, x0,x1,x2,x3);
	  ROUND (6, ctx->keys[k+6], x0,x1,x2,x3, y0,y1,y2,y3);
	  if (k == 24)
	    break;
	  ROUND (7, ctx->keys[k+7], y0,y1,y2,y3, x0,x1,x2,x3);
	}

      /* Special final round, using two subkeys. */
      KEYXOR (y0,y1,y2,y3, ctx->keys[31]);
      SBOX7 (y0,y1,y2,y3, x0,x1,x2,x3);
      KEYXOR (x0,x1,x2,x3, ctx->keys[32]);
    
      LE_WRITE_UINT32 (dst, x0);
      LE_WRITE_UINT32 (dst + 4, x1);
      LE_WRITE_UINT32 (dst + 8, x2);
      LE_WRITE_UINT32 (dst + 12, x3);

      src += SERPENT_BLOCK_SIZE;
      dst += SERPENT_BLOCK_SIZE;
      length -= SERPENT_BLOCK_SIZE;
    }
#if HAVE_NATIVE_64_BIT
  FOR_BLOCKS(length, dst, src, 2*SERPENT_BLOCK_SIZE)
    {
      uint64_t x0,x1,x2,x3, y0,y1,y2,y3;
      unsigned k;

      x0 = LE_READ_UINT32 (src);
      x1 = LE_READ_UINT32 (src + 4);
      x2 = LE_READ_UINT32 (src + 8);
      x3 = LE_READ_UINT32 (src + 12);

      x0 <<= 32; x0 |= LE_READ_UINT32 (src + 16);
      x1 <<= 32; x1 |= LE_READ_UINT32 (src + 20);
      x2 <<= 32; x2 |= LE_READ_UINT32 (src + 24);
      x3 <<= 32; x3 |= LE_READ_UINT32 (src + 28);

      for (k = 0; ; k += 8)
	{
	  ROUND64 (0, ctx->keys[k+0], x0,x1,x2,x3, y0,y1,y2,y3);
	  ROUND64 (1, ctx->keys[k+1], y0,y1,y2,y3, x0,x1,x2,x3);
	  ROUND64 (2, ctx->keys[k+2], x0,x1,x2,x3, y0,y1,y2,y3);
	  ROUND64 (3, ctx->keys[k+3], y0,y1,y2,y3, x0,x1,x2,x3);
	  ROUND64 (4, ctx->keys[k+4], x0,x1,x2,x3, y0,y1,y2,y3);
	  ROUND64 (5, ctx->keys[k+5], y0,y1,y2,y3, x0,x1,x2,x3);
	  ROUND64 (6, ctx->keys[k+6], x0,x1,x2,x3, y0,y1,y2,y3);
	  if (k == 24)
	    break;
	  ROUND64 (7, ctx->keys[k+7], y0,y1,y2,y3, x0,x1,x2,x3);
	}

      /* Special final round, using two subkeys. */
      KEYXOR64 (y0,y1,y2,y3, ctx->keys[31]);
      SBOX7 (y0,y1,y2,y3, x0,x1,x2,x3);
      KEYXOR64 (x0,x1,x2,x3, ctx->keys[32]);
    
      LE_WRITE_UINT32 (dst + 16, x0);
      LE_WRITE_UINT32 (dst + 20, x1);
      LE_WRITE_UINT32 (dst + 24, x2);
      LE_WRITE_UINT32 (dst + 28, x3);
      x0 >>= 32; LE_WRITE_UINT32 (dst, x0);
      x1 >>= 32; LE_WRITE_UINT32 (dst + 4, x1);
      x2 >>= 32; LE_WRITE_UINT32 (dst + 8, x2);
      x3 >>= 32; LE_WRITE_UINT32 (dst + 12, x3);
    }
#endif /* HAVE_NATIVE_64_BIT */
}
Commit	Line	Data
00a6c2d1	1	/* serpent-encrypt.c
90112edb NM	2
	3	The serpent block cipher.
	4
	5	For more details on this algorithm, see the Serpent website at
	6	http://www.cl.cam.ac.uk/~rja14/serpent.html
	7
	8	Copyright (C) 2011 Niels Möller
	9	Copyright (C) 2010, 2011 Simon Josefsson
	10	Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
	11
	12	This file is part of GNU Nettle.
	13
	14	GNU Nettle is free software: you can redistribute it and/or
	15	modify it under the terms of either:
	16
	17	* the GNU Lesser General Public License as published by the Free
	18	Software Foundation; either version 3 of the License, or (at your
	19	option) any later version.
	20
	21	or
	22
	23	* the GNU General Public License as published by the Free
	24	Software Foundation; either version 2 of the License, or (at your
	25	option) any later version.
	26
	27	or both in parallel, as here.
	28
	29	GNU Nettle is distributed in the hope that it will be useful,
	30	but WITHOUT ANY WARRANTY; without even the implied warranty of
	31	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	32	General Public License for more details.
	33
	34	You should have received copies of the GNU General Public License and
	35	the GNU Lesser General Public License along with this program. If
	36	not, see http://www.gnu.org/licenses/.
	37	*/
00a6c2d1 NM	38
	39	/* This file is derived from cipher/serpent.c in Libgcrypt v1.4.6.
	40	The adaption to Nettle was made by Simon Josefsson on 2010-12-07
	41	with final touches on 2011-05-30. Changes include replacing
	42	libgcrypt with nettle in the license template, renaming
	43	serpent_context to serpent_ctx, renaming u32 to uint32_t, removing
	44	libgcrypt stubs and selftests, modifying entry function prototypes,
	45	using FOR_BLOCKS to iterate through data in encrypt/decrypt, using
	46	LE_READ_UINT32 and LE_WRITE_UINT32 to access data in
	47	encrypt/decrypt, and running indent on the code. */
	48
	49	#if HAVE_CONFIG_H
	50	#include "config.h"
	51	#endif
	52
	53	#include <assert.h>
	54	#include <limits.h>
	55
	56	#include "serpent.h"
	57
	58	#include "macros.h"
	59	#include "serpent-internal.h"
	60
	61	/* These are the S-Boxes of Serpent. They are copied from Serpents
	62	reference implementation (the optimized one, contained in
	63	`floppy2') and are therefore:
	64
	65	Copyright (C) 1998 Ross Anderson, Eli Biham, Lars Knudsen.
	66
	67	To quote the Serpent homepage
	68	(http://www.cl.cam.ac.uk/~rja14/serpent.html):
	69
	70	"Serpent is now completely in the public domain, and we impose no
	71	restrictions on its use. This was announced on the 21st August at
	72	the First AES Candidate Conference. The optimised implementations
	73	in the submission package are now under the GNU PUBLIC LICENSE
	74	(GPL), although some comments in the code still say otherwise. You
	75	are welcome to use Serpent for any application." */
	76
00a6c2d1	77	/* S0: 3 8 15 1 10 6 5 11 14 13 4 2 7 0 9 12 */
66b7bda5	78	/* Could easily let y0, y1 overlap with x0, x1, and possibly also x2 and y2 */
3c5e8649	79	#define SBOX0(x0, x1, x2, x3, y0, y1, y2, y3) \
66b7bda5 NM	80	do { \
	81	y3 = x1 ^ x2; \
	82	y0 = x0 \| x3; \
	83	y1 = x0 ^ x1; \
	84	y3 ^= y0; \
	85	y2 = x2 \| y3; \
	86	x0 ^= x3; \
	87	y2 &= x3; \
	88	x3 ^= x2; \
	89	x2 \|= x1; \
	90	y0 = y1 & x2; \
	91	y2 ^= y0; \
	92	y0 &= y2; \
	93	y0 ^= x2; \
	94	x1 &= x0; \
	95	y0 ^= x0; \
	96	y0 = ~ y0; \
	97	y1 = y0 ^ x1; \
	98	y1 ^= x3; \
00a6c2d1 NM	99	} while (0)
00a6c2d1 NM	100
2c32cfc7	101	/* FIXME: Arrange for some overlap between inputs and outputs? */
00a6c2d1	102	/* S1: 15 12 2 7 9 0 5 10 1 11 14 8 6 13 3 4 */
d9a172fd NM	103	/* Original single-assignment form:
d9a172fd NM	104
fe7695d1 NM	105	t01 = x0 \| x3;
	106	t02 = x2 ^ x3;
	107	t03 = ~ x1;
	108	t04 = x0 ^ x2;
	109	t05 = x0 \| t03;
	110	t06 = x3 & t04;
	111	t07 = t01 & t02;
	112	t08 = x1 \| t06;
	113	y2 = t02 ^ t05;
	114	t10 = t07 ^ t08;
	115	t11 = t01 ^ t10;
	116	t12 = y2 ^ t11;
	117	t13 = x1 & x3;
d9a172fd	118	y3 = ~ t10;
fe7695d1 NM	119	y1 = t13 ^ t12;
	120	t16 = t10 \| y1;
	121	t17 = t05 & t16;
	122	y0 = x2 ^ t17;
d9a172fd NM	123	*/
d9a172fd NM	124	#define SBOX1(x0, x1, x2, x3, y0, y1, y2, y3) \
dae705f5 NM	125	do { \
	126	y1 = x0 \| x3; \
	127	y2 = x2 ^ x3; \
	128	y0 = ~ x1; \
61c1cfc3	129	y3 = x0 ^ x2; \
dae705f5 NM	130	y0 \|= x0; \
	131	y3 &= x3; \
	132	x0 = y1 & y2; \
	133	y3 \|= x1; \
	134	y2 ^= y0; \
	135	y3 ^= x0; \
	136	x0 = y1 ^ y3; \
	137	x0 ^= y2; \
61c1cfc3	138	y1 = x1 & x3; \
dae705f5	139	y1 ^= x0; \
d9a172fd	140	x3 = y1 \| y3; \
dae705f5 NM	141	y3 = ~ y3; \
	142	y0 &= x3; \
	143	y0 ^= x2; \
00a6c2d1 NM	144	} while (0)
00a6c2d1 NM	145
2c32cfc7	146	/* FIXME: Arrange for some overlap between inputs and outputs? */
00a6c2d1	147	/* S2: 8 6 7 9 3 12 10 15 13 1 14 4 0 11 5 2 */
3c5e8649	148	#define SBOX2(x0, x1, x2, x3, y0, y1, y2, y3) \
2c32cfc7 NM	149	do { \
	150	y2 = x0 \| x2; \
	151	y1 = x0 ^ x1; \
	152	y3 = x3 ^ y2; \
	153	y0 = y1 ^ y3; \
	154	x3 \|= x0; \
	155	x2 ^= y0; \
	156	x0 = x1 ^ x2; \
	157	x2 \|= x1; \
	158	x0 &= y2; \
	159	y3 ^= x2; \
	160	y1 \|= y3; \
	161	y1 ^= x0; \
	162	y2 = y3 ^ y1; \
	163	y2 ^= x1; \
	164	y3 = ~ y3; \
	165	y2 ^= x3; \
00a6c2d1 NM	166	} while (0)
	167
	168	/* S3: 0 15 11 8 12 9 6 3 13 1 2 4 10 7 5 14 */
2b39cfeb NM	169	/* Original single-assignment form:
	170
	171	t01 = x0 ^ x2;
	172	t02 = x0 \| x3;
	173	t03 = x0 & x3;
	174	t04 = t01 & t02;
	175	t05 = x1 \| t03;
	176	t06 = x0 & x1;
	177	t07 = x3 ^ t04;
	178	t08 = x2 \| t06;
	179	t09 = x1 ^ t07;
	180	t10 = x3 & t05;
	181	t11 = t02 ^ t10;
	182	y3 = t08 ^ t09;
	183	t13 = x3 \| y3;
	184	t14 = x0 \| t07;
	185	t15 = x1 & t13;
	186	y2 = t08 ^ t11;
	187	y0 = t14 ^ t15;
	188	y1 = t05 ^ t04;
	189	*/
3c5e8649	190	#define SBOX3(x0, x1, x2, x3, y0, y1, y2, y3) \
d778b2da	191	do { \
61c1cfc3	192	y1 = x0 ^ x2; \
d778b2da NM	193	y0 = x0 \| x3; \
d778b2da NM	194	y3 = x0 & x3; \
61c1cfc3	195	y1 &= y0; \
d778b2da	196	y3 \|= x1; \
61c1cfc3	197	y2 = x0 & x1; \
d778b2da	198	y2 \|= x2; \
61c1cfc3	199	x2 = x3 ^ y1; \
d778b2da NM	200	y1 ^= y3; \
	201	x0 \|= x2; \
	202	x2 ^= x1; \
	203	y3 &= x3; \
	204	y0 ^= y3; \
	205	y3 = y2 ^ x2; \
	206	y2 ^= y0; \
	207	x3 \|= y3; \
	208	x1 &= x3; \
	209	y0 = x0 ^ x1; \
00a6c2d1 NM	210	} while (0)
00a6c2d1 NM	211
2b39cfeb	212
00a6c2d1	213	/* S4: 1 15 8 3 12 0 11 6 2 5 4 10 9 14 7 13 */
2b39cfeb NM	214	/* Original single-assignment form:
	215	t01 = x0 \| x1;
	216	t02 = x1 \| x2;
	217	t03 = x0 ^ t02;
	218	t04 = x1 ^ x3;
	219	t05 = x3 \| t03;
	220	t06 = x3 & t01;
	221	y3 = t03 ^ t06;
	222	t08 = y3 & t04;
	223	t09 = t04 & t05;
	224	t10 = x2 ^ t06;
	225	t11 = x1 & x2;
	226	t12 = t04 ^ t08;
	227	t13 = t11 \| t03;
	228	t14 = t10 ^ t09;
	229	t15 = x0 & t05;
	230	t16 = t11 \| t12;
	231	y2 = t13 ^ t08;
	232	y1 = t15 ^ t16;
	233	y0 = ~ t14;
	234	*/
3c5e8649	235	#define SBOX4(x0, x1, x2, x3, y0, y1, y2, y3) \
b44a75d0 NM	236	do { \
	237	y3 = x0 \| x1; \
	238	y2 = x1 \| x2; \
61c1cfc3	239	y2 ^= x0; \
b44a75d0 NM	240	y3 &= x3; \
	241	y0 = x1 ^ x3; \
	242	x3 \|= y2; \
	243	x0 &= x3; \
	244	x1 &= x2; \
	245	x2 ^= y3; \
	246	y3 ^= y2; \
	247	y2 \|= x1; \
	248	y1 = y3 & y0; \
	249	y2 ^= y1; \
	250	y1 ^= y0; \
	251	y1 \|= x1; \
	252	y1 ^= x0; \
	253	y0 &= x3; \
	254	y0 ^= x2; \
	255	y0 = ~y0; \
00a6c2d1 NM	256	} while (0)
	257
	258	/* S5: 15 5 2 11 4 10 9 12 0 3 14 8 13 6 7 1 */
9474fe42 NM	259	/* Original single-assignment form:
	260	t01 = x1 ^ x3;
	261	t02 = x1 \| x3;
	262	t03 = x0 & t01;
	263	t04 = x2 ^ t02;
	264	t05 = t03 ^ t04;
	265	y0 = ~ t05;
	266	t07 = x0 ^ t01;
	267	t08 = x3 \| y0;
	268	t09 = x1 \| t05;
	269	t10 = x3 ^ t08;
	270	t11 = x1 \| t07;
	271	t12 = t03 \| y0;
	272	t13 = t07 \| t10;
	273	t14 = t01 ^ t11;
	274	y2 = t09 ^ t13;
	275	y1 = t07 ^ t08;
	276	y3 = t12 ^ t14;
	277	*/
3c5e8649	278	#define SBOX5(x0, x1, x2, x3, y0, y1, y2, y3) \
9474fe42 NM	279	do { \
	280	y0 = x1 \| x3; \
	281	y0 ^= x2; \
	282	x2 = x1 ^ x3; \
	283	y2 = x0 ^ x2; \
	284	x0 &= x2; \
	285	y0 ^= x0; \
	286	y3 = x1 \| y2; \
	287	x1 \|= y0; \
	288	y0 = ~y0; \
	289	x0 \|= y0; \
	290	y3 ^= x2; \
	291	y3 ^= x0; \
	292	y1 = x3 \| y0; \
	293	x3 ^= y1; \
	294	y1 ^= y2; \
	295	y2 \|= x3; \
	296	y2 ^= x1; \
00a6c2d1 NM	297	} while (0)
	298
	299	/* S6: 7 2 12 5 8 4 6 11 14 9 1 15 13 3 10 0 */
3c5e8649 NM	300	/* Original single-assignment form:
	301	t01 = x0 & x3;
	302	t02 = x1 ^ x2;
	303	t03 = x0 ^ x3;
	304	t04 = t01 ^ t02;
	305	t05 = x1 \| x2;
	306	y1 = ~ t04;
	307	t07 = t03 & t05;
	308	t08 = x1 & y1;
	309	t09 = x0 \| x2;
	310	t10 = t07 ^ t08;
	311	t11 = x1 \| x3;
	312	t12 = x2 ^ t11;
	313	t13 = t09 ^ t10;
	314	y2 = ~ t13;
	315	t15 = y1 & t03;
	316	y3 = t12 ^ t07;
	317	t17 = x0 ^ x1;
	318	t18 = y2 ^ t15;
	319	y0 = t17 ^ t18;
	320	*/
	321	#define SBOX6(x0, x1, x2, x3, y0, y1, y2, y3) \
	322	do { \
	323	y0 = x0 ^ x3; \
	324	y1 = x0 & x3; \
	325	y2 = x0 \| x2; \
	326	x3 \|= x1; \
	327	x3 ^= x2; \
	328	x0 ^= x1; \
	329	y3 = x1 \| x2; \
	330	x2 ^= x1; \
	331	y3 &= y0; \
	332	y1 ^= x2; \
	333	y1 = ~y1; \
	334	y0 &= y1; \
	335	x1 &= y1; \
	336	x1 ^= y3; \
	337	y3 ^= x3; \
	338	y2 ^= x1; \
	339	y2 = ~y2; \
	340	y0 ^= y2; \
	341	y0 ^= x0; \
00a6c2d1 NM	342	} while (0)
	343
	344	/* S7: 1 13 15 0 14 8 2 11 7 4 12 10 9 3 5 6 */
3c5e8649 NM	345	/* Original single-assignment form:
	346	t01 = x0 & x2;
	347	t02 = ~ x3;
	348	t03 = x0 & t02;
	349	t04 = x1 \| t01;
	350	t05 = x0 & x1;
	351	t06 = x2 ^ t04;
	352	y3 = t03 ^ t06;
	353	t08 = x2 \| y3;
	354	t09 = x3 \| t05;
	355	t10 = x0 ^ t08;
	356	t11 = t04 & y3;
	357	y1 = t09 ^ t10;
	358	t13 = x1 ^ y1;
	359	t14 = t01 ^ y1;
	360	t15 = x2 ^ t05;
	361	t16 = t11 \| t13;
	362	t17 = t02 \| t14;
	363	y0 = t15 ^ t17;
	364	y2 = x0 ^ t16;
	365	*/
	366	/* It appears impossible to do this with only 8 registers. We
	367	recompute t02, and t04 (if we have spare registers, hopefully the
9946d3df	368	compiler can recognize them as common subexpressions). */
3c5e8649 NM	369	#define SBOX7(x0, x1, x2, x3, y0, y1, y2, y3) \
	370	do { \
	371	y0 = x0 & x2; \
	372	y3 = x1 \| y0; /* t04 */ \
	373	y3 ^= x2; \
	374	y1 = ~x3; /* t02 */ \
	375	y1 &= x0; \
	376	y3 ^= y1; \
	377	y1 = x2 \| y3; \
	378	y1 ^= x0; \
	379	y2 = x0 & x1; \
	380	x2 ^= y2; \
	381	y2 \|= x3; \
	382	y1 ^= y2; \
	383	y2 = x1 \| y0; /* t04 */ \
	384	y2 &= y3; \
	385	x1 ^= y1; \
	386	y2 \|= x1; \
	387	y2 ^= x0; \
	388	y0 ^= y1; \
	389	x3 = ~x3; /* t02 */ \
	390	y0 \|= x3; \
	391	y0 ^= x2; \
00a6c2d1 NM	392	} while (0)
	393
	394	/* In-place linear transformation. */
	395	#define LINEAR_TRANSFORMATION(x0,x1,x2,x3) \
	396	do { \
8a56233b NM	397	x0 = ROTL32 (13, x0); \
8a56233b NM	398	x2 = ROTL32 (3, x2); \
00a6c2d1 NM	399	x1 = x1 ^ x0 ^ x2; \
00a6c2d1 NM	400	x3 = x3 ^ x2 ^ (x0 << 3); \
8a56233b NM	401	x1 = ROTL32 (1, x1); \
8a56233b NM	402	x3 = ROTL32 (7, x3); \
00a6c2d1 NM	403	x0 = x0 ^ x1 ^ x3; \
00a6c2d1 NM	404	x2 = x2 ^ x3 ^ (x1 << 7); \
8a56233b NM	405	x0 = ROTL32 (5, x0); \
8a56233b NM	406	x2 = ROTL32 (22, x2); \
00a6c2d1 NM	407	} while (0)
	408
	409	/* Round inputs are x0,x1,x2,x3 (destroyed), and round outputs are
	410	y0,y1,y2,y3. */
	411	#define ROUND(which, subkey, x0,x1,x2,x3, y0,y1,y2,y3) \
	412	do { \
	413	KEYXOR(x0,x1,x2,x3, subkey); \
3c5e8649	414	SBOX##which(x0,x1,x2,x3, y0,y1,y2,y3); \
00a6c2d1 NM	415	LINEAR_TRANSFORMATION(y0,y1,y2,y3); \
	416	} while (0)
	417
	418	#if HAVE_NATIVE_64_BIT
	419
	420	#define LINEAR_TRANSFORMATION64(x0,x1,x2,x3) \
	421	do { \
d20990fd NM	422	x0 = DROTL32 (13, x0); \
d20990fd NM	423	x2 = DROTL32 (3, x2); \
00a6c2d1	424	x1 = x1 ^ x0 ^ x2; \
d20990fd NM	425	x3 = x3 ^ x2 ^ DRSHIFT32(3, x0); \
	426	x1 = DROTL32 (1, x1); \
	427	x3 = DROTL32 (7, x3); \
00a6c2d1	428	x0 = x0 ^ x1 ^ x3; \
d20990fd NM	429	x2 = x2 ^ x3 ^ DRSHIFT32(7, x1); \
	430	x0 = DROTL32 (5, x0); \
	431	x2 = DROTL32 (22, x2); \
00a6c2d1 NM	432	} while (0)
	433
	434	#define ROUND64(which, subkey, x0,x1,x2,x3, y0,y1,y2,y3) \
	435	do { \
	436	KEYXOR64(x0,x1,x2,x3, subkey); \
3c5e8649	437	SBOX##which(x0,x1,x2,x3, y0,y1,y2,y3); \
00a6c2d1 NM	438	LINEAR_TRANSFORMATION64(y0,y1,y2,y3); \
	439	} while (0)
	440
	441	#endif /* HAVE_NATIVE_64_BIT */
	442
	443	void
	444	serpent_encrypt (const struct serpent_ctx *ctx,
3eb603d0	445	size_t length, uint8_t * dst, const uint8_t * src)
00a6c2d1 NM	446	{
	447	assert( !(length % SERPENT_BLOCK_SIZE));
	448
	449	#if HAVE_NATIVE_64_BIT
	450	if (length & SERPENT_BLOCK_SIZE)
	451	#else
	452	while (length >= SERPENT_BLOCK_SIZE)
	453	#endif
	454	{
	455	uint32_t x0,x1,x2,x3, y0,y1,y2,y3;
	456	unsigned k;
	457
	458	x0 = LE_READ_UINT32 (src);
	459	x1 = LE_READ_UINT32 (src + 4);
	460	x2 = LE_READ_UINT32 (src + 8);
	461	x3 = LE_READ_UINT32 (src + 12);
	462
	463	for (k = 0; ; k += 8)
	464	{
	465	ROUND (0, ctx->keys[k+0], x0,x1,x2,x3, y0,y1,y2,y3);
	466	ROUND (1, ctx->keys[k+1], y0,y1,y2,y3, x0,x1,x2,x3);
	467	ROUND (2, ctx->keys[k+2], x0,x1,x2,x3, y0,y1,y2,y3);
	468	ROUND (3, ctx->keys[k+3], y0,y1,y2,y3, x0,x1,x2,x3);
	469	ROUND (4, ctx->keys[k+4], x0,x1,x2,x3, y0,y1,y2,y3);
	470	ROUND (5, ctx->keys[k+5], y0,y1,y2,y3, x0,x1,x2,x3);
	471	ROUND (6, ctx->keys[k+6], x0,x1,x2,x3, y0,y1,y2,y3);
	472	if (k == 24)
	473	break;
	474	ROUND (7, ctx->keys[k+7], y0,y1,y2,y3, x0,x1,x2,x3);
	475	}
	476
	477	/* Special final round, using two subkeys. */
	478	KEYXOR (y0,y1,y2,y3, ctx->keys[31]);
3c5e8649	479	SBOX7 (y0,y1,y2,y3, x0,x1,x2,x3);
00a6c2d1 NM	480	KEYXOR (x0,x1,x2,x3, ctx->keys[32]);
	481
	482	LE_WRITE_UINT32 (dst, x0);
	483	LE_WRITE_UINT32 (dst + 4, x1);
	484	LE_WRITE_UINT32 (dst + 8, x2);
	485	LE_WRITE_UINT32 (dst + 12, x3);
	486
	487	src += SERPENT_BLOCK_SIZE;
	488	dst += SERPENT_BLOCK_SIZE;
	489	length -= SERPENT_BLOCK_SIZE;
	490	}
	491	#if HAVE_NATIVE_64_BIT
	492	FOR_BLOCKS(length, dst, src, 2*SERPENT_BLOCK_SIZE)
	493	{
	494	uint64_t x0,x1,x2,x3, y0,y1,y2,y3;
	495	unsigned k;
	496
	497	x0 = LE_READ_UINT32 (src);
	498	x1 = LE_READ_UINT32 (src + 4);
	499	x2 = LE_READ_UINT32 (src + 8);
	500	x3 = LE_READ_UINT32 (src + 12);
	501
	502	x0 <<= 32; x0 \|= LE_READ_UINT32 (src + 16);
	503	x1 <<= 32; x1 \|= LE_READ_UINT32 (src + 20);
	504	x2 <<= 32; x2 \|= LE_READ_UINT32 (src + 24);
	505	x3 <<= 32; x3 \|= LE_READ_UINT32 (src + 28);
	506
	507	for (k = 0; ; k += 8)
	508	{
	509	ROUND64 (0, ctx->keys[k+0], x0,x1,x2,x3, y0,y1,y2,y3);
	510	ROUND64 (1, ctx->keys[k+1], y0,y1,y2,y3, x0,x1,x2,x3);
	511	ROUND64 (2, ctx->keys[k+2], x0,x1,x2,x3, y0,y1,y2,y3);
	512	ROUND64 (3, ctx->keys[k+3], y0,y1,y2,y3, x0,x1,x2,x3);
	513	ROUND64 (4, ctx->keys[k+4], x0,x1,x2,x3, y0,y1,y2,y3);
	514	ROUND64 (5, ctx->keys[k+5], y0,y1,y2,y3, x0,x1,x2,x3);
	515	ROUND64 (6, ctx->keys[k+6], x0,x1,x2,x3, y0,y1,y2,y3);
	516	if (k == 24)
	517	break;
	518	ROUND64 (7, ctx->keys[k+7], y0,y1,y2,y3, x0,x1,x2,x3);
	519	}
	520
	521	/* Special final round, using two subkeys. */
	522	KEYXOR64 (y0,y1,y2,y3, ctx->keys[31]);
3c5e8649	523	SBOX7 (y0,y1,y2,y3, x0,x1,x2,x3);
00a6c2d1 NM	524	KEYXOR64 (x0,x1,x2,x3, ctx->keys[32]);
	525
	526	LE_WRITE_UINT32 (dst + 16, x0);
	527	LE_WRITE_UINT32 (dst + 20, x1);
	528	LE_WRITE_UINT32 (dst + 24, x2);
	529	LE_WRITE_UINT32 (dst + 28, x3);
	530	x0 >>= 32; LE_WRITE_UINT32 (dst, x0);
	531	x1 >>= 32; LE_WRITE_UINT32 (dst + 4, x1);
	532	x2 >>= 32; LE_WRITE_UINT32 (dst + 8, x2);
	533	x3 >>= 32; LE_WRITE_UINT32 (dst + 12, x3);
	534	}
	535	#endif /* HAVE_NATIVE_64_BIT */
	536	}