/*
Algorithm Name: Keccak
Authors: Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche
Date: January 9, 2009

This code, originally by Guido Bertoni, Joan Daemen, Michaël Peeters and
Gilles Van Assche as a part of the SHA-3 submission, is hereby put in the
public domain. It is given as is, without any guarantee.

For more information, feedback or questions, please refer to our website:
http://keccak.noekeon.org/
*/

/* Implementation selector: force the 64-bit settings for this build. */
#ifdef OPTIMIZED
#undef OPTIMIZED
#endif
#define OPTIMIZED 64

#if OPTIMIZED == 64
/* ===== "KeccakOpt64-settings.h" */
#define Unrolling 18
/* Choose exactly one lane back end: SSE2 first, then MMX, otherwise
   fall back to UseBebigokimisa. */
#ifdef __SSE2__
#define UseSSE
#else
#ifdef __MMX__
#define UseMMX
#else
#define UseBebigokimisa
#endif
#endif
/* ===== */
#endif

#if OPTIMIZED == 32
/* ===== "KeccakOpt32-settings.h" */
#define Unrolling 2
#define UseBebigokimisa
/* #define UseInterleaveTables */
/* ===== */
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <rpmiotypes.h>

#include "keccak.h"

/* Byte-order tags, and the order assumed for this build: big-endian only
   when the build system defines WORDS_BIGENDIAN. */
#define IS_LITTLE_ENDIAN	1234
#define IS_BIG_ENDIAN		4321
#ifndef WORDS_BIGENDIAN
#define PLATFORM_BYTE_ORDER	IS_LITTLE_ENDIAN
#else
#define PLATFORM_BYTE_ORDER	IS_BIG_ENDIAN
#endif

/* Status codes (presumably the return values of the hashing API -- the
   users are outside this part of the file). */
enum {
    SUCCESS     = 0,
    FAIL        = 1,
    BAD_HASHLEN = 2
};

/* Unsigned integer aliases named after their intended width; the widths
   rely on the usual 8/16/32/64-bit sizes of the underlying C types. */
typedef unsigned char       UINT8;
typedef unsigned short int  UINT16;
typedef unsigned            UINT32;
typedef unsigned long long  UINT64;

/* ===== "KeccakPermutationInterface.h" */
/*
 * Entry points of the permutation layer.  Their definitions are not in this
 * part of the file, so the notes below state only what the signatures show.
 * NOTE(review): required size and alignment of `state` and `data` are not
 * visible here -- confirm against the definitions before relying on them.
 */
/* Takes no arguments; presumably one-time global setup -- confirm. */
void KeccakInitialize(void);
/* Receives a writable state buffer; presumably resets it -- confirm. */
void KeccakInitializeState(unsigned char *state);
/* Receives a writable state buffer that it may update in place. */
void KeccakPermutation(unsigned char *state);
/* `data` is read-only, `state` is writable; the name suggests 1024 bits
   (128 bytes) of `data` are consumed -- confirm. */
void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data);
/* Same shape as above; the name suggests 512 bits (64 bytes) of `data`. */
void KeccakAbsorb512bits(unsigned char *state, const unsigned char *data);
/* `state` is read-only, `data` is the writable output; the name suggests
   1024 bits (128 bytes) are produced -- confirm. */
void KeccakExtract1024bits(const unsigned char *state, unsigned char *data);
/* Same shape as above; the name suggests 512 bits (64 bytes) of output. */
void KeccakExtract512bits(const unsigned char *state, unsigned char *data);
/* ===== */

#if OPTIMIZED == 64
/* ===== "KeccakPermutationOptimized64.c" */

#if defined(UseSSE)
    #include <emmintrin.h>

    /* Both lane types are XMM registers: a V64 is loaded/stored through the
       low 64 bits only, a V128 packs two lanes.  V6464 overlays a V128 with
       two scalars so each half can be addressed individually. */
    typedef __m128i V64, V128;
    typedef union {
        V128 v128;
        UINT64 v64[2];
    } V6464;

    /* Single-lane helpers (memory access touches 64 bits). */
    #define LOAD64(a)           _mm_loadl_epi64((const V64 *)&(a))
    #define CONST64(a)          LOAD64(a)
    #define STORE64(a, b)       _mm_storel_epi64((V64 *)&(a), b)
    #define XOR64(a, b)         _mm_xor_si128(a, b)
    #define XOReq64(a, b)       a = XOR64(a, b)
    #define ANDnu64(a, b)       _mm_andnot_si128(a, b)
    #define ROL64(a, o)         _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))

    /* Two-lane helpers; the register arithmetic is the same as for one
       lane, only the memory access width differs.  LOAD128/STORE128 need a
       16-byte aligned address, LOAD128u does not. */
    #define LOAD128(a)          _mm_load_si128((const V128 *)&(a))
    #define LOAD128u(a)         _mm_loadu_si128((const V128 *)&(a))
    #define LOAD6464(a, b)      _mm_set_epi64((__m64)(a), (__m64)(b))
    #define STORE128(a, b)      _mm_store_si128((V128 *)&(a), b)
    #define XOR128(a, b)        XOR64(a, b)
    #define XOReq128(a, b)      a = XOR128(a, b)
    #define ANDnu128(a, b)      ANDnu64(a, b)
    #define ROL64in128(a, o)    ROL64(a, o)

    /* Lane shuffles: combine the low (or high) halves of two registers,
       broadcast one half of a register into both halves, or get zero. */
    #define GET64LO(a, b)       _mm_unpacklo_epi64(a, b)
    #define GET64HI(a, b)       _mm_unpackhi_epi64(a, b)
    #define COPY64HI2LO(a)      _mm_shuffle_epi32(a, 0xEE)
    #define COPY64LO2HI(a)      _mm_shuffle_epi32(a, 0x44)
    #define ZERO128()           _mm_setzero_si128()

    #ifdef UseOnlySIMD64
/* ===== "KeccakF-1600-18-simd64.macros" */
/* Code automatically generated by KeccakTools! */
/*
 * Declares every local the round and copy macros below refer to by pasted
 * name: A* and E* are the two 25-lane state sets the rounds alternate
 * between, B* hold the rotated lanes, C* the per-vowel XOR sums and D* the
 * values XORed into each lane at the start of a round.
 * Expand once at the top of the function that uses those macros.
 */
#define declareABCDE \
    V64 Aba, Abe, Abi, Abo, Abu; \
    V64 Aga, Age, Agi, Ago, Agu; \
    V64 Aka, Ake, Aki, Ako, Aku; \
    V64 Ama, Ame, Ami, Amo, Amu; \
    V64 Asa, Ase, Asi, Aso, Asu; \
    V64 Bba, Bbe, Bbi, Bbo, Bbu; \
    V64 Bga, Bge, Bgi, Bgo, Bgu; \
    V64 Bka, Bke, Bki, Bko, Bku; \
    V64 Bma, Bme, Bmi, Bmo, Bmu; \
    V64 Bsa, Bse, Bsi, Bso, Bsu; \
    V64 Ca, Ce, Ci, Co, Cu; \
    V64 Da, De, Di, Do, Du; \
    V64 Eba, Ebe, Ebi, Ebo, Ebu; \
    V64 Ega, Ege, Egi, Ego, Egu; \
    V64 Eka, Eke, Eki, Eko, Eku; \
    V64 Ema, Eme, Emi, Emo, Emu; \
    V64 Esa, Ese, Esi, Eso, Esu; \

/* Seeds Ca..Cu for the first round: each one becomes the XOR of the five
   A lanes whose name ends in the same vowel. */
#define prepareTheta \
    Ca = XOR64(Ama, Asa); Ca = XOR64(Aka, Ca); Ca = XOR64(Aga, Ca); Ca = XOR64(Aba, Ca); \
    Ce = XOR64(Ame, Ase); Ce = XOR64(Ake, Ce); Ce = XOR64(Age, Ce); Ce = XOR64(Abe, Ce); \
    Ci = XOR64(Ami, Asi); Ci = XOR64(Aki, Ci); Ci = XOR64(Agi, Ci); Ci = XOR64(Abi, Ci); \
    Co = XOR64(Amo, Aso); Co = XOR64(Ako, Co); Co = XOR64(Ago, Co); Co = XOR64(Abo, Co); \
    Cu = XOR64(Amu, Asu); Cu = XOR64(Aku, Cu); Cu = XOR64(Agu, Cu); Cu = XOR64(Abu, Cu);
/*
 * One full round (theta, rho, pi, chi, iota) that also prepares the next
 * theta.  64-bit lanes mapped to 64-bit words.
 *   i - index into KeccakF1600RoundConstants; that constant is XORed into
 *       E##ba
 *   A - prefix of the input lanes.  Every A lane is XORed in place with its
 *       D value, so A no longer holds the input state afterwards.
 *   E - prefix of the output lanes
 * Reads Ca..Cu on entry and leaves them holding the per-vowel XOR of the
 * new E lanes, which is what the following round expects.
 */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    Da = XOR64(Cu, ROL64(Ce, 1)); \
    De = XOR64(Ca, ROL64(Ci, 1)); \
    Di = XOR64(Ce, ROL64(Co, 1)); \
    Do = XOR64(Ci, ROL64(Cu, 1)); \
    Du = XOR64(Co, ROL64(Ca, 1)); \
\
    XOReq64(A##ba, Da); \
    Bba = A##ba; \
    XOReq64(A##ge, De); \
    Bbe = ROL64(A##ge, 44); \
    XOReq64(A##ki, Di); \
    Bbi = ROL64(A##ki, 43); \
    E##ba = XOR64(Bba, ANDnu64(Bbe, Bbi)); \
    XOReq64(E##ba, CONST64(KeccakF1600RoundConstants[i])); \
    Ca = E##ba; \
    XOReq64(A##mo, Do); \
    Bbo = ROL64(A##mo, 21); \
    E##be = XOR64(Bbe, ANDnu64(Bbi, Bbo)); \
    Ce = E##be; \
    XOReq64(A##su, Du); \
    Bbu = ROL64(A##su, 14); \
    E##bi = XOR64(Bbi, ANDnu64(Bbo, Bbu)); \
    Ci = E##bi; \
    E##bo = XOR64(Bbo, ANDnu64(Bbu, Bba)); \
    Co = E##bo; \
    E##bu = XOR64(Bbu, ANDnu64(Bba, Bbe)); \
    Cu = E##bu; \
\
    XOReq64(A##bo, Do); \
    Bga = ROL64(A##bo, 28); \
    XOReq64(A##gu, Du); \
    Bge = ROL64(A##gu, 20); \
    XOReq64(A##ka, Da); \
    Bgi = ROL64(A##ka, 3); \
    E##ga = XOR64(Bga, ANDnu64(Bge, Bgi)); \
    XOReq64(Ca, E##ga); \
    XOReq64(A##me, De); \
    Bgo = ROL64(A##me, 45); \
    E##ge = XOR64(Bge, ANDnu64(Bgi, Bgo)); \
    XOReq64(Ce, E##ge); \
    XOReq64(A##si, Di); \
    Bgu = ROL64(A##si, 61); \
    E##gi = XOR64(Bgi, ANDnu64(Bgo, Bgu)); \
    XOReq64(Ci, E##gi); \
    E##go = XOR64(Bgo, ANDnu64(Bgu, Bga)); \
    XOReq64(Co, E##go); \
    E##gu = XOR64(Bgu, ANDnu64(Bga, Bge)); \
    XOReq64(Cu, E##gu); \
\
    XOReq64(A##be, De); \
    Bka = ROL64(A##be, 1); \
    XOReq64(A##gi, Di); \
    Bke = ROL64(A##gi, 6); \
    XOReq64(A##ko, Do); \
    Bki = ROL64(A##ko, 25); \
    E##ka = XOR64(Bka, ANDnu64(Bke, Bki)); \
    XOReq64(Ca, E##ka); \
    XOReq64(A##mu, Du); \
    Bko = ROL64(A##mu, 8); \
    E##ke = XOR64(Bke, ANDnu64(Bki, Bko)); \
    XOReq64(Ce, E##ke); \
    XOReq64(A##sa, Da); \
    Bku = ROL64(A##sa, 18); \
    E##ki = XOR64(Bki, ANDnu64(Bko, Bku)); \
    XOReq64(Ci, E##ki); \
    E##ko = XOR64(Bko, ANDnu64(Bku, Bka)); \
    XOReq64(Co, E##ko); \
    E##ku = XOR64(Bku, ANDnu64(Bka, Bke)); \
    XOReq64(Cu, E##ku); \
\
    XOReq64(A##bu, Du); \
    Bma = ROL64(A##bu, 27); \
    XOReq64(A##ga, Da); \
    Bme = ROL64(A##ga, 36); \
    XOReq64(A##ke, De); \
    Bmi = ROL64(A##ke, 10); \
    E##ma = XOR64(Bma, ANDnu64(Bme, Bmi)); \
    XOReq64(Ca, E##ma); \
    XOReq64(A##mi, Di); \
    Bmo = ROL64(A##mi, 15); \
    E##me = XOR64(Bme, ANDnu64(Bmi, Bmo)); \
    XOReq64(Ce, E##me); \
    XOReq64(A##so, Do); \
    Bmu = ROL64(A##so, 56); \
    E##mi = XOR64(Bmi, ANDnu64(Bmo, Bmu)); \
    XOReq64(Ci, E##mi); \
    E##mo = XOR64(Bmo, ANDnu64(Bmu, Bma)); \
    XOReq64(Co, E##mo); \
    E##mu = XOR64(Bmu, ANDnu64(Bma, Bme)); \
    XOReq64(Cu, E##mu); \
\
    XOReq64(A##bi, Di); \
    Bsa = ROL64(A##bi, 62); \
    XOReq64(A##go, Do); \
    Bse = ROL64(A##go, 55); \
    XOReq64(A##ku, Du); \
    Bsi = ROL64(A##ku, 39); \
    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
    XOReq64(Ca, E##sa); \
    XOReq64(A##ma, Da); \
    Bso = ROL64(A##ma, 41); \
    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
    XOReq64(Ce, E##se); \
    XOReq64(A##se, De); \
    Bsu = ROL64(A##se, 2); \
    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
    XOReq64(Ci, E##si); \
    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
    XOReq64(Co, E##so); \
    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
    XOReq64(Cu, E##su); \
\

/*
 * One full round (theta, rho, pi, chi, iota) without preparing a further
 * theta.  64-bit lanes mapped to 64-bit words.
 *   i - index into KeccakF1600RoundConstants; that constant is XORed into
 *       E##ba
 *   A - prefix of the input lanes.  Every A lane is XORed in place with its
 *       D value, so A no longer holds the input state afterwards.
 *   E - prefix of the output lanes
 * Reads Ca..Cu on entry but does not refresh them, so they are stale
 * afterwards; use this form only when no round follows.
 */
#define thetaRhoPiChiIota(i, A, E) \
    Da = XOR64(Cu, ROL64(Ce, 1)); \
    De = XOR64(Ca, ROL64(Ci, 1)); \
    Di = XOR64(Ce, ROL64(Co, 1)); \
    Do = XOR64(Ci, ROL64(Cu, 1)); \
    Du = XOR64(Co, ROL64(Ca, 1)); \
\
    XOReq64(A##ba, Da); \
    Bba = A##ba; \
    XOReq64(A##ge, De); \
    Bbe = ROL64(A##ge, 44); \
    XOReq64(A##ki, Di); \
    Bbi = ROL64(A##ki, 43); \
    E##ba = XOR64(Bba, ANDnu64(Bbe, Bbi)); \
    XOReq64(E##ba, CONST64(KeccakF1600RoundConstants[i])); \
    XOReq64(A##mo, Do); \
    Bbo = ROL64(A##mo, 21); \
    E##be = XOR64(Bbe, ANDnu64(Bbi, Bbo)); \
    XOReq64(A##su, Du); \
    Bbu = ROL64(A##su, 14); \
    E##bi = XOR64(Bbi, ANDnu64(Bbo, Bbu)); \
    E##bo = XOR64(Bbo, ANDnu64(Bbu, Bba)); \
    E##bu = XOR64(Bbu, ANDnu64(Bba, Bbe)); \
\
    XOReq64(A##bo, Do); \
    Bga = ROL64(A##bo, 28); \
    XOReq64(A##gu, Du); \
    Bge = ROL64(A##gu, 20); \
    XOReq64(A##ka, Da); \
    Bgi = ROL64(A##ka, 3); \
    E##ga = XOR64(Bga, ANDnu64(Bge, Bgi)); \
    XOReq64(A##me, De); \
    Bgo = ROL64(A##me, 45); \
    E##ge = XOR64(Bge, ANDnu64(Bgi, Bgo)); \
    XOReq64(A##si, Di); \
    Bgu = ROL64(A##si, 61); \
    E##gi = XOR64(Bgi, ANDnu64(Bgo, Bgu)); \
    E##go = XOR64(Bgo, ANDnu64(Bgu, Bga)); \
    E##gu = XOR64(Bgu, ANDnu64(Bga, Bge)); \
\
    XOReq64(A##be, De); \
    Bka = ROL64(A##be, 1); \
    XOReq64(A##gi, Di); \
    Bke = ROL64(A##gi, 6); \
    XOReq64(A##ko, Do); \
    Bki = ROL64(A##ko, 25); \
    E##ka = XOR64(Bka, ANDnu64(Bke, Bki)); \
    XOReq64(A##mu, Du); \
    Bko = ROL64(A##mu, 8); \
    E##ke = XOR64(Bke, ANDnu64(Bki, Bko)); \
    XOReq64(A##sa, Da); \
    Bku = ROL64(A##sa, 18); \
    E##ki = XOR64(Bki, ANDnu64(Bko, Bku)); \
    E##ko = XOR64(Bko, ANDnu64(Bku, Bka)); \
    E##ku = XOR64(Bku, ANDnu64(Bka, Bke)); \
\
    XOReq64(A##bu, Du); \
    Bma = ROL64(A##bu, 27); \
    XOReq64(A##ga, Da); \
    Bme = ROL64(A##ga, 36); \
    XOReq64(A##ke, De); \
    Bmi = ROL64(A##ke, 10); \
    E##ma = XOR64(Bma, ANDnu64(Bme, Bmi)); \
    XOReq64(A##mi, Di); \
    Bmo = ROL64(A##mi, 15); \
    E##me = XOR64(Bme, ANDnu64(Bmi, Bmo)); \
    XOReq64(A##so, Do); \
    Bmu = ROL64(A##so, 56); \
    E##mi = XOR64(Bmi, ANDnu64(Bmo, Bmu)); \
    E##mo = XOR64(Bmo, ANDnu64(Bmu, Bma)); \
    E##mu = XOR64(Bmu, ANDnu64(Bma, Bme)); \
\
    XOReq64(A##bi, Di); \
    Bsa = ROL64(A##bi, 62); \
    XOReq64(A##go, Do); \
    Bse = ROL64(A##go, 55); \
    XOReq64(A##ku, Du); \
    Bsi = ROL64(A##ku, 39); \
    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
    XOReq64(A##ma, Da); \
    Bso = ROL64(A##ma, 41); \
    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
    XOReq64(A##se, De); \
    Bsu = ROL64(A##se, 2); \
    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
\

/* Per-round constants: the round macros XOR entry i into lane "ba" of
   round i.  One entry for each of the 18 rounds. */
const UINT64 KeccakF1600RoundConstants[18] = {
    /*  0 */ 0x0000000000000001ULL, /*  1 */ 0x0000000000008082ULL,
    /*  2 */ 0x800000000000808AULL, /*  3 */ 0x8000000080008000ULL,
    /*  4 */ 0x000000000000808BULL, /*  5 */ 0x0000000080000001ULL,
    /*  6 */ 0x8000000080008081ULL, /*  7 */ 0x8000000000008009ULL,
    /*  8 */ 0x000000000000008AULL, /*  9 */ 0x0000000000000088ULL,
    /* 10 */ 0x0000000080008009ULL, /* 11 */ 0x000000008000000AULL,
    /* 12 */ 0x000000008000808BULL, /* 13 */ 0x800000000000008BULL,
    /* 14 */ 0x8000000000008089ULL, /* 15 */ 0x8000000000008003ULL,
    /* 16 */ 0x8000000000008002ULL, /* 17 */ 0x8000000000000080ULL
};

/*
 * Loads all 25 lanes of `state` into the X-prefixed variables, XORing the
 * first 16 lanes (1024 bits) with the matching lanes of `input` on the way;
 * lanes 16..24 are loaded unchanged.  Neither array is written.
 * Both `state` and `input` are indexed per 64-bit lane.
 */
#define copyFromStateAndXor1024bits(X, state, input) \
    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
    X##ka = XOR64(LOAD64(state[10]), LOAD64(input[10])); \
    X##ke = XOR64(LOAD64(state[11]), LOAD64(input[11])); \
    X##ki = XOR64(LOAD64(state[12]), LOAD64(input[12])); \
    X##ko = XOR64(LOAD64(state[13]), LOAD64(input[13])); \
    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
    X##ma = XOR64(LOAD64(state[15]), LOAD64(input[15])); \
    X##me = LOAD64(state[16]); \
    X##mi = LOAD64(state[17]); \
    X##mo = LOAD64(state[18]); \
    X##mu = LOAD64(state[19]); \
    X##sa = LOAD64(state[20]); \
    X##se = LOAD64(state[21]); \
    X##si = LOAD64(state[22]); \
    X##so = LOAD64(state[23]); \
    X##su = LOAD64(state[24]); \

/*
 * Loads all 25 lanes of `state` into the X-prefixed variables, XORing the
 * first 8 lanes (512 bits) with the matching lanes of `input` on the way;
 * lanes 8..24 are loaded unchanged.  Neither array is written.
 * Both `state` and `input` are indexed per 64-bit lane.
 */
#define copyFromStateAndXor512bits(X, state, input) \
    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
    X##go = LOAD64(state[ 8]); \
    X##gu = LOAD64(state[ 9]); \
    X##ka = LOAD64(state[10]); \
    X##ke = LOAD64(state[11]); \
    X##ki = LOAD64(state[12]); \
    X##ko = LOAD64(state[13]); \
    X##ku = LOAD64(state[14]); \
    X##ma = LOAD64(state[15]); \
    X##me = LOAD64(state[16]); \
    X##mi = LOAD64(state[17]); \
    X##mo = LOAD64(state[18]); \
    X##mu = LOAD64(state[19]); \
    X##sa = LOAD64(state[20]); \
    X##se = LOAD64(state[21]); \
    X##si = LOAD64(state[22]); \
    X##so = LOAD64(state[23]); \
    X##su = LOAD64(state[24]); \

/*
 * Loads all 25 lanes of `state` (indexed per 64-bit lane) into the
 * X-prefixed variables without modifying them; `state` is not written.
 */
#define copyFromState(X, state) \
    X##ba = LOAD64(state[ 0]); \
    X##be = LOAD64(state[ 1]); \
    X##bi = LOAD64(state[ 2]); \
    X##bo = LOAD64(state[ 3]); \
    X##bu = LOAD64(state[ 4]); \
    X##ga = LOAD64(state[ 5]); \
    X##ge = LOAD64(state[ 6]); \
    X##gi = LOAD64(state[ 7]); \
    X##go = LOAD64(state[ 8]); \
    X##gu = LOAD64(state[ 9]); \
    X##ka = LOAD64(state[10]); \
    X##ke = LOAD64(state[11]); \
    X##ki = LOAD64(state[12]); \
    X##ko = LOAD64(state[13]); \
    X##ku = LOAD64(state[14]); \
    X##ma = LOAD64(state[15]); \
    X##me = LOAD64(state[16]); \
    X##mi = LOAD64(state[17]); \
    X##mo = LOAD64(state[18]); \
    X##mu = LOAD64(state[19]); \
    X##sa = LOAD64(state[20]); \
    X##se = LOAD64(state[21]); \
    X##si = LOAD64(state[22]); \
    X##so = LOAD64(state[23]); \
    X##su = LOAD64(state[24]); \

/*
 * Stores the 25 X-prefixed lane variables back into `state` (indexed per
 * 64-bit lane), in the same lane order the copyFromState* macros read them.
 */
#define copyToState(state, X) \
    STORE64(state[ 0], X##ba); \
    STORE64(state[ 1], X##be); \
    STORE64(state[ 2], X##bi); \
    STORE64(state[ 3], X##bo); \
    STORE64(state[ 4], X##bu); \
    STORE64(state[ 5], X##ga); \
    STORE64(state[ 6], X##ge); \
    STORE64(state[ 7], X##gi); \
    STORE64(state[ 8], X##go); \
    STORE64(state[ 9], X##gu); \
    STORE64(state[10], X##ka); \
    STORE64(state[11], X##ke); \
    STORE64(state[12], X##ki); \
    STORE64(state[13], X##ko); \
    STORE64(state[14], X##ku); \
    STORE64(state[15], X##ma); \
    STORE64(state[16], X##me); \
    STORE64(state[17], X##mi); \
    STORE64(state[18], X##mo); \
    STORE64(state[19], X##mu); \
    STORE64(state[20], X##sa); \
    STORE64(state[21], X##se); \
    STORE64(state[22], X##si); \
    STORE64(state[23], X##so); \
    STORE64(state[24], X##su); \

/*
 * Copies the 25 lane variables with prefix Y into the ones with prefix X
 * (register-to-register; no memory state is involved).
 */
#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su; \

/* ===== */
    #else
/* ===== "KeccakF-1600-18-simd128.macros" */
/*
 * Declares every local the 128-bit round and copy macros refer to by pasted
 * name.  Besides the per-lane V64 variables there are:
 *   - V6464 pairs A/E bage, begi, bigo, bogu, buga, kame, kemi, kimo, komu,
 *     kuma: two lanes held in one register, named after the lanes they pack;
 *   - V6464 Abae..Asio: load temporaries used by the copyFromState* macros.
 *     They exist for prefix A only, so those macros can only be expanded
 *     with X = A;
 *   - V128 B pairs, the paired sums Cae/Cei/Cio/Cou/Cua, the paired D values
 *     Dei/Dou, and a Zero register.
 * Expand once at the top of the function that uses those macros.
 */
#define declareABCDE \
    V6464 Abage, Abegi, Abigo, Abogu, Abuga; \
    V6464 Akame, Akemi, Akimo, Akomu, Akuma; \
    V6464 Abae, Abio, Agae, Agio, Akae, Akio, Amae, Amio, Asae, Asio; \
    V64 Aba, Abe, Abi, Abo, Abu; \
    V64 Aga, Age, Agi, Ago, Agu; \
    V64 Aka, Ake, Aki, Ako, Aku; \
    V64 Ama, Ame, Ami, Amo, Amu; \
    V64 Asa, Ase, Asi, Aso, Asu; \
    V128 Bbage, Bbegi, Bbigo, Bbogu, Bbuga; \
    V128 Bkame, Bkemi, Bkimo, Bkomu, Bkuma; \
    V64 Bba, Bbe, Bbi, Bbo, Bbu; \
    V64 Bga, Bge, Bgi, Bgo, Bgu; \
    V64 Bka, Bke, Bki, Bko, Bku; \
    V64 Bma, Bme, Bmi, Bmo, Bmu; \
    V64 Bsa, Bse, Bsi, Bso, Bsu; \
    V128 Cae, Cei, Cio, Cou, Cua, Dei, Dou; \
    V64 Ca, Ce, Ci, Co, Cu; \
    V64 Da, De, Di, Do, Du; \
    V6464 Ebage, Ebegi, Ebigo, Ebogu, Ebuga; \
    V6464 Ekame, Ekemi, Ekimo, Ekomu, Ekuma; \
    V64 Eba, Ebe, Ebi, Ebo, Ebu; \
    V64 Ega, Ege, Egi, Ego, Egu; \
    V64 Eka, Eke, Eki, Eko, Eku; \
    V64 Ema, Eme, Emi, Emo, Emu; \
    V64 Esa, Ese, Esi, Eso, Esu; \
    V128 Zero;

/* Intentionally empty in the 128-bit variant: the copyFromState* macros
   already accumulate the sums Cae, Cio and Cu while they load the lanes. */
#define prepareTheta

/*
 * Derives the five D values from the paired sums Cae (a in the low half,
 * e in the high half), Cio (i low, o high) and Cu.
 * Cua must be built first because Dou is computed from it.  De and Do are
 * the low halves of Dei and Dou; Di and Du are their high halves moved down.
 * Overwrites Cua, Dei, Dou and Da..Du.
 */
#define computeD \
    Cua = GET64LO(Cu, Cae); \
    Dei = XOR128(Cae, ROL64in128(Cio, 1)); \
    Dou = XOR128(Cio, ROL64in128(Cua, 1)); \
    Da = XOR64(Cu, ROL64in128(COPY64HI2LO(Cae), 1)); \
    De = Dei; \
    Di = COPY64HI2LO(Dei); \
    Do = Dou; \
    Du = COPY64HI2LO(Dou);

/*
 * One full round (theta, rho, pi, chi, iota) that also prepares the next
 * theta.  64-bit lanes mapped to 64-bit and 128-bit words.
 *   i - index into KeccakF1600RoundConstants; the constant is loaded into
 *       the low half of a register and XORed into E##bage, i.e. into lane ba
 *   A - prefix of the input lanes.  Lanes ba, ge, be, gi, ka, me, ke, mi are
 *       read from the pairs A##bage, A##begi, A##kame, A##kemi; all other
 *       lanes from their scalar variables.  The scalar A lanes are XORed in
 *       place with their D value, so they no longer hold the input state.
 *   E - prefix of the output lanes.  Lanes ba, ge, be, gi, ka, me, ke, mi are
 *       produced only inside the pairs E##bage, E##begi, E##kame, E##kemi;
 *       the remaining lanes are also written to their scalar variables.
 * Starts with computeD (which reads Cae, Cio, Cu) and ends with Cae, Cio and
 * Cu holding the sums of the new E lanes for the following round.
 */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    computeD \
    \
    A##ba = LOAD64(A##bage.v64[0]); \
    XOReq64(A##ba, Da); \
    Bba = A##ba; \
    XOReq64(A##gu, Du); \
    Bge = ROL64(A##gu, 20); \
    Bbage = GET64LO(Bba, Bge); \
    A##ge = LOAD64(A##bage.v64[1]); \
    XOReq64(A##ge, De); \
    Bbe = ROL64(A##ge, 44); \
    A##ka = LOAD64(A##kame.v64[0]); \
    XOReq64(A##ka, Da); \
    Bgi = ROL64(A##ka, 3); \
    Bbegi = GET64LO(Bbe, Bgi); \
    XOReq64(A##ki, Di); \
    Bbi = ROL64(A##ki, 43); \
    A##me = LOAD64(A##kame.v64[1]); \
    XOReq64(A##me, De); \
    Bgo = ROL64(A##me, 45); \
    Bbigo = GET64LO(Bbi, Bgo); \
    E##bage.v128 = XOR128(Bbage, ANDnu128(Bbegi, Bbigo)); \
    XOReq128(E##bage.v128, CONST64(KeccakF1600RoundConstants[i])); \
    Cae = E##bage.v128; \
    XOReq64(A##mo, Do); \
    Bbo = ROL64(A##mo, 21); \
    XOReq64(A##si, Di); \
    Bgu = ROL64(A##si, 61); \
    Bbogu = GET64LO(Bbo, Bgu); \
    E##begi.v128 = XOR128(Bbegi, ANDnu128(Bbigo, Bbogu)); \
    Cei = E##begi.v128; \
    XOReq64(A##su, Du); \
    Bbu = ROL64(A##su, 14); \
    XOReq64(A##bo, Do); \
    Bga = ROL64(A##bo, 28); \
    Bbuga = GET64LO(Bbu, Bga); \
    E##bigo.v128 = XOR128(Bbigo, ANDnu128(Bbogu, Bbuga)); \
    E##bi = E##bigo.v128; \
    E##go = GET64HI(E##bigo.v128, E##bigo.v128); \
    Cio = E##bigo.v128; \
    E##bogu.v128 = XOR128(Bbogu, ANDnu128(Bbuga, Bbage)); \
    E##bo = E##bogu.v128; \
    E##gu = GET64HI(E##bogu.v128, E##bogu.v128); \
    Cou = E##bogu.v128; \
    E##buga.v128 = XOR128(Bbuga, ANDnu128(Bbage, Bbegi)); \
    E##bu = E##buga.v128; \
    E##ga = GET64HI(E##buga.v128, E##buga.v128); \
    Cua = E##buga.v128; \
\
    A##be = LOAD64(A##begi.v64[0]); \
    XOReq64(A##be, De); \
    Bka = ROL64(A##be, 1); \
    XOReq64(A##ga, Da); \
    Bme = ROL64(A##ga, 36); \
    Bkame = GET64LO(Bka, Bme); \
    A##gi = LOAD64(A##begi.v64[1]); \
    XOReq64(A##gi, Di); \
    Bke = ROL64(A##gi, 6); \
    A##ke = LOAD64(A##kemi.v64[0]); \
    XOReq64(A##ke, De); \
    Bmi = ROL64(A##ke, 10); \
    Bkemi = GET64LO(Bke, Bmi); \
    XOReq64(A##ko, Do); \
    Bki = ROL64(A##ko, 25); \
    A##mi = LOAD64(A##kemi.v64[1]); \
    XOReq64(A##mi, Di); \
    Bmo = ROL64(A##mi, 15); \
    Bkimo = GET64LO(Bki, Bmo); \
    E##kame.v128 = XOR128(Bkame, ANDnu128(Bkemi, Bkimo)); \
    XOReq128(Cae, E##kame.v128); \
    XOReq64(A##mu, Du); \
    Bko = ROL64(A##mu, 8); \
    XOReq64(A##so, Do); \
    Bmu = ROL64(A##so, 56); \
    Bkomu = GET64LO(Bko, Bmu); \
    E##kemi.v128 = XOR128(Bkemi, ANDnu128(Bkimo, Bkomu)); \
    XOReq128(Cei, E##kemi.v128); \
    XOReq64(A##sa, Da); \
    Bku = ROL64(A##sa, 18); \
    XOReq64(A##bu, Du); \
    Bma = ROL64(A##bu, 27); \
    Bkuma = GET64LO(Bku, Bma); \
    E##kimo.v128 = XOR128(Bkimo, ANDnu128(Bkomu, Bkuma)); \
    E##ki = E##kimo.v128; \
    E##mo = GET64HI(E##kimo.v128, E##kimo.v128); \
    XOReq128(Cio, E##kimo.v128); \
    E##komu.v128 = XOR128(Bkomu, ANDnu128(Bkuma, Bkame)); \
    E##ko = E##komu.v128; \
    E##mu = GET64HI(E##komu.v128, E##komu.v128); \
    XOReq128(Cou, E##komu.v128); \
    E##kuma.v128 = XOR128(Bkuma, ANDnu128(Bkame, Bkemi)); \
    E##ku = E##kuma.v128; \
    E##ma = GET64HI(E##kuma.v128, E##kuma.v128); \
    XOReq128(Cua, E##kuma.v128); \
\
    XOReq64(A##bi, Di); \
    Bsa = ROL64(A##bi, 62); \
    XOReq64(A##go, Do); \
    Bse = ROL64(A##go, 55); \
    XOReq64(A##ku, Du); \
    Bsi = ROL64(A##ku, 39); \
    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
    Ca = E##sa; \
    XOReq64(A##ma, Da); \
    Bso = ROL64(A##ma, 41); \
    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
    Ce = E##se; \
    XOReq128(Cae, GET64LO(Ca, Ce)); \
    XOReq64(A##se, De); \
    Bsu = ROL64(A##se, 2); \
    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
    Ci = E##si; \
    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
    Co = E##so; \
    XOReq128(Cio, GET64LO(Ci, Co)); \
    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
    Cu = E##su; \
\
    Zero = ZERO128(); \
    XOReq128(Cae, GET64HI(Cua, Zero)); \
    XOReq128(Cae, GET64LO(Zero, Cei)); \
    XOReq128(Cio, GET64HI(Cei, Zero)); \
    XOReq128(Cio, GET64LO(Zero, Cou)); \
    XOReq128(Cua, GET64HI(Cou, Zero)); \
    XOReq64(Cu, Cua); \

/*
 * Theta Rho Pi Chi Iota; 64-bit lanes mapped to 64-bit and 128-bit words.
 * The 128-bit variant has no separate "last round" form: this simply
 * expands to the prepare-theta round, so Cae, Cio and Cu are still updated.
 */
#define thetaRhoPiChiIota(i, A, E) thetaRhoPiChiIotaPrepareTheta(i, A, E)

/* Per-round constants: the round macro XORs entry i into lane "ba" (the low
   half of the bage pair) in round i.  One entry for each of the 18 rounds. */
const UINT64 KeccakF1600RoundConstants[18] = {
    /*  0 */ 0x0000000000000001ULL, /*  1 */ 0x0000000000008082ULL,
    /*  2 */ 0x800000000000808AULL, /*  3 */ 0x8000000080008000ULL,
    /*  4 */ 0x000000000000808BULL, /*  5 */ 0x0000000080000001ULL,
    /*  6 */ 0x8000000080008081ULL, /*  7 */ 0x8000000000008009ULL,
    /*  8 */ 0x000000000000008AULL, /*  9 */ 0x0000000000000088ULL,
    /* 10 */ 0x0000000080008009ULL, /* 11 */ 0x000000008000000AULL,
    /* 12 */ 0x000000008000808BULL, /* 13 */ 0x800000000000008BULL,
    /* 14 */ 0x8000000000008089ULL, /* 15 */ 0x8000000000008003ULL,
    /* 16 */ 0x8000000000008002ULL, /* 17 */ 0x8000000000000080ULL
};

/*
 * Loads `state` into the X lane variables two lanes at a time, XORing the
 * first 16 lanes (1024 bits) with `input`; lanes 16..24 are loaded as is.
 * While loading it also
 *   - builds the pairs X##bage, X##begi, X##kame, X##kemi that the round
 *     macro reads, and
 *   - accumulates the sums Cae, Cio and Cu, which is why prepareTheta is
 *     empty in this variant.
 * `state` is read with aligned 128-bit loads at lanes 0, 2, 10, 12, 20, 22,
 * so it must be 16-byte aligned; `input` is only read with unaligned or
 * 64-bit loads.  Lane 15 of `input` is loaded with LOAD64 so that state
 * lane 16, sharing the register, is left unmodified.
 * Uses the temporaries X##bae .. X##sio, which declareABCDE provides for
 * prefix A only.
 */
#define copyFromStateAndXor1024bits(X, state, input) \
    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
    X##ba = X##bae.v128; \
    X##be = GET64HI(X##bae.v128, X##bae.v128); \
    Cae = X##bae.v128; \
    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
    X##bi = X##bio.v128; \
    X##bo = GET64HI(X##bio.v128, X##bio.v128); \
    Cio = X##bio.v128; \
    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
    Cu = X##bu; \
    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
    X##ga = X##gae.v128; \
    X##ge = GET64HI(X##gae.v128, X##gae.v128); \
    X##bage.v128 = GET64LO(X##ba, X##ge); \
    XOReq128(Cae, X##gae.v128); \
    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD128u(input[ 7])); \
    X##gi = X##gio.v128; \
    X##begi.v128 = GET64LO(X##be, X##gi); \
    X##go = GET64HI(X##gio.v128, X##gio.v128); \
    XOReq128(Cio, X##gio.v128); \
    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
    XOReq64(Cu, X##gu); \
    X##kae.v128 = XOR128(LOAD128(state[10]), LOAD128u(input[10])); \
    X##ka = X##kae.v128; \
    X##ke = GET64HI(X##kae.v128, X##kae.v128); \
    XOReq128(Cae, X##kae.v128); \
    X##kio.v128 = XOR128(LOAD128(state[12]), LOAD128u(input[12])); \
    X##ki = X##kio.v128; \
    X##ko = GET64HI(X##kio.v128, X##kio.v128); \
    XOReq128(Cio, X##kio.v128); \
    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
    XOReq64(Cu, X##ku); \
    X##mae.v128 = XOR128(LOAD128u(state[15]), LOAD64(input[15])); \
    X##ma = X##mae.v128; \
    X##me = GET64HI(X##mae.v128, X##mae.v128); \
    X##kame.v128 = GET64LO(X##ka, X##me); \
    XOReq128(Cae, X##mae.v128); \
    X##mio.v128 = LOAD128u(state[17]); \
    X##mi = X##mio.v128; \
    X##kemi.v128 = GET64LO(X##ke, X##mi); \
    X##mo = GET64HI(X##mio.v128, X##mio.v128); \
    XOReq128(Cio, X##mio.v128); \
    X##mu = LOAD64(state[19]); \
    XOReq64(Cu, X##mu); \
    X##sae.v128 = LOAD128(state[20]); \
    X##sa = X##sae.v128; \
    X##se = GET64HI(X##sae.v128, X##sae.v128); \
    XOReq128(Cae, X##sae.v128); \
    X##sio.v128 = LOAD128(state[22]); \
    X##si = X##sio.v128; \
    X##so = GET64HI(X##sio.v128, X##sio.v128); \
    XOReq128(Cio, X##sio.v128); \
    X##su = LOAD64(state[24]); \
    XOReq64(Cu, X##su); \

/*
 * Loads `state` into the X lane variables two lanes at a time, XORing the
 * first 8 lanes (512 bits) with `input`; lanes 8..24 are loaded as is.
 * While loading it also
 *   - builds the pairs X##bage, X##begi, X##kame, X##kemi that the round
 *     macro reads, and
 *   - accumulates the sums Cae, Cio and Cu, which is why prepareTheta is
 *     empty in this variant.
 * `state` is read with aligned 128-bit loads at lanes 0, 2, 10, 12, 20, 22,
 * so it must be 16-byte aligned; `input` is only read with unaligned or
 * 64-bit loads.  Lane 7 of `input` is loaded with LOAD64 so that state
 * lane 8, sharing the register, is left unmodified.
 * Uses the temporaries X##bae .. X##sio, which declareABCDE provides for
 * prefix A only.
 */
#define copyFromStateAndXor512bits(X, state, input) \
    X##bae.v128 = XOR128(LOAD128(state[ 0]), LOAD128u(input[ 0])); \
    X##ba = X##bae.v128; \
    X##be = GET64HI(X##bae.v128, X##bae.v128); \
    Cae = X##bae.v128; \
    X##bio.v128 = XOR128(LOAD128(state[ 2]), LOAD128u(input[ 2])); \
    X##bi = X##bio.v128; \
    X##bo = GET64HI(X##bio.v128, X##bio.v128); \
    Cio = X##bio.v128; \
    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
    Cu = X##bu; \
    X##gae.v128 = XOR128(LOAD128u(state[ 5]), LOAD128u(input[ 5])); \
    X##ga = X##gae.v128; \
    X##ge = GET64HI(X##gae.v128, X##gae.v128); \
    X##bage.v128 = GET64LO(X##ba, X##ge); \
    XOReq128(Cae, X##gae.v128); \
    X##gio.v128 = XOR128(LOAD128u(state[ 7]), LOAD64(input[ 7])); \
    X##gi = X##gio.v128; \
    X##begi.v128 = GET64LO(X##be, X##gi); \
    X##go = GET64HI(X##gio.v128, X##gio.v128); \
    XOReq128(Cio, X##gio.v128); \
    X##gu = LOAD64(state[ 9]); \
    XOReq64(Cu, X##gu); \
    X##kae.v128 = LOAD128(state[10]); \
    X##ka = X##kae.v128; \
    X##ke = GET64HI(X##kae.v128, X##kae.v128); \
    XOReq128(Cae, X##kae.v128); \
    X##kio.v128 = LOAD128(state[12]); \
    X##ki = X##kio.v128; \
    X##ko = GET64HI(X##kio.v128, X##kio.v128); \
    XOReq128(Cio, X##kio.v128); \
    X##ku = LOAD64(state[14]); \
    XOReq64(Cu, X##ku); \
    X##mae.v128 = LOAD128u(state[15]); \
    X##ma = X##mae.v128; \
    X##me = GET64HI(X##mae.v128, X##mae.v128); \
    X##kame.v128 = GET64LO(X##ka, X##me); \
    XOReq128(Cae, X##mae.v128); \
    X##mio.v128 = LOAD128u(state[17]); \
    X##mi = X##mio.v128; \
    X##kemi.v128 = GET64LO(X##ke, X##mi); \
    X##mo = GET64HI(X##mio.v128, X##mio.v128); \
    XOReq128(Cio, X##mio.v128); \
    X##mu = LOAD64(state[19]); \
    XOReq64(Cu, X##mu); \
    X##sae.v128 = LOAD128(state[20]); \
    X##sa = X##sae.v128; \
    X##se = GET64HI(X##sae.v128, X##sae.v128); \
    XOReq128(Cae, X##sae.v128); \
    X##sio.v128 = LOAD128(state[22]); \
    X##si = X##sio.v128; \
    X##so = GET64HI(X##sio.v128, X##sio.v128); \
    XOReq128(Cio, X##sio.v128); \
    X##su = LOAD64(state[24]); \
    XOReq64(Cu, X##su); \

/*
 * Loads `state` into the X lane variables two lanes at a time without
 * mixing in any input.  While loading it also
 *   - builds the pairs X##bage, X##begi, X##kame, X##kemi that the round
 *     macro reads, and
 *   - accumulates the sums Cae, Cio and Cu, which is why prepareTheta is
 *     empty in this variant.
 * `state` is read with aligned 128-bit loads at lanes 0, 2, 10, 12, 20, 22,
 * so it must be 16-byte aligned.
 * Uses the temporaries X##bae .. X##sio, which declareABCDE provides for
 * prefix A only.
 */
#define copyFromState(X, state) \
    X##bae.v128 = LOAD128(state[ 0]); \
    X##ba = X##bae.v128; \
    X##be = GET64HI(X##bae.v128, X##bae.v128); \
    Cae = X##bae.v128; \
    X##bio.v128 = LOAD128(state[ 2]); \
    X##bi = X##bio.v128; \
    X##bo = GET64HI(X##bio.v128, X##bio.v128); \
    Cio = X##bio.v128; \
    X##bu = LOAD64(state[ 4]); \
    Cu = X##bu; \
    X##gae.v128 = LOAD128u(state[ 5]); \
    X##ga = X##gae.v128; \
    X##ge = GET64HI(X##gae.v128, X##gae.v128); \
    X##bage.v128 = GET64LO(X##ba, X##ge); \
    XOReq128(Cae, X##gae.v128); \
    X##gio.v128 = LOAD128u(state[ 7]); \
    X##gi = X##gio.v128; \
    X##begi.v128 = GET64LO(X##be, X##gi); \
    X##go = GET64HI(X##gio.v128, X##gio.v128); \
    XOReq128(Cio, X##gio.v128); \
    X##gu = LOAD64(state[ 9]); \
    XOReq64(Cu, X##gu); \
    X##kae.v128 = LOAD128(state[10]); \
    X##ka = X##kae.v128; \
    X##ke = GET64HI(X##kae.v128, X##kae.v128); \
    XOReq128(Cae, X##kae.v128); \
    X##kio.v128 = LOAD128(state[12]); \
    X##ki = X##kio.v128; \
    X##ko = GET64HI(X##kio.v128, X##kio.v128); \
    XOReq128(Cio, X##kio.v128); \
    X##ku = LOAD64(state[14]); \
    XOReq64(Cu, X##ku); \
    X##mae.v128 = LOAD128u(state[15]); \
    X##ma = X##mae.v128; \
    X##me = GET64HI(X##mae.v128, X##mae.v128); \
    X##kame.v128 = GET64LO(X##ka, X##me); \
    XOReq128(Cae, X##mae.v128); \
    X##mio.v128 = LOAD128u(state[17]); \
    X##mi = X##mio.v128; \
    X##kemi.v128 = GET64LO(X##ke, X##mi); \
    X##mo = GET64HI(X##mio.v128, X##mio.v128); \
    XOReq128(Cio, X##mio.v128); \
    X##mu = LOAD64(state[19]); \
    XOReq64(Cu, X##mu); \
    X##sae.v128 = LOAD128(state[20]); \
    X##sa = X##sae.v128; \
    X##se = GET64HI(X##sae.v128, X##sae.v128); \
    XOReq128(Cae, X##sae.v128); \
    X##sio.v128 = LOAD128(state[22]); \
    X##si = X##sio.v128; \
    X##so = GET64HI(X##sio.v128, X##sio.v128); \
    XOReq128(Cio, X##sio.v128); \
    X##su = LOAD64(state[24]); \
    XOReq64(Cu, X##su); \

/*
 * Writes the 25 lanes held under prefix X back to `state` (indexed per
 * 64-bit lane).
 *
 * Lanes ba/ge, be/gi, ka/me and ke/mi only exist inside the pairs X##bage,
 * X##begi, X##kame and X##kemi after a round, so they are stored through
 * the scalar view (.v64[0] = first-named lane, .v64[1] = second-named
 * lane); `state` must therefore be an lvalue array of 64-bit integers.
 * All other lanes are stored from their scalar variables with STORE64.
 *
 * The bage/begi stores used to paste a literal `A` instead of the X
 * parameter, so lanes 0, 1, 6 and 7 always came from Abage/Abegi regardless
 * of the prefix passed in.  They now honour X; expansions with X = A are
 * unchanged.
 */
#define copyToState(state, X) \
    state[ 0] = X##bage.v64[0]; \
    state[ 1] = X##begi.v64[0]; \
    STORE64(state[ 2], X##bi); \
    STORE64(state[ 3], X##bo); \
    STORE64(state[ 4], X##bu); \
    STORE64(state[ 5], X##ga); \
    state[ 6] = X##bage.v64[1]; \
    state[ 7] = X##begi.v64[1]; \
    STORE64(state[ 8], X##go); \
    STORE64(state[ 9], X##gu); \
    state[10] = X##kame.v64[0]; \
    state[11] = X##kemi.v64[0]; \
    STORE64(state[12], X##ki); \
    STORE64(state[13], X##ko); \
    STORE64(state[14], X##ku); \
    STORE64(state[15], X##ma); \
    state[16] = X##kame.v64[1]; \
    state[17] = X##kemi.v64[1]; \
    STORE64(state[18], X##mo); \
    STORE64(state[19], X##mu); \
    STORE64(state[20], X##sa); \
    STORE64(state[21], X##se); \
    STORE64(state[22], X##si); \
    STORE64(state[23], X##so); \
    STORE64(state[24], X##su);
/*
 * Copies the lane variables with prefix Y into the ones with prefix X.
 * Lanes ba, ge, be, gi, ka, me, ke, mi are copied through the pairs bage,
 * begi, kame, kemi only (their scalar variables are not copied); the other
 * 17 lanes are copied through their scalar variables.
 */
#define copyStateVariables(X, Y) \
    X##bage = Y##bage; \
    X##begi = Y##begi; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##kame = Y##kame; \
    X##kemi = Y##kemi; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su; \

/* ===== */
    #endif

    #ifdef UseBebigokimisa
    #error "UseBebigokimisa cannot be used in combination with UseSSE"
    #endif
#elif defined(UseMMX)
    #include <mmintrin.h>

    /* One 64-bit lane per MMX register. */
    typedef __m64 V64;

    /* Moving a lane between a 64-bit integer object and a register: MSVC
       reinterprets the object through a pointer, other compilers convert
       the value with a cast. */
    #ifndef _MSC_VER
        #define LOAD64(a)       (V64)a
        #define STORE64(a, b)   a = (UINT64)b
    #else
        #define LOAD64(a)       *(V64*)&(a)
        #define STORE64(a, b)   *(V64*)&(a) = b
    #endif
    #define CONST64(a)          LOAD64(a)

    /* Lane arithmetic. */
    #define XOR64(a, b)         _mm_xor_si64(a, b)
    #define XOReq64(a, b)       a = XOR64(a, b)
    #define ANDnu64(a, b)       _mm_andnot_si64(a, b)
    #define ROL64(a, o)         _mm_or_si64(_mm_slli_si64(a, o), _mm_srli_si64(a, 64-(o)))

/* ===== "KeccakF-1600-18-simd64.macros" */
/* Code automatically generated by KeccakTools! */
/*
 * Declares every local the round and copy macros refer to by pasted name:
 * A* and E* are the two 25-lane state sets the rounds alternate between,
 * B* hold the rotated lanes, C* the per-vowel XOR sums and D* the values
 * XORed into each lane at the start of a round.
 * Expand once at the top of the function that uses those macros.
 */
#define declareABCDE \
    V64 Aba, Abe, Abi, Abo, Abu; \
    V64 Aga, Age, Agi, Ago, Agu; \
    V64 Aka, Ake, Aki, Ako, Aku; \
    V64 Ama, Ame, Ami, Amo, Amu; \
    V64 Asa, Ase, Asi, Aso, Asu; \
    V64 Bba, Bbe, Bbi, Bbo, Bbu; \
    V64 Bga, Bge, Bgi, Bgo, Bgu; \
    V64 Bka, Bke, Bki, Bko, Bku; \
    V64 Bma, Bme, Bmi, Bmo, Bmu; \
    V64 Bsa, Bse, Bsi, Bso, Bsu; \
    V64 Ca, Ce, Ci, Co, Cu; \
    V64 Da, De, Di, Do, Du; \
    V64 Eba, Ebe, Ebi, Ebo, Ebu; \
    V64 Ega, Ege, Egi, Ego, Egu; \
    V64 Eka, Eke, Eki, Eko, Eku; \
    V64 Ema, Eme, Emi, Emo, Emu; \
    V64 Esa, Ese, Esi, Eso, Esu; \

/* Seeds Ca..Cu for the first round: each one becomes the XOR of the five
   A lanes whose name ends in the same vowel. */
#define prepareTheta \
    Ca = XOR64(Ama, Asa); Ca = XOR64(Aka, Ca); Ca = XOR64(Aga, Ca); Ca = XOR64(Aba, Ca); \
    Ce = XOR64(Ame, Ase); Ce = XOR64(Ake, Ce); Ce = XOR64(Age, Ce); Ce = XOR64(Abe, Ce); \
    Ci = XOR64(Ami, Asi); Ci = XOR64(Aki, Ci); Ci = XOR64(Agi, Ci); Ci = XOR64(Abi, Ci); \
    Co = XOR64(Amo, Aso); Co = XOR64(Ako, Co); Co = XOR64(Ago, Co); Co = XOR64(Abo, Co); \
    Cu = XOR64(Amu, Asu); Cu = XOR64(Aku, Cu); Cu = XOR64(Agu, Cu); Cu = XOR64(Abu, Cu);
// --- Theta Rho Pi Chi Iota Prepare-theta
// --- 64-bit lanes mapped to 64-bit words
/*
 * One full round (MMX path), reading lanes A and writing lanes E.
 *   i  index into KeccakF1600RoundConstants, XORed into E##ba
 *   A  prefix of the input lanes; they are modified (the D values are XORed in)
 *   E  prefix of the output lanes
 * On entry Ca..Cu must hold the column parities of A. On exit Ca..Cu hold the
 * column parities of E, ready for the next round.
 * Each output lane is B ^ ((~B') & B'') where the B values are rotated A lanes.
 */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    Da = XOR64(Cu, ROL64(Ce, 1)); \
    De = XOR64(Ca, ROL64(Ci, 1)); \
    Di = XOR64(Ce, ROL64(Co, 1)); \
    Do = XOR64(Ci, ROL64(Cu, 1)); \
    Du = XOR64(Co, ROL64(Ca, 1)); \
\
    XOReq64(A##ba, Da); \
    Bba = A##ba; \
    XOReq64(A##ge, De); \
    Bbe = ROL64(A##ge, 44); \
    XOReq64(A##ki, Di); \
    Bbi = ROL64(A##ki, 43); \
    E##ba = XOR64(Bba, ANDnu64(Bbe, Bbi)); \
    XOReq64(E##ba, CONST64(KeccakF1600RoundConstants[i])); \
    Ca = E##ba; \
    XOReq64(A##mo, Do); \
    Bbo = ROL64(A##mo, 21); \
    E##be = XOR64(Bbe, ANDnu64(Bbi, Bbo)); \
    Ce = E##be; \
    XOReq64(A##su, Du); \
    Bbu = ROL64(A##su, 14); \
    E##bi = XOR64(Bbi, ANDnu64(Bbo, Bbu)); \
    Ci = E##bi; \
    E##bo = XOR64(Bbo, ANDnu64(Bbu, Bba)); \
    Co = E##bo; \
    E##bu = XOR64(Bbu, ANDnu64(Bba, Bbe)); \
    Cu = E##bu; \
\
    XOReq64(A##bo, Do); \
    Bga = ROL64(A##bo, 28); \
    XOReq64(A##gu, Du); \
    Bge = ROL64(A##gu, 20); \
    XOReq64(A##ka, Da); \
    Bgi = ROL64(A##ka, 3); \
    E##ga = XOR64(Bga, ANDnu64(Bge, Bgi)); \
    XOReq64(Ca, E##ga); \
    XOReq64(A##me, De); \
    Bgo = ROL64(A##me, 45); \
    E##ge = XOR64(Bge, ANDnu64(Bgi, Bgo)); \
    XOReq64(Ce, E##ge); \
    XOReq64(A##si, Di); \
    Bgu = ROL64(A##si, 61); \
    E##gi = XOR64(Bgi, ANDnu64(Bgo, Bgu)); \
    XOReq64(Ci, E##gi); \
    E##go = XOR64(Bgo, ANDnu64(Bgu, Bga)); \
    XOReq64(Co, E##go); \
    E##gu = XOR64(Bgu, ANDnu64(Bga, Bge)); \
    XOReq64(Cu, E##gu); \
\
    XOReq64(A##be, De); \
    Bka = ROL64(A##be, 1); \
    XOReq64(A##gi, Di); \
    Bke = ROL64(A##gi, 6); \
    XOReq64(A##ko, Do); \
    Bki = ROL64(A##ko, 25); \
    E##ka = XOR64(Bka, ANDnu64(Bke, Bki)); \
    XOReq64(Ca, E##ka); \
    XOReq64(A##mu, Du); \
    Bko = ROL64(A##mu, 8); \
    E##ke = XOR64(Bke, ANDnu64(Bki, Bko)); \
    XOReq64(Ce, E##ke); \
    XOReq64(A##sa, Da); \
    Bku = ROL64(A##sa, 18); \
    E##ki = XOR64(Bki, ANDnu64(Bko, Bku)); \
    XOReq64(Ci, E##ki); \
    E##ko = XOR64(Bko, ANDnu64(Bku, Bka)); \
    XOReq64(Co, E##ko); \
    E##ku = XOR64(Bku, ANDnu64(Bka, Bke)); \
    XOReq64(Cu, E##ku); \
\
    XOReq64(A##bu, Du); \
    Bma = ROL64(A##bu, 27); \
    XOReq64(A##ga, Da); \
    Bme = ROL64(A##ga, 36); \
    XOReq64(A##ke, De); \
    Bmi = ROL64(A##ke, 10); \
    E##ma = XOR64(Bma, ANDnu64(Bme, Bmi)); \
    XOReq64(Ca, E##ma); \
    XOReq64(A##mi, Di); \
    Bmo = ROL64(A##mi, 15); \
    E##me = XOR64(Bme, ANDnu64(Bmi, Bmo)); \
    XOReq64(Ce, E##me); \
    XOReq64(A##so, Do); \
    Bmu = ROL64(A##so, 56); \
    E##mi = XOR64(Bmi, ANDnu64(Bmo, Bmu)); \
    XOReq64(Ci, E##mi); \
    E##mo = XOR64(Bmo, ANDnu64(Bmu, Bma)); \
    XOReq64(Co, E##mo); \
    E##mu = XOR64(Bmu, ANDnu64(Bma, Bme)); \
    XOReq64(Cu, E##mu); \
\
    XOReq64(A##bi, Di); \
    Bsa = ROL64(A##bi, 62); \
    XOReq64(A##go, Do); \
    Bse = ROL64(A##go, 55); \
    XOReq64(A##ku, Du); \
    Bsi = ROL64(A##ku, 39); \
    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
    XOReq64(Ca, E##sa); \
    XOReq64(A##ma, Da); \
    Bso = ROL64(A##ma, 41); \
    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
    XOReq64(Ce, E##se); \
    XOReq64(A##se, De); \
    Bsu = ROL64(A##se, 2); \
    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
    XOReq64(Ci, E##si); \
    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
    XOReq64(Co, E##so); \
    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
    XOReq64(Cu, E##su); \
\

// --- Theta Rho Pi Chi Iota
// --- 64-bit lanes mapped to 64-bit words
/*
 * One full round (MMX path), reading lanes A and writing lanes E, without
 * recomputing the column parities: Ca..Cu are read but not updated. Intended
 * for the last round, where no following round needs the parities.
 *   i  index into KeccakF1600RoundConstants, XORed into E##ba
 *   A  prefix of the input lanes; they are modified (the D values are XORed in)
 *   E  prefix of the output lanes
 */
#define thetaRhoPiChiIota(i, A, E) \
    Da = XOR64(Cu, ROL64(Ce, 1)); \
    De = XOR64(Ca, ROL64(Ci, 1)); \
    Di = XOR64(Ce, ROL64(Co, 1)); \
    Do = XOR64(Ci, ROL64(Cu, 1)); \
    Du = XOR64(Co, ROL64(Ca, 1)); \
\
    XOReq64(A##ba, Da); \
    Bba = A##ba; \
    XOReq64(A##ge, De); \
    Bbe = ROL64(A##ge, 44); \
    XOReq64(A##ki, Di); \
    Bbi = ROL64(A##ki, 43); \
    E##ba = XOR64(Bba, ANDnu64(Bbe, Bbi)); \
    XOReq64(E##ba, CONST64(KeccakF1600RoundConstants[i])); \
    XOReq64(A##mo, Do); \
    Bbo = ROL64(A##mo, 21); \
    E##be = XOR64(Bbe, ANDnu64(Bbi, Bbo)); \
    XOReq64(A##su, Du); \
    Bbu = ROL64(A##su, 14); \
    E##bi = XOR64(Bbi, ANDnu64(Bbo, Bbu)); \
    E##bo = XOR64(Bbo, ANDnu64(Bbu, Bba)); \
    E##bu = XOR64(Bbu, ANDnu64(Bba, Bbe)); \
\
    XOReq64(A##bo, Do); \
    Bga = ROL64(A##bo, 28); \
    XOReq64(A##gu, Du); \
    Bge = ROL64(A##gu, 20); \
    XOReq64(A##ka, Da); \
    Bgi = ROL64(A##ka, 3); \
    E##ga = XOR64(Bga, ANDnu64(Bge, Bgi)); \
    XOReq64(A##me, De); \
    Bgo = ROL64(A##me, 45); \
    E##ge = XOR64(Bge, ANDnu64(Bgi, Bgo)); \
    XOReq64(A##si, Di); \
    Bgu = ROL64(A##si, 61); \
    E##gi = XOR64(Bgi, ANDnu64(Bgo, Bgu)); \
    E##go = XOR64(Bgo, ANDnu64(Bgu, Bga)); \
    E##gu = XOR64(Bgu, ANDnu64(Bga, Bge)); \
\
    XOReq64(A##be, De); \
    Bka = ROL64(A##be, 1); \
    XOReq64(A##gi, Di); \
    Bke = ROL64(A##gi, 6); \
    XOReq64(A##ko, Do); \
    Bki = ROL64(A##ko, 25); \
    E##ka = XOR64(Bka, ANDnu64(Bke, Bki)); \
    XOReq64(A##mu, Du); \
    Bko = ROL64(A##mu, 8); \
    E##ke = XOR64(Bke, ANDnu64(Bki, Bko)); \
    XOReq64(A##sa, Da); \
    Bku = ROL64(A##sa, 18); \
    E##ki = XOR64(Bki, ANDnu64(Bko, Bku)); \
    E##ko = XOR64(Bko, ANDnu64(Bku, Bka)); \
    E##ku = XOR64(Bku, ANDnu64(Bka, Bke)); \
\
    XOReq64(A##bu, Du); \
    Bma = ROL64(A##bu, 27); \
    XOReq64(A##ga, Da); \
    Bme = ROL64(A##ga, 36); \
    XOReq64(A##ke, De); \
    Bmi = ROL64(A##ke, 10); \
    E##ma = XOR64(Bma, ANDnu64(Bme, Bmi)); \
    XOReq64(A##mi, Di); \
    Bmo = ROL64(A##mi, 15); \
    E##me = XOR64(Bme, ANDnu64(Bmi, Bmo)); \
    XOReq64(A##so, Do); \
    Bmu = ROL64(A##so, 56); \
    E##mi = XOR64(Bmi, ANDnu64(Bmo, Bmu)); \
    E##mo = XOR64(Bmo, ANDnu64(Bmu, Bma)); \
    E##mu = XOR64(Bmu, ANDnu64(Bma, Bme)); \
\
    XOReq64(A##bi, Di); \
    Bsa = ROL64(A##bi, 62); \
    XOReq64(A##go, Do); \
    Bse = ROL64(A##go, 55); \
    XOReq64(A##ku, Du); \
    Bsi = ROL64(A##ku, 39); \
    E##sa = XOR64(Bsa, ANDnu64(Bse, Bsi)); \
    XOReq64(A##ma, Da); \
    Bso = ROL64(A##ma, 41); \
    E##se = XOR64(Bse, ANDnu64(Bsi, Bso)); \
    XOReq64(A##se, De); \
    Bsu = ROL64(A##se, 2); \
    E##si = XOR64(Bsi, ANDnu64(Bso, Bsu)); \
    E##so = XOR64(Bso, ANDnu64(Bsu, Bsa)); \
    E##su = XOR64(Bsu, ANDnu64(Bsa, Bse)); \
\

/*
 * Per-round 64-bit constants (MMX path). Entry i is XORed into lane "ba" by
 * the round macros invoked with round index i; there is one entry for each
 * of the 18 rounds.
 */
const UINT64 KeccakF1600RoundConstants[18] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808aULL,
    0x8000000080008000ULL,
    0x000000000000808bULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008aULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000aULL,
    0x000000008000808bULL,
    0x800000000000008bULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL };

/*
 * Loads the 25 lanes of state into the X.. variables (MMX path), XORing the
 * first 16 lanes (1024 bits) with input[0..15] on the way. Lanes 16..24 are
 * loaded unchanged. state and input are indexed as arrays of 64-bit words.
 */
#define copyFromStateAndXor1024bits(X, state, input) \
    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
    X##go = XOR64(LOAD64(state[ 8]), LOAD64(input[ 8])); \
    X##gu = XOR64(LOAD64(state[ 9]), LOAD64(input[ 9])); \
    X##ka = XOR64(LOAD64(state[10]), LOAD64(input[10])); \
    X##ke = XOR64(LOAD64(state[11]), LOAD64(input[11])); \
    X##ki = XOR64(LOAD64(state[12]), LOAD64(input[12])); \
    X##ko = XOR64(LOAD64(state[13]), LOAD64(input[13])); \
    X##ku = XOR64(LOAD64(state[14]), LOAD64(input[14])); \
    X##ma = XOR64(LOAD64(state[15]), LOAD64(input[15])); \
    X##me = LOAD64(state[16]); \
    X##mi = LOAD64(state[17]); \
    X##mo = LOAD64(state[18]); \
    X##mu = LOAD64(state[19]); \
    X##sa = LOAD64(state[20]); \
    X##se = LOAD64(state[21]); \
    X##si = LOAD64(state[22]); \
    X##so = LOAD64(state[23]); \
    X##su = LOAD64(state[24]); \

/*
 * Loads the 25 lanes of state into the X.. variables (MMX path), XORing the
 * first 8 lanes (512 bits) with input[0..7] on the way. Lanes 8..24 are
 * loaded unchanged. state and input are indexed as arrays of 64-bit words.
 */
#define copyFromStateAndXor512bits(X, state, input) \
    X##ba = XOR64(LOAD64(state[ 0]), LOAD64(input[ 0])); \
    X##be = XOR64(LOAD64(state[ 1]), LOAD64(input[ 1])); \
    X##bi = XOR64(LOAD64(state[ 2]), LOAD64(input[ 2])); \
    X##bo = XOR64(LOAD64(state[ 3]), LOAD64(input[ 3])); \
    X##bu = XOR64(LOAD64(state[ 4]), LOAD64(input[ 4])); \
    X##ga = XOR64(LOAD64(state[ 5]), LOAD64(input[ 5])); \
    X##ge = XOR64(LOAD64(state[ 6]), LOAD64(input[ 6])); \
    X##gi = XOR64(LOAD64(state[ 7]), LOAD64(input[ 7])); \
    X##go = LOAD64(state[ 8]); \
    X##gu = LOAD64(state[ 9]); \
    X##ka = LOAD64(state[10]); \
    X##ke = LOAD64(state[11]); \
    X##ki = LOAD64(state[12]); \
    X##ko = LOAD64(state[13]); \
    X##ku = LOAD64(state[14]); \
    X##ma = LOAD64(state[15]); \
    X##me = LOAD64(state[16]); \
    X##mi = LOAD64(state[17]); \
    X##mo = LOAD64(state[18]); \
    X##mu = LOAD64(state[19]); \
    X##sa = LOAD64(state[20]); \
    X##se = LOAD64(state[21]); \
    X##si = LOAD64(state[22]); \
    X##so = LOAD64(state[23]); \
    X##su = LOAD64(state[24]); \

/*
 * Loads the 25 lanes state[0..24] into the variables X##ba .. X##su
 * (MMX path). state is indexed as an array of 64-bit words.
 */
#define copyFromState(X, state) \
    X##ba = LOAD64(state[ 0]); \
    X##be = LOAD64(state[ 1]); \
    X##bi = LOAD64(state[ 2]); \
    X##bo = LOAD64(state[ 3]); \
    X##bu = LOAD64(state[ 4]); \
    X##ga = LOAD64(state[ 5]); \
    X##ge = LOAD64(state[ 6]); \
    X##gi = LOAD64(state[ 7]); \
    X##go = LOAD64(state[ 8]); \
    X##gu = LOAD64(state[ 9]); \
    X##ka = LOAD64(state[10]); \
    X##ke = LOAD64(state[11]); \
    X##ki = LOAD64(state[12]); \
    X##ko = LOAD64(state[13]); \
    X##ku = LOAD64(state[14]); \
    X##ma = LOAD64(state[15]); \
    X##me = LOAD64(state[16]); \
    X##mi = LOAD64(state[17]); \
    X##mo = LOAD64(state[18]); \
    X##mu = LOAD64(state[19]); \
    X##sa = LOAD64(state[20]); \
    X##se = LOAD64(state[21]); \
    X##si = LOAD64(state[22]); \
    X##so = LOAD64(state[23]); \
    X##su = LOAD64(state[24]); \

/*
 * Stores the 25 lane variables X##ba .. X##su back into state[0..24]
 * (MMX path); the inverse of copyFromState.
 */
#define copyToState(state, X) \
    STORE64(state[ 0], X##ba); \
    STORE64(state[ 1], X##be); \
    STORE64(state[ 2], X##bi); \
    STORE64(state[ 3], X##bo); \
    STORE64(state[ 4], X##bu); \
    STORE64(state[ 5], X##ga); \
    STORE64(state[ 6], X##ge); \
    STORE64(state[ 7], X##gi); \
    STORE64(state[ 8], X##go); \
    STORE64(state[ 9], X##gu); \
    STORE64(state[10], X##ka); \
    STORE64(state[11], X##ke); \
    STORE64(state[12], X##ki); \
    STORE64(state[13], X##ko); \
    STORE64(state[14], X##ku); \
    STORE64(state[15], X##ma); \
    STORE64(state[16], X##me); \
    STORE64(state[17], X##mi); \
    STORE64(state[18], X##mo); \
    STORE64(state[19], X##mu); \
    STORE64(state[20], X##sa); \
    STORE64(state[21], X##se); \
    STORE64(state[22], X##si); \
    STORE64(state[23], X##so); \
    STORE64(state[24], X##su); \

/*
 * Copies all 25 lane variables with prefix Y into those with prefix X.
 * Used by the partially unrolled `rounds` variants whose loop body ends with
 * the result in E, to move it back into A.
 */
#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su; \

/* ===== */

    /* Configuration guard: the MMX round macros above have no lane-complementing
     * variant, so reject builds that also define UseBebigokimisa. */
    #ifdef UseBebigokimisa
    #error "UseBebigokimisa cannot be used in combination with UseMMX"
    #endif
#else
    /*
     * ROL64(a, offset): rotate the 64-bit value a left by offset bits.
     * offset must be in 1..63 (a shift by 64 would be undefined); every use in
     * this file passes a literal in that range.
     * Both arguments are fully parenthesized so that expressions such as
     * ROL64(x ^ y, n + 1) expand correctly.
     */
    #if defined(_MSC_VER)
    #define ROL64(a, offset) _rotl64((a), (offset))
    #else
    #define ROL64(a, offset) ((((UINT64)(a)) << (offset)) ^ (((UINT64)(a)) >> (64-(offset))))
    #endif

/* ===== "KeccakF-1600-18-64.macros" */
/* Code automatically generated by KeccakTools! */
/*
 * Declares the local working variables used by the round macros
 * (plain 64-bit word path):
 *   A.. / E..  two copies of the 25 state lanes (rounds alternate between them),
 *   B..        the 25 rotated lanes produced inside a round,
 *   Ca..Cu     the 5 column parities,
 *   Da..Du     the 5 values XORed into the lanes at the start of a round.
 * Must be expanded at the top of a function body.
 */
#define declareABCDE \
    UINT64 Aba, Abe, Abi, Abo, Abu; \
    UINT64 Aga, Age, Agi, Ago, Agu; \
    UINT64 Aka, Ake, Aki, Ako, Aku; \
    UINT64 Ama, Ame, Ami, Amo, Amu; \
    UINT64 Asa, Ase, Asi, Aso, Asu; \
    UINT64 Bba, Bbe, Bbi, Bbo, Bbu; \
    UINT64 Bga, Bge, Bgi, Bgo, Bgu; \
    UINT64 Bka, Bke, Bki, Bko, Bku; \
    UINT64 Bma, Bme, Bmi, Bmo, Bmu; \
    UINT64 Bsa, Bse, Bsi, Bso, Bsu; \
    UINT64 Ca, Ce, Ci, Co, Cu; \
    UINT64 Da, De, Di, Do, Du; \
    UINT64 Eba, Ebe, Ebi, Ebo, Ebu; \
    UINT64 Ega, Ege, Egi, Ego, Egu; \
    UINT64 Eka, Eke, Eki, Eko, Eku; \
    UINT64 Ema, Eme, Emi, Emo, Emu; \
    UINT64 Esa, Ese, Esi, Eso, Esu; \

/*
 * Computes the five column parities Ca..Cu as the XOR of the five A lanes
 * sharing the same final letter (a, e, i, o, u). Needed once before the first
 * round; later rounds obtain the parities from thetaRhoPiChiIotaPrepareTheta.
 */
#define prepareTheta \
    Ca = Aba^Aga^Aka^Ama^Asa; \
    Ce = Abe^Age^Ake^Ame^Ase; \
    Ci = Abi^Agi^Aki^Ami^Asi; \
    Co = Abo^Ago^Ako^Amo^Aso; \
    Cu = Abu^Agu^Aku^Amu^Asu; \

#ifdef UseBebigokimisa
// --- Theta Rho Pi Chi Iota Prepare-theta (lane complementing pattern 'bebigokimisa')
// --- 64-bit lanes mapped to 64-bit words
/*
 * One full round on plain 64-bit words, reading lanes A and writing lanes E,
 * for the lane-complemented state representation: the chi step uses a mix of
 * OR / AND with only a few explicit NOTs instead of a NOT in every term.
 * It only matches the standard round when the state is kept in the
 * complemented form set up by KeccakInitializeState under UseBebigokimisa.
 *   i  index into KeccakF1600RoundConstants, XORed into E##ba
 *   A  prefix of the input lanes; they are modified (the D values are XORed in)
 *   E  prefix of the output lanes
 * On entry Ca..Cu must hold the column parities of A. On exit Ca..Cu hold the
 * column parities of E, ready for the next round.
 */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    Da = Cu^ROL64(Ce, 1); \
    De = Ca^ROL64(Ci, 1); \
    Di = Ce^ROL64(Co, 1); \
    Do = Ci^ROL64(Cu, 1); \
    Du = Co^ROL64(Ca, 1); \
\
    A##ba ^= Da; \
    Bba = A##ba; \
    A##ge ^= De; \
    Bbe = ROL64(A##ge, 44); \
    A##ki ^= Di; \
    Bbi = ROL64(A##ki, 43); \
    E##ba =   Bba ^(  Bbe |  Bbi ); \
    E##ba ^= KeccakF1600RoundConstants[i]; \
    Ca = E##ba; \
    A##mo ^= Do; \
    Bbo = ROL64(A##mo, 21); \
    E##be =   Bbe ^((~Bbi)|  Bbo ); \
    Ce = E##be; \
    A##su ^= Du; \
    Bbu = ROL64(A##su, 14); \
    E##bi =   Bbi ^(  Bbo &  Bbu ); \
    Ci = E##bi; \
    E##bo =   Bbo ^(  Bbu |  Bba ); \
    Co = E##bo; \
    E##bu =   Bbu ^(  Bba &  Bbe ); \
    Cu = E##bu; \
\
    A##bo ^= Do; \
    Bga = ROL64(A##bo, 28); \
    A##gu ^= Du; \
    Bge = ROL64(A##gu, 20); \
    A##ka ^= Da; \
    Bgi = ROL64(A##ka, 3); \
    E##ga =   Bga ^(  Bge |  Bgi ); \
    Ca ^= E##ga; \
    A##me ^= De; \
    Bgo = ROL64(A##me, 45); \
    E##ge =   Bge ^(  Bgi &  Bgo ); \
    Ce ^= E##ge; \
    A##si ^= Di; \
    Bgu = ROL64(A##si, 61); \
    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
    Ci ^= E##gi; \
    E##go =   Bgo ^(  Bgu |  Bga ); \
    Co ^= E##go; \
    E##gu =   Bgu ^(  Bga &  Bge ); \
    Cu ^= E##gu; \
\
    A##be ^= De; \
    Bka = ROL64(A##be, 1); \
    A##gi ^= Di; \
    Bke = ROL64(A##gi, 6); \
    A##ko ^= Do; \
    Bki = ROL64(A##ko, 25); \
    E##ka =   Bka ^(  Bke |  Bki ); \
    Ca ^= E##ka; \
    A##mu ^= Du; \
    Bko = ROL64(A##mu, 8); \
    E##ke =   Bke ^(  Bki &  Bko ); \
    Ce ^= E##ke; \
    A##sa ^= Da; \
    Bku = ROL64(A##sa, 18); \
    E##ki =   Bki ^((~Bko)&  Bku ); \
    Ci ^= E##ki; \
    E##ko = (~Bko)^(  Bku |  Bka ); \
    Co ^= E##ko; \
    E##ku =   Bku ^(  Bka &  Bke ); \
    Cu ^= E##ku; \
\
    A##bu ^= Du; \
    Bma = ROL64(A##bu, 27); \
    A##ga ^= Da; \
    Bme = ROL64(A##ga, 36); \
    A##ke ^= De; \
    Bmi = ROL64(A##ke, 10); \
    E##ma =   Bma ^(  Bme &  Bmi ); \
    Ca ^= E##ma; \
    A##mi ^= Di; \
    Bmo = ROL64(A##mi, 15); \
    E##me =   Bme ^(  Bmi |  Bmo ); \
    Ce ^= E##me; \
    A##so ^= Do; \
    Bmu = ROL64(A##so, 56); \
    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
    Ci ^= E##mi; \
    E##mo = (~Bmo)^(  Bmu &  Bma ); \
    Co ^= E##mo; \
    E##mu =   Bmu ^(  Bma |  Bme ); \
    Cu ^= E##mu; \
\
    A##bi ^= Di; \
    Bsa = ROL64(A##bi, 62); \
    A##go ^= Do; \
    Bse = ROL64(A##go, 55); \
    A##ku ^= Du; \
    Bsi = ROL64(A##ku, 39); \
    E##sa =   Bsa ^((~Bse)&  Bsi ); \
    Ca ^= E##sa; \
    A##ma ^= Da; \
    Bso = ROL64(A##ma, 41); \
    E##se = (~Bse)^(  Bsi |  Bso ); \
    Ce ^= E##se; \
    A##se ^= De; \
    Bsu = ROL64(A##se, 2); \
    E##si =   Bsi ^(  Bso &  Bsu ); \
    Ci ^= E##si; \
    E##so =   Bso ^(  Bsu |  Bsa ); \
    Co ^= E##so; \
    E##su =   Bsu ^(  Bsa &  Bse ); \
    Cu ^= E##su; \
\

// --- Theta Rho Pi Chi Iota (lane complementing pattern 'bebigokimisa')
// --- 64-bit lanes mapped to 64-bit words
/*
 * Same round as the lane-complemented thetaRhoPiChiIotaPrepareTheta, but
 * without recomputing the column parities: Ca..Cu are read and left
 * unchanged. Intended for the last round.
 *   i  index into KeccakF1600RoundConstants, XORed into E##ba
 *   A  prefix of the input lanes; they are modified (the D values are XORed in)
 *   E  prefix of the output lanes
 */
#define thetaRhoPiChiIota(i, A, E) \
    Da = Cu^ROL64(Ce, 1); \
    De = Ca^ROL64(Ci, 1); \
    Di = Ce^ROL64(Co, 1); \
    Do = Ci^ROL64(Cu, 1); \
    Du = Co^ROL64(Ca, 1); \
\
    A##ba ^= Da; \
    Bba = A##ba; \
    A##ge ^= De; \
    Bbe = ROL64(A##ge, 44); \
    A##ki ^= Di; \
    Bbi = ROL64(A##ki, 43); \
    E##ba =   Bba ^(  Bbe |  Bbi ); \
    E##ba ^= KeccakF1600RoundConstants[i]; \
    A##mo ^= Do; \
    Bbo = ROL64(A##mo, 21); \
    E##be =   Bbe ^((~Bbi)|  Bbo ); \
    A##su ^= Du; \
    Bbu = ROL64(A##su, 14); \
    E##bi =   Bbi ^(  Bbo &  Bbu ); \
    E##bo =   Bbo ^(  Bbu |  Bba ); \
    E##bu =   Bbu ^(  Bba &  Bbe ); \
\
    A##bo ^= Do; \
    Bga = ROL64(A##bo, 28); \
    A##gu ^= Du; \
    Bge = ROL64(A##gu, 20); \
    A##ka ^= Da; \
    Bgi = ROL64(A##ka, 3); \
    E##ga =   Bga ^(  Bge |  Bgi ); \
    A##me ^= De; \
    Bgo = ROL64(A##me, 45); \
    E##ge =   Bge ^(  Bgi &  Bgo ); \
    A##si ^= Di; \
    Bgu = ROL64(A##si, 61); \
    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
    E##go =   Bgo ^(  Bgu |  Bga ); \
    E##gu =   Bgu ^(  Bga &  Bge ); \
\
    A##be ^= De; \
    Bka = ROL64(A##be, 1); \
    A##gi ^= Di; \
    Bke = ROL64(A##gi, 6); \
    A##ko ^= Do; \
    Bki = ROL64(A##ko, 25); \
    E##ka =   Bka ^(  Bke |  Bki ); \
    A##mu ^= Du; \
    Bko = ROL64(A##mu, 8); \
    E##ke =   Bke ^(  Bki &  Bko ); \
    A##sa ^= Da; \
    Bku = ROL64(A##sa, 18); \
    E##ki =   Bki ^((~Bko)&  Bku ); \
    E##ko = (~Bko)^(  Bku |  Bka ); \
    E##ku =   Bku ^(  Bka &  Bke ); \
\
    A##bu ^= Du; \
    Bma = ROL64(A##bu, 27); \
    A##ga ^= Da; \
    Bme = ROL64(A##ga, 36); \
    A##ke ^= De; \
    Bmi = ROL64(A##ke, 10); \
    E##ma =   Bma ^(  Bme &  Bmi ); \
    A##mi ^= Di; \
    Bmo = ROL64(A##mi, 15); \
    E##me =   Bme ^(  Bmi |  Bmo ); \
    A##so ^= Do; \
    Bmu = ROL64(A##so, 56); \
    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
    E##mo = (~Bmo)^(  Bmu &  Bma ); \
    E##mu =   Bmu ^(  Bma |  Bme ); \
\
    A##bi ^= Di; \
    Bsa = ROL64(A##bi, 62); \
    A##go ^= Do; \
    Bse = ROL64(A##go, 55); \
    A##ku ^= Du; \
    Bsi = ROL64(A##ku, 39); \
    E##sa =   Bsa ^((~Bse)&  Bsi ); \
    A##ma ^= Da; \
    Bso = ROL64(A##ma, 41); \
    E##se = (~Bse)^(  Bsi |  Bso ); \
    A##se ^= De; \
    Bsu = ROL64(A##se, 2); \
    E##si =   Bsi ^(  Bso &  Bsu ); \
    E##so =   Bso ^(  Bsu |  Bsa ); \
    E##su =   Bsu ^(  Bsa &  Bse ); \
\

#else // UseBebigokimisa
// --- Theta Rho Pi Chi Iota Prepare-theta
// --- 64-bit lanes mapped to 64-bit words
/*
 * One full round on plain 64-bit words (no lane complementing), reading
 * lanes A and writing lanes E.
 *   i  index into KeccakF1600RoundConstants, XORed into E##ba
 *   A  prefix of the input lanes; they are modified (the D values are XORed in)
 *   E  prefix of the output lanes
 * On entry Ca..Cu must hold the column parities of A. On exit Ca..Cu hold the
 * column parities of E, ready for the next round.
 * Each output lane is B ^ ((~B') & B'') where the B values are rotated A lanes.
 */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    Da = Cu^ROL64(Ce, 1); \
    De = Ca^ROL64(Ci, 1); \
    Di = Ce^ROL64(Co, 1); \
    Do = Ci^ROL64(Cu, 1); \
    Du = Co^ROL64(Ca, 1); \
\
    A##ba ^= Da; \
    Bba = A##ba; \
    A##ge ^= De; \
    Bbe = ROL64(A##ge, 44); \
    A##ki ^= Di; \
    Bbi = ROL64(A##ki, 43); \
    E##ba =   Bba ^((~Bbe)&  Bbi ); \
    E##ba ^= KeccakF1600RoundConstants[i]; \
    Ca = E##ba; \
    A##mo ^= Do; \
    Bbo = ROL64(A##mo, 21); \
    E##be =   Bbe ^((~Bbi)&  Bbo ); \
    Ce = E##be; \
    A##su ^= Du; \
    Bbu = ROL64(A##su, 14); \
    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
    Ci = E##bi; \
    E##bo =   Bbo ^((~Bbu)&  Bba ); \
    Co = E##bo; \
    E##bu =   Bbu ^((~Bba)&  Bbe ); \
    Cu = E##bu; \
\
    A##bo ^= Do; \
    Bga = ROL64(A##bo, 28); \
    A##gu ^= Du; \
    Bge = ROL64(A##gu, 20); \
    A##ka ^= Da; \
    Bgi = ROL64(A##ka, 3); \
    E##ga =   Bga ^((~Bge)&  Bgi ); \
    Ca ^= E##ga; \
    A##me ^= De; \
    Bgo = ROL64(A##me, 45); \
    E##ge =   Bge ^((~Bgi)&  Bgo ); \
    Ce ^= E##ge; \
    A##si ^= Di; \
    Bgu = ROL64(A##si, 61); \
    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
    Ci ^= E##gi; \
    E##go =   Bgo ^((~Bgu)&  Bga ); \
    Co ^= E##go; \
    E##gu =   Bgu ^((~Bga)&  Bge ); \
    Cu ^= E##gu; \
\
    A##be ^= De; \
    Bka = ROL64(A##be, 1); \
    A##gi ^= Di; \
    Bke = ROL64(A##gi, 6); \
    A##ko ^= Do; \
    Bki = ROL64(A##ko, 25); \
    E##ka =   Bka ^((~Bke)&  Bki ); \
    Ca ^= E##ka; \
    A##mu ^= Du; \
    Bko = ROL64(A##mu, 8); \
    E##ke =   Bke ^((~Bki)&  Bko ); \
    Ce ^= E##ke; \
    A##sa ^= Da; \
    Bku = ROL64(A##sa, 18); \
    E##ki =   Bki ^((~Bko)&  Bku ); \
    Ci ^= E##ki; \
    E##ko =   Bko ^((~Bku)&  Bka ); \
    Co ^= E##ko; \
    E##ku =   Bku ^((~Bka)&  Bke ); \
    Cu ^= E##ku; \
\
    A##bu ^= Du; \
    Bma = ROL64(A##bu, 27); \
    A##ga ^= Da; \
    Bme = ROL64(A##ga, 36); \
    A##ke ^= De; \
    Bmi = ROL64(A##ke, 10); \
    E##ma =   Bma ^((~Bme)&  Bmi ); \
    Ca ^= E##ma; \
    A##mi ^= Di; \
    Bmo = ROL64(A##mi, 15); \
    E##me =   Bme ^((~Bmi)&  Bmo ); \
    Ce ^= E##me; \
    A##so ^= Do; \
    Bmu = ROL64(A##so, 56); \
    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
    Ci ^= E##mi; \
    E##mo =   Bmo ^((~Bmu)&  Bma ); \
    Co ^= E##mo; \
    E##mu =   Bmu ^((~Bma)&  Bme ); \
    Cu ^= E##mu; \
\
    A##bi ^= Di; \
    Bsa = ROL64(A##bi, 62); \
    A##go ^= Do; \
    Bse = ROL64(A##go, 55); \
    A##ku ^= Du; \
    Bsi = ROL64(A##ku, 39); \
    E##sa =   Bsa ^((~Bse)&  Bsi ); \
    Ca ^= E##sa; \
    A##ma ^= Da; \
    Bso = ROL64(A##ma, 41); \
    E##se =   Bse ^((~Bsi)&  Bso ); \
    Ce ^= E##se; \
    A##se ^= De; \
    Bsu = ROL64(A##se, 2); \
    E##si =   Bsi ^((~Bso)&  Bsu ); \
    Ci ^= E##si; \
    E##so =   Bso ^((~Bsu)&  Bsa ); \
    Co ^= E##so; \
    E##su =   Bsu ^((~Bsa)&  Bse ); \
    Cu ^= E##su; \
\

// --- Theta Rho Pi Chi Iota
// --- 64-bit lanes mapped to 64-bit words
/*
 * Same round as the plain thetaRhoPiChiIotaPrepareTheta, but without
 * recomputing the column parities: Ca..Cu are read and left unchanged.
 * Intended for the last round.
 *   i  index into KeccakF1600RoundConstants, XORed into E##ba
 *   A  prefix of the input lanes; they are modified (the D values are XORed in)
 *   E  prefix of the output lanes
 */
#define thetaRhoPiChiIota(i, A, E) \
    Da = Cu^ROL64(Ce, 1); \
    De = Ca^ROL64(Ci, 1); \
    Di = Ce^ROL64(Co, 1); \
    Do = Ci^ROL64(Cu, 1); \
    Du = Co^ROL64(Ca, 1); \
\
    A##ba ^= Da; \
    Bba = A##ba; \
    A##ge ^= De; \
    Bbe = ROL64(A##ge, 44); \
    A##ki ^= Di; \
    Bbi = ROL64(A##ki, 43); \
    E##ba =   Bba ^((~Bbe)&  Bbi ); \
    E##ba ^= KeccakF1600RoundConstants[i]; \
    A##mo ^= Do; \
    Bbo = ROL64(A##mo, 21); \
    E##be =   Bbe ^((~Bbi)&  Bbo ); \
    A##su ^= Du; \
    Bbu = ROL64(A##su, 14); \
    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
    E##bo =   Bbo ^((~Bbu)&  Bba ); \
    E##bu =   Bbu ^((~Bba)&  Bbe ); \
\
    A##bo ^= Do; \
    Bga = ROL64(A##bo, 28); \
    A##gu ^= Du; \
    Bge = ROL64(A##gu, 20); \
    A##ka ^= Da; \
    Bgi = ROL64(A##ka, 3); \
    E##ga =   Bga ^((~Bge)&  Bgi ); \
    A##me ^= De; \
    Bgo = ROL64(A##me, 45); \
    E##ge =   Bge ^((~Bgi)&  Bgo ); \
    A##si ^= Di; \
    Bgu = ROL64(A##si, 61); \
    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
    E##go =   Bgo ^((~Bgu)&  Bga ); \
    E##gu =   Bgu ^((~Bga)&  Bge ); \
\
    A##be ^= De; \
    Bka = ROL64(A##be, 1); \
    A##gi ^= Di; \
    Bke = ROL64(A##gi, 6); \
    A##ko ^= Do; \
    Bki = ROL64(A##ko, 25); \
    E##ka =   Bka ^((~Bke)&  Bki ); \
    A##mu ^= Du; \
    Bko = ROL64(A##mu, 8); \
    E##ke =   Bke ^((~Bki)&  Bko ); \
    A##sa ^= Da; \
    Bku = ROL64(A##sa, 18); \
    E##ki =   Bki ^((~Bko)&  Bku ); \
    E##ko =   Bko ^((~Bku)&  Bka ); \
    E##ku =   Bku ^((~Bka)&  Bke ); \
\
    A##bu ^= Du; \
    Bma = ROL64(A##bu, 27); \
    A##ga ^= Da; \
    Bme = ROL64(A##ga, 36); \
    A##ke ^= De; \
    Bmi = ROL64(A##ke, 10); \
    E##ma =   Bma ^((~Bme)&  Bmi ); \
    A##mi ^= Di; \
    Bmo = ROL64(A##mi, 15); \
    E##me =   Bme ^((~Bmi)&  Bmo ); \
    A##so ^= Do; \
    Bmu = ROL64(A##so, 56); \
    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
    E##mo =   Bmo ^((~Bmu)&  Bma ); \
    E##mu =   Bmu ^((~Bma)&  Bme ); \
\
    A##bi ^= Di; \
    Bsa = ROL64(A##bi, 62); \
    A##go ^= Do; \
    Bse = ROL64(A##go, 55); \
    A##ku ^= Du; \
    Bsi = ROL64(A##ku, 39); \
    E##sa =   Bsa ^((~Bse)&  Bsi ); \
    A##ma ^= Da; \
    Bso = ROL64(A##ma, 41); \
    E##se =   Bse ^((~Bsi)&  Bso ); \
    A##se ^= De; \
    Bsu = ROL64(A##se, 2); \
    E##si =   Bsi ^((~Bso)&  Bsu ); \
    E##so =   Bso ^((~Bsu)&  Bsa ); \
    E##su =   Bsu ^((~Bsa)&  Bse ); \
\

#endif // UseBebigokimisa

/*
 * Per-round 64-bit constants (plain 64-bit word path). Entry i is XORed into
 * lane "ba" by the round macros invoked with round index i; there is one
 * entry for each of the 18 rounds.
 */
const UINT64 KeccakF1600RoundConstants[18] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808aULL,
    0x8000000080008000ULL,
    0x000000000000808bULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008aULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000aULL,
    0x000000008000808bULL,
    0x800000000000008bULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL };

/*
 * Loads the 25 lanes of state into the X.. variables, XORing the first 16
 * lanes (1024 bits) with input[0..15] on the way. Lanes 16..24 are loaded
 * unchanged. state and input are indexed as arrays of 64-bit words.
 */
#define copyFromStateAndXor1024bits(X, state, input) \
    X##ba = state[ 0]^input[ 0]; \
    X##be = state[ 1]^input[ 1]; \
    X##bi = state[ 2]^input[ 2]; \
    X##bo = state[ 3]^input[ 3]; \
    X##bu = state[ 4]^input[ 4]; \
    X##ga = state[ 5]^input[ 5]; \
    X##ge = state[ 6]^input[ 6]; \
    X##gi = state[ 7]^input[ 7]; \
    X##go = state[ 8]^input[ 8]; \
    X##gu = state[ 9]^input[ 9]; \
    X##ka = state[10]^input[10]; \
    X##ke = state[11]^input[11]; \
    X##ki = state[12]^input[12]; \
    X##ko = state[13]^input[13]; \
    X##ku = state[14]^input[14]; \
    X##ma = state[15]^input[15]; \
    X##me = state[16]; \
    X##mi = state[17]; \
    X##mo = state[18]; \
    X##mu = state[19]; \
    X##sa = state[20]; \
    X##se = state[21]; \
    X##si = state[22]; \
    X##so = state[23]; \
    X##su = state[24]; \

/*
 * Loads the 25 lanes of state into the X.. variables, XORing the first 8
 * lanes (512 bits) with input[0..7] on the way. Lanes 8..24 are loaded
 * unchanged. state and input are indexed as arrays of 64-bit words.
 */
#define copyFromStateAndXor512bits(X, state, input) \
    X##ba = state[ 0]^input[ 0]; \
    X##be = state[ 1]^input[ 1]; \
    X##bi = state[ 2]^input[ 2]; \
    X##bo = state[ 3]^input[ 3]; \
    X##bu = state[ 4]^input[ 4]; \
    X##ga = state[ 5]^input[ 5]; \
    X##ge = state[ 6]^input[ 6]; \
    X##gi = state[ 7]^input[ 7]; \
    X##go = state[ 8]; \
    X##gu = state[ 9]; \
    X##ka = state[10]; \
    X##ke = state[11]; \
    X##ki = state[12]; \
    X##ko = state[13]; \
    X##ku = state[14]; \
    X##ma = state[15]; \
    X##me = state[16]; \
    X##mi = state[17]; \
    X##mo = state[18]; \
    X##mu = state[19]; \
    X##sa = state[20]; \
    X##se = state[21]; \
    X##si = state[22]; \
    X##so = state[23]; \
    X##su = state[24]; \

/*
 * Loads the 25 lanes state[0..24] into the variables X##ba .. X##su.
 * state is indexed as an array of 64-bit words.
 */
#define copyFromState(X, state) \
    X##ba = state[ 0]; \
    X##be = state[ 1]; \
    X##bi = state[ 2]; \
    X##bo = state[ 3]; \
    X##bu = state[ 4]; \
    X##ga = state[ 5]; \
    X##ge = state[ 6]; \
    X##gi = state[ 7]; \
    X##go = state[ 8]; \
    X##gu = state[ 9]; \
    X##ka = state[10]; \
    X##ke = state[11]; \
    X##ki = state[12]; \
    X##ko = state[13]; \
    X##ku = state[14]; \
    X##ma = state[15]; \
    X##me = state[16]; \
    X##mi = state[17]; \
    X##mo = state[18]; \
    X##mu = state[19]; \
    X##sa = state[20]; \
    X##se = state[21]; \
    X##si = state[22]; \
    X##so = state[23]; \
    X##su = state[24]; \

/*
 * Stores the 25 lane variables X##ba .. X##su back into state[0..24];
 * the inverse of copyFromState.
 */
#define copyToState(state, X) \
    state[ 0] = X##ba; \
    state[ 1] = X##be; \
    state[ 2] = X##bi; \
    state[ 3] = X##bo; \
    state[ 4] = X##bu; \
    state[ 5] = X##ga; \
    state[ 6] = X##ge; \
    state[ 7] = X##gi; \
    state[ 8] = X##go; \
    state[ 9] = X##gu; \
    state[10] = X##ka; \
    state[11] = X##ke; \
    state[12] = X##ki; \
    state[13] = X##ko; \
    state[14] = X##ku; \
    state[15] = X##ma; \
    state[16] = X##me; \
    state[17] = X##mi; \
    state[18] = X##mo; \
    state[19] = X##mu; \
    state[20] = X##sa; \
    state[21] = X##se; \
    state[22] = X##si; \
    state[23] = X##so; \
    state[24] = X##su; \

/*
 * Copies all 25 lane variables with prefix Y into those with prefix X.
 * Used by the partially unrolled `rounds` variants whose loop body ends with
 * the result in E, to move it back into A.
 */
#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su; \

/* ===== */
#endif

/* ===== "KeccakPermutationOptimized.macros" */
/*
 * `rounds` expands to the complete 18-round permutation followed by a store
 * of the result into `state`. It expects the lanes to be loaded in the A..
 * variables and a variable named `state` in scope; every variant except the
 * fully unrolled one also needs an integer loop variable `i`.
 * Unrolling selects how many rounds are written out per loop iteration.
 * Rounds alternate A->E and E->A, so variants with an odd number of rounds
 * per iteration copy E back into A at the end of the loop body.
 */
#if (Unrolling == 18)
/* Fully unrolled; the last round skips the parity computation. */
#define rounds \
	prepareTheta \
	thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
	thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
	thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
	thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
	thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
	thetaRhoPiChiIotaPrepareTheta(10, A, E) \
	thetaRhoPiChiIotaPrepareTheta(11, E, A) \
	thetaRhoPiChiIotaPrepareTheta(12, A, E) \
	thetaRhoPiChiIotaPrepareTheta(13, E, A) \
	thetaRhoPiChiIotaPrepareTheta(14, A, E) \
	thetaRhoPiChiIotaPrepareTheta(15, E, A) \
	thetaRhoPiChiIotaPrepareTheta(16, A, E) \
	thetaRhoPiChiIota(17, E, A) \
    copyToState(state, A)
#elif (Unrolling == 9)
/* Two iterations of nine rounds; result ends in E and is copied back to A. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i+=9) { \
		thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+6, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+7, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+8, A, E) \
		copyStateVariables(A, E) \
    } \
    copyToState(state, A)
#elif (Unrolling == 6)
/* Three iterations of six rounds; result ends in A. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i+=6) { \
		thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
    } \
    copyToState(state, A)
#elif (Unrolling == 3)
/* Six iterations of three rounds; result ends in E and is copied back to A. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i+=3) { \
		thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
		copyStateVariables(A, E) \
    } \
    copyToState(state, A)
#elif (Unrolling == 2)
/* Nine iterations of two rounds; result ends in A. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    } \
    copyToState(state, A)
#elif (Unrolling == 1)
/* One round per iteration; E is copied back to A after every round. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i++) { \
		thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
		copyStateVariables(A, E) \
    } \
    copyToState(state, A)
#else
#error "Unrolling is not correctly specified!"
#endif

/*
 * Applies the 18-round permutation in place to the 25 64-bit lanes
 * state[0..24].
 *
 * The body is built from macros: declareABCDE declares the working variables,
 * copyFromState loads the lanes into A.., and `rounds` runs all rounds and
 * writes the result back to `state`.
 */
static
void KeccakPermutationOnWords(UINT64 *state)
{
    declareABCDE
#if (Unrolling != 18)
    /* round counter used by the looped variants of `rounds` */
    unsigned int i;
#endif

    copyFromState(A, state)
    rounds
#if defined(UseMMX)
    /* leave MMX state so that later x87 floating-point code is not disturbed */
    _mm_empty();
#endif
}

/*
 * XORs the 8 words input[0..7] (512 bits) into the first 8 lanes of the
 * state, then applies the 18-round permutation; the result is stored back
 * into state[0..24]. input is only read.
 */
static
void KeccakPermutationOnWordsAfterXoring512bits(UINT64 *state, const UINT64 *input)
{
    declareABCDE
#if (Unrolling != 18)
    /* round counter used by the looped variants of `rounds` */
    unsigned int i;
#endif

    copyFromStateAndXor512bits(A, state, input)
    rounds
#if defined(UseMMX)
    /* leave MMX state so that later x87 floating-point code is not disturbed */
    _mm_empty();
#endif
}

/*
 * XORs the 16 words input[0..15] (1024 bits) into the first 16 lanes of the
 * state, then applies the 18-round permutation; the result is stored back
 * into state[0..24]. input is only read.
 */
static
void KeccakPermutationOnWordsAfterXoring1024bits(UINT64 *state, const UINT64 *input)
{
    declareABCDE
#if (Unrolling != 18)
    /* round counter used by the looped variants of `rounds` */
    unsigned int i;
#endif

    copyFromStateAndXor1024bits(A, state, input)
    rounds
#if defined(UseMMX)
    /* leave MMX state so that later x87 floating-point code is not disturbed */
    _mm_empty();
#endif
}

/*
 * Global initialization hook of the permutation interface. This optimized
 * implementation has nothing to set up, so the function is empty.
 * The parameter list is spelled (void) to match the prototype declared above.
 */
RPM_GNUC_CONST
void KeccakInitialize(void)
{
}

/*
 * Resets a permutation state to its initial value.
 *
 * All KeccakPermutationSizeInBytes bytes are cleared. When UseBebigokimisa is
 * enabled, lanes 1, 2, 8, 12, 17 and 20 are then set to all ones, because
 * that code path keeps those lanes stored in complemented form.
 * state is accessed as an array of 64-bit words in that case.
 */
void KeccakInitializeState(unsigned char *state)
{
    memset(state, 0, KeccakPermutationSizeInBytes);
#ifdef UseBebigokimisa
    {
        /* indices of the lanes that are stored complemented */
        static const unsigned int complementedLanes[6] = { 1, 2, 8, 12, 17, 20 };
        UINT64 *lanes = (UINT64*)state;
        unsigned int k;

        for (k = 0; k < 6; k++)
            lanes[complementedLanes[k]] = ~(UINT64)0;
    }
#endif
}

void KeccakPermutation(unsigned char *state)
{
    // We assume the state is always stored as words
    KeccakPermutationOnWords((UINT64*)state);
}

#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
/*
 * Assembles one 64-bit word from 8 bytes, least significant byte first
 * (bytes[0] becomes bits 0..7, bytes[7] becomes bits 56..63).
 */
static
void fromBytesToWord(UINT64 *word, const UINT8 *bytes)
{
    const UINT8 *src = bytes;
    unsigned int shift;

    *word = 0;
    for (shift = 0; shift < 64; shift += 8)
        *word |= (UINT64)(*src++) << shift;
}

/*
 * Splits one 64-bit word into 8 bytes, least significant byte first
 * (bits 0..7 go to bytes[0], bits 56..63 to bytes[7]).
 */
static
void fromWordToBytes(UINT8 *bytes, const UINT64 word)
{
    unsigned int shift;

    for (shift = 0; shift < 64; shift += 8)
        *bytes++ = (word >> shift) & 0xFF;
}
#endif

void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    KeccakPermutationOnWordsAfterXoring1024bits((UINT64*)state, (const UINT64*)data);
#else
    UINT64 dataAsWords[16];
    unsigned int i;

    for(i=0; i<16; i++)
        fromBytesToWord(dataAsWords+i, data+(i*8));
    KeccakPermutationOnWordsAfterXoring1024bits((UINT64*)state, dataAsWords);
#endif
}

void KeccakAbsorb512bits(unsigned char *state, const unsigned char *data)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    KeccakPermutationOnWordsAfterXoring512bits((UINT64*)state, (const UINT64*)data);
#else
    UINT64 dataAsWords[8];
    unsigned int i;

    for(i=0; i<8; i++)
        fromBytesToWord(dataAsWords+i, data+(i*8));
    KeccakPermutationOnWordsAfterXoring512bits((UINT64*)state, dataAsWords);
#endif
}

/* Copy the first 1024 bits (128 bytes) of the state into `data`.
 *
 * state: permutation state; read only.
 * data:  receives 128 bytes.  On little-endian builds this is a plain memcpy;
 *        otherwise each UINT64 state word is serialised least significant
 *        byte first with fromWordToBytes.
 *
 * When UseBebigokimisa is defined, the bytes of words 1, 2, 8 and 12 are
 * complemented in the output.  This is done byte by byte rather than through
 * a UINT64 pointer: `data` is a caller-supplied byte buffer with no alignment
 * guarantee, and complementing every byte of a word gives exactly the same
 * result as complementing the word.
 */
void KeccakExtract1024bits(const unsigned char *state, unsigned char *data)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    memcpy(data, state, 128);
#else
    unsigned int i;

    for(i=0; i<16; i++)
        fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
#endif
#ifdef UseBebigokimisa
    {
        static const unsigned int complementedWords[] = { 1, 2, 8, 12 };
        unsigned int w, b;

        for (w = 0; w < sizeof(complementedWords)/sizeof(complementedWords[0]); w++) {
            unsigned char *p = data + 8*complementedWords[w];

            for (b = 0; b < 8; b++)
                p[b] = (unsigned char)~p[b];
        }
    }
#endif
}

/* Copy the first 512 bits (64 bytes) of the state into `data`.
 *
 * state: permutation state; read only.
 * data:  receives 64 bytes.  On little-endian builds this is a plain memcpy;
 *        otherwise each UINT64 state word is serialised least significant
 *        byte first with fromWordToBytes.
 *
 * When UseBebigokimisa is defined, the bytes of words 1 and 2 (output bytes
 * 8..23) are complemented.  This is done byte by byte rather than through a
 * UINT64 pointer: `data` is a caller-supplied byte buffer with no alignment
 * guarantee, and complementing every byte of a word gives exactly the same
 * result as complementing the word.
 */
void KeccakExtract512bits(const unsigned char *state, unsigned char *data)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    memcpy(data, state, 64);
#else
    unsigned int i;

    for(i=0; i<8; i++)
        fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
#endif
#ifdef UseBebigokimisa
    {
        unsigned int b;

        /* Words 1 and 2 are adjacent, so one pass over bytes 8..23 covers both. */
        for (b = 8; b < 24; b++)
            data[b] = (unsigned char)~data[b];
    }
#endif
}
/* ===== */

#elif OPTIMIZED == 32

/* ===== "KeccakPermutationOptimized32.c" */

#ifdef UseInterleaveTables
static int interleaveTablesBuilt = 0;
static UINT16 interleaveTable[65536];
static UINT16 deinterleaveTable[65536];

static
void buildInterleaveTables()
{
    UINT32 i, j;
    UINT16 x;

    if (!interleaveTablesBuilt) {
        for(i=0; i<65536; i++) {
            x = 0;
            for(j=0; j<16; j++) {
                if (i & (1 << j))
                    x |= (1 << (j/2 + 8*(j%2)));
            }
            interleaveTable[i] = x;
            deinterleaveTable[x] = (UINT16)i;
        }
        interleaveTablesBuilt = 1;
    }
}

#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)

/* Little-endian variant: read the j-th 16-bit unit of `source`, look it up in
 * interleaveTable, then XOR the low byte of the result into byte j of the
 * word pointed to by `even` and the high byte into byte j of the word pointed
 * to by `odd`.  `j` must be a literal digit: it is pasted into the name of
 * the temporary i<j>, which the caller has to declare as UINT16. */
#define xor2bytesIntoInterleavedWords(even, odd, source, j) \
    i##j = interleaveTable[((const UINT16*)source)[j]]; \
    ((UINT8*)even)[j] ^= i##j & 0xFF; \
    ((UINT8*)odd)[j] ^= i##j >> 8;

/* Little-endian variant: combine byte j of `even` (low byte) and byte j of
 * `odd` (high byte) into a 16-bit index, map it through deinterleaveTable and
 * store the result as the j-th 16-bit unit of `dest`.  `j` must be a literal
 * digit: it is pasted into the name of the temporary d<j>, which the caller
 * has to declare as UINT16. */
#define setInterleavedWordsInto2bytes(dest, even, odd, j) \
    d##j = deinterleaveTable[((even >> (j*8)) & 0xFF) ^ (((odd >> (j*8)) & 0xFF) << 8)]; \
    ((UINT16*)dest)[j] = d##j;

#else

/* Byte-order independent variant: build a 16-bit index from source[2*j] (low
 * byte) and source[2*j+1] (high byte), look it up in interleaveTable, then XOR
 * the low byte of the result into bits 8*j..8*j+7 of *even and the high byte
 * into the same bit positions of *odd.  `j` must be a literal digit: it is
 * pasted into the name of the temporary i<j>, which the caller has to declare
 * as UINT16.
 * The byte values are converted to UINT32 before shifting: i<j> promotes to
 * int, and shifting a value such as 0xFF left by 24 (j == 3) would overflow a
 * signed int, which is undefined behaviour. */
#define xor2bytesIntoInterleavedWords(even, odd, source, j) \
    i##j = interleaveTable[source[2*j] ^ ((UINT16)source[2*j+1] << 8)]; \
    *even ^= (UINT32)(i##j & 0xFF) << (j*8); \
    *odd ^= (UINT32)((i##j >> 8) & 0xFF) << (j*8);

/* Byte-order independent variant: combine byte j of `even` (low byte) and
 * byte j of `odd` (high byte) into a 16-bit index, map it through
 * deinterleaveTable and store the result in dest[2*j] (low byte) and
 * dest[2*j+1] (high byte).  `j` must be a literal digit: it is pasted into
 * the name of the temporary d<j>, which the caller has to declare as UINT16. */
#define setInterleavedWordsInto2bytes(dest, even, odd, j) \
    d##j = deinterleaveTable[((even >> (j*8)) & 0xFF) ^ (((odd >> (j*8)) & 0xFF) << 8)]; \
    dest[2*j] = d##j & 0xFF; \
    dest[2*j+1] = d##j >> 8;

#endif

/* Table-based variant: XOR 8 source bytes, in interleaved form, into a pair
 * of 32-bit words.
 *
 * even:   word receiving the low bytes of the four table results.
 * odd:    word receiving the high bytes of the four table results.
 * source: the 8 input bytes, processed two at a time.
 *
 * interleaveTable must already be filled (see buildInterleaveTables).
 */
static
void xor8bytesIntoInterleavedWords(UINT32 *even, UINT32 *odd, const UINT8* source)
{
    /* Temporaries named by the macro through token pasting (i##j). */
    UINT16 i0, i1, i2, i3;

    xor2bytesIntoInterleavedWords(even, odd, source, 0)
    xor2bytesIntoInterleavedWords(even, odd, source, 1)
    xor2bytesIntoInterleavedWords(even, odd, source, 2)
    xor2bytesIntoInterleavedWords(even, odd, source, 3)
}

/* Table-based variant: convert a pair of interleaved 32-bit words back into
 * 8 output bytes.
 *
 * dest: receives the 8 bytes, written two at a time.
 * even: word supplying the low byte of each table index.
 * odd:  word supplying the high byte of each table index.
 *
 * deinterleaveTable must already be filled (see buildInterleaveTables).
 */
static
void setInterleavedWordsInto8bytes(UINT8* dest, UINT32 even, UINT32 odd)
{
    /* Temporaries named by the macro through token pasting (d##j). */
    UINT16 d0, d1, d2, d3;

    setInterleavedWordsInto2bytes(dest, even, odd, 0)
    setInterleavedWordsInto2bytes(dest, even, odd, 1)
    setInterleavedWordsInto2bytes(dest, even, odd, 2)
    setInterleavedWordsInto2bytes(dest, even, odd, 3)
}

#else

// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
//
// Permute the bits of x with five masked swap steps, using shift distances
// 1, 2, 4, 8 and 16 in that order.  Each step exchanges the bit groups
// selected by its mask with the groups `shift` positions above them.
// Returns the permuted value.
static
UINT64 toInterleaving(UINT64 x) 
{
    static const struct {
        unsigned int shift;
        UINT64 mask;
    } steps[] = {
        {  1, 0x2222222222222222ULL },
        {  2, 0x0C0C0C0C0C0C0C0CULL },
        {  4, 0x00F000F000F000F0ULL },
        {  8, 0x0000FF000000FF00ULL },
        { 16, 0x00000000FFFF0000ULL }
    };
    unsigned int k;

    for (k = 0; k < sizeof(steps)/sizeof(steps[0]); k++) {
        UINT64 t = (x ^ (x >> steps[k].shift)) & steps[k].mask;

        x = x ^ t ^ (t << steps[k].shift);
    }

    return x;
}

// Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
//
// Permute the bits of x with five masked swap steps, using shift distances
// 16, 8, 4, 2 and 1 in that order -- the same steps as toInterleaving, applied
// in reverse order.  Returns the permuted value.
static
UINT64 fromInterleaving(UINT64 x)
{
    static const struct {
        unsigned int shift;
        UINT64 mask;
    } steps[] = {
        { 16, 0x00000000FFFF0000ULL },
        {  8, 0x0000FF000000FF00ULL },
        {  4, 0x00F000F000F000F0ULL },
        {  2, 0x0C0C0C0C0C0C0C0CULL },
        {  1, 0x2222222222222222ULL }
    };
    unsigned int k;

    for (k = 0; k < sizeof(steps)/sizeof(steps[0]); k++) {
        UINT64 t = (x ^ (x >> steps[k].shift)) & steps[k].mask;

        x = x ^ t ^ (t << steps[k].shift);
    }

    return x;
}

/* Table-free variant: read 8 source bytes as one UINT64, pass it through
 * toInterleaving and XOR the result into *evenAndOdd.
 * NOTE(review): the bytes are read through a UINT64 pointer, so the value
 * depends on host byte order and `source` is assumed suitably aligned --
 * confirm this path is only built where that holds. */
static
void xor8bytesIntoInterleavedWords(UINT64 *evenAndOdd, const UINT8* source)
{
    const UINT64 *lane = (const UINT64*)source;

    *evenAndOdd ^= toInterleaving(lane[0]);
}

/* Table-free variant: pass evenAndOdd through fromInterleaving and store the
 * result as one UINT64 at `dest`.
 * NOTE(review): the store goes through a UINT64 pointer, so the byte layout
 * depends on host byte order and `dest` is assumed suitably aligned --
 * confirm this path is only built where that holds. */
static
void setInterleavedWordsInto8bytes(UINT8* dest, UINT64 evenAndOdd)
{
    UINT64 *lane = (UINT64*)dest;

    lane[0] = fromInterleaving(evenAndOdd);
}

#endif

/* ROL32(a, offset): rotate the 32-bit value `a` left by `offset` bits.
 * `offset` must be in the range 1..31: with offset 0 the portable form would
 * shift right by 32, which is undefined for a 32-bit operand.
 * Both arguments are fully parenthesised in the portable form so that
 * expression arguments such as ROL32(x | y, n + 1) expand as intended. */
#if defined(_MSC_VER)
#define ROL32(a, offset) _rotl(a, offset)
#else
#define ROL32(a, offset) ((((UINT32)(a)) << (offset)) ^ (((UINT32)(a)) >> (32-(offset))))
#endif

/* ===== "KeccakF-1600-18-32.macros" */
/* Code automatically generated by KeccakTools! */
/* Declares every local UINT32 variable used by the round macros:
 *   A<xy>0/A<xy>1 and E<xy>0/E<xy>1 - two full sets of 25 word pairs each
 *                                     (rows b,g,k,m,s x columns a,e,i,o,u);
 *   B<xy>0/B<xy>1                   - one more set of the same shape, used
 *                                     as temporaries inside a round;
 *   C<y>0/C<y>1 and D<y>0/D<y>1     - five word pairs each, one per column.
 * The suffix 0/1 selects one of the two 32-bit words that make up a lane. */
#define declareABCDE \
    UINT32 Aba0, Abe0, Abi0, Abo0, Abu0; \
    UINT32 Aba1, Abe1, Abi1, Abo1, Abu1; \
    UINT32 Aga0, Age0, Agi0, Ago0, Agu0; \
    UINT32 Aga1, Age1, Agi1, Ago1, Agu1; \
    UINT32 Aka0, Ake0, Aki0, Ako0, Aku0; \
    UINT32 Aka1, Ake1, Aki1, Ako1, Aku1; \
    UINT32 Ama0, Ame0, Ami0, Amo0, Amu0; \
    UINT32 Ama1, Ame1, Ami1, Amo1, Amu1; \
    UINT32 Asa0, Ase0, Asi0, Aso0, Asu0; \
    UINT32 Asa1, Ase1, Asi1, Aso1, Asu1; \
    UINT32 Bba0, Bbe0, Bbi0, Bbo0, Bbu0; \
    UINT32 Bba1, Bbe1, Bbi1, Bbo1, Bbu1; \
    UINT32 Bga0, Bge0, Bgi0, Bgo0, Bgu0; \
    UINT32 Bga1, Bge1, Bgi1, Bgo1, Bgu1; \
    UINT32 Bka0, Bke0, Bki0, Bko0, Bku0; \
    UINT32 Bka1, Bke1, Bki1, Bko1, Bku1; \
    UINT32 Bma0, Bme0, Bmi0, Bmo0, Bmu0; \
    UINT32 Bma1, Bme1, Bmi1, Bmo1, Bmu1; \
    UINT32 Bsa0, Bse0, Bsi0, Bso0, Bsu0; \
    UINT32 Bsa1, Bse1, Bsi1, Bso1, Bsu1; \
    UINT32 Ca0, Ce0, Ci0, Co0, Cu0; \
    UINT32 Ca1, Ce1, Ci1, Co1, Cu1; \
    UINT32 Da0, De0, Di0, Do0, Du0; \
    UINT32 Da1, De1, Di1, Do1, Du1; \
    UINT32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; \
    UINT32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; \
    UINT32 Ega0, Ege0, Egi0, Ego0, Egu0; \
    UINT32 Ega1, Ege1, Egi1, Ego1, Egu1; \
    UINT32 Eka0, Eke0, Eki0, Eko0, Eku0; \
    UINT32 Eka1, Eke1, Eki1, Eko1, Eku1; \
    UINT32 Ema0, Eme0, Emi0, Emo0, Emu0; \
    UINT32 Ema1, Eme1, Emi1, Emo1, Emu1; \
    UINT32 Esa0, Ese0, Esi0, Eso0, Esu0; \
    UINT32 Esa1, Ese1, Esi1, Eso1, Esu1; \

/* Computes the ten column parities C<y>0/C<y>1 from the A variables: each one
 * is the XOR of the five A words of that column (rows b, g, k, m, s) with the
 * same 0/1 suffix.  The round macros read these C values on entry. */
#define prepareTheta \
    Ca0 = Aba0^Aga0^Aka0^Ama0^Asa0; \
    Ca1 = Aba1^Aga1^Aka1^Ama1^Asa1; \
    Ce0 = Abe0^Age0^Ake0^Ame0^Ase0; \
    Ce1 = Abe1^Age1^Ake1^Ame1^Ase1; \
    Ci0 = Abi0^Agi0^Aki0^Ami0^Asi0; \
    Ci1 = Abi1^Agi1^Aki1^Ami1^Asi1; \
    Co0 = Abo0^Ago0^Ako0^Amo0^Aso0; \
    Co1 = Abo1^Ago1^Ako1^Amo1^Aso1; \
    Cu0 = Abu0^Agu0^Aku0^Amu0^Asu0; \
    Cu1 = Abu1^Agu1^Aku1^Amu1^Asu1; \

#ifdef UseBebigokimisa
// --- Theta Rho Pi Chi Iota Prepare-theta (lane complementing pattern 'bebigokimisa')
// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words
//
// One full round, reading the variable set named by A and writing the set
// named by E (A and E are name prefixes pasted onto the word names):
//   1. derive the D words from the column parities C computed beforehand;
//   2. XOR the D words into the A words (A is modified in place), rotate them
//      with ROL32 into the B temporaries;
//   3. combine the B words of each row into the E words, using the mixed
//      AND/OR/NOT forms of the lane complementing pattern;
//   4. XOR KeccakF1600RoundConstants_int2_0[i] / _int2_1[i] into E##ba0 / E##ba1;
//   5. accumulate the column parities of the new E words back into C, so the
//      next round can start without a separate prepareTheta.
// Requires the variables of declareABCDE and valid C values on entry.
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    Da0 = Cu0^ROL32(Ce1, 1); \
    Da1 = Cu1^Ce0; \
    De0 = Ca0^ROL32(Ci1, 1); \
    De1 = Ca1^Ci0; \
    Di0 = Ce0^ROL32(Co1, 1); \
    Di1 = Ce1^Co0; \
    Do0 = Ci0^ROL32(Cu1, 1); \
    Do1 = Ci1^Cu0; \
    Du0 = Co0^ROL32(Ca1, 1); \
    Du1 = Co1^Ca0; \
\
    A##ba0 ^= Da0; \
    Bba0 = A##ba0; \
    A##ge0 ^= De0; \
    Bbe0 = ROL32(A##ge0, 22); \
    A##ki1 ^= Di1; \
    Bbi0 = ROL32(A##ki1, 22); \
    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
    Ca0 = E##ba0; \
    A##mo1 ^= Do1; \
    Bbo0 = ROL32(A##mo1, 11); \
    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
    Ce0 = E##be0; \
    A##su0 ^= Du0; \
    Bbu0 = ROL32(A##su0, 7); \
    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
    Ci0 = E##bi0; \
    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
    Co0 = E##bo0; \
    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
    Cu0 = E##bu0; \
\
    A##ba1 ^= Da1; \
    Bba1 = A##ba1; \
    A##ge1 ^= De1; \
    Bbe1 = ROL32(A##ge1, 22); \
    A##ki0 ^= Di0; \
    Bbi1 = ROL32(A##ki0, 21); \
    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
    Ca1 = E##ba1; \
    A##mo0 ^= Do0; \
    Bbo1 = ROL32(A##mo0, 10); \
    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
    Ce1 = E##be1; \
    A##su1 ^= Du1; \
    Bbu1 = ROL32(A##su1, 7); \
    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
    Ci1 = E##bi1; \
    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
    Co1 = E##bo1; \
    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
    Cu1 = E##bu1; \
\
    A##bo0 ^= Do0; \
    Bga0 = ROL32(A##bo0, 14); \
    A##gu0 ^= Du0; \
    Bge0 = ROL32(A##gu0, 10); \
    A##ka1 ^= Da1; \
    Bgi0 = ROL32(A##ka1, 2); \
    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
    Ca0 ^= E##ga0; \
    A##me1 ^= De1; \
    Bgo0 = ROL32(A##me1, 23); \
    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
    Ce0 ^= E##ge0; \
    A##si1 ^= Di1; \
    Bgu0 = ROL32(A##si1, 31); \
    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
    Ci0 ^= E##gi0; \
    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
    Co0 ^= E##go0; \
    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
    Cu0 ^= E##gu0; \
\
    A##bo1 ^= Do1; \
    Bga1 = ROL32(A##bo1, 14); \
    A##gu1 ^= Du1; \
    Bge1 = ROL32(A##gu1, 10); \
    A##ka0 ^= Da0; \
    Bgi1 = ROL32(A##ka0, 1); \
    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
    Ca1 ^= E##ga1; \
    A##me0 ^= De0; \
    Bgo1 = ROL32(A##me0, 22); \
    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
    Ce1 ^= E##ge1; \
    A##si0 ^= Di0; \
    Bgu1 = ROL32(A##si0, 30); \
    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
    Ci1 ^= E##gi1; \
    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
    Co1 ^= E##go1; \
    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
    Cu1 ^= E##gu1; \
\
    A##be1 ^= De1; \
    Bka0 = ROL32(A##be1, 1); \
    A##gi0 ^= Di0; \
    Bke0 = ROL32(A##gi0, 3); \
    A##ko1 ^= Do1; \
    Bki0 = ROL32(A##ko1, 13); \
    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
    Ca0 ^= E##ka0; \
    A##mu0 ^= Du0; \
    Bko0 = ROL32(A##mu0, 4); \
    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
    Ce0 ^= E##ke0; \
    A##sa0 ^= Da0; \
    Bku0 = ROL32(A##sa0, 9); \
    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
    Ci0 ^= E##ki0; \
    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
    Co0 ^= E##ko0; \
    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
    Cu0 ^= E##ku0; \
\
    A##be0 ^= De0; \
    Bka1 = A##be0; \
    A##gi1 ^= Di1; \
    Bke1 = ROL32(A##gi1, 3); \
    A##ko0 ^= Do0; \
    Bki1 = ROL32(A##ko0, 12); \
    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
    Ca1 ^= E##ka1; \
    A##mu1 ^= Du1; \
    Bko1 = ROL32(A##mu1, 4); \
    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
    Ce1 ^= E##ke1; \
    A##sa1 ^= Da1; \
    Bku1 = ROL32(A##sa1, 9); \
    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
    Ci1 ^= E##ki1; \
    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
    Co1 ^= E##ko1; \
    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
    Cu1 ^= E##ku1; \
\
    A##bu1 ^= Du1; \
    Bma0 = ROL32(A##bu1, 14); \
    A##ga0 ^= Da0; \
    Bme0 = ROL32(A##ga0, 18); \
    A##ke0 ^= De0; \
    Bmi0 = ROL32(A##ke0, 5); \
    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
    Ca0 ^= E##ma0; \
    A##mi1 ^= Di1; \
    Bmo0 = ROL32(A##mi1, 8); \
    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
    Ce0 ^= E##me0; \
    A##so0 ^= Do0; \
    Bmu0 = ROL32(A##so0, 28); \
    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
    Ci0 ^= E##mi0; \
    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
    Co0 ^= E##mo0; \
    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
    Cu0 ^= E##mu0; \
\
    A##bu0 ^= Du0; \
    Bma1 = ROL32(A##bu0, 13); \
    A##ga1 ^= Da1; \
    Bme1 = ROL32(A##ga1, 18); \
    A##ke1 ^= De1; \
    Bmi1 = ROL32(A##ke1, 5); \
    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
    Ca1 ^= E##ma1; \
    A##mi0 ^= Di0; \
    Bmo1 = ROL32(A##mi0, 7); \
    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
    Ce1 ^= E##me1; \
    A##so1 ^= Do1; \
    Bmu1 = ROL32(A##so1, 28); \
    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
    Ci1 ^= E##mi1; \
    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
    Co1 ^= E##mo1; \
    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
    Cu1 ^= E##mu1; \
\
    A##bi0 ^= Di0; \
    Bsa0 = ROL32(A##bi0, 31); \
    A##go1 ^= Do1; \
    Bse0 = ROL32(A##go1, 28); \
    A##ku1 ^= Du1; \
    Bsi0 = ROL32(A##ku1, 20); \
    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
    Ca0 ^= E##sa0; \
    A##ma1 ^= Da1; \
    Bso0 = ROL32(A##ma1, 21); \
    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
    Ce0 ^= E##se0; \
    A##se0 ^= De0; \
    Bsu0 = ROL32(A##se0, 1); \
    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
    Ci0 ^= E##si0; \
    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
    Co0 ^= E##so0; \
    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
    Cu0 ^= E##su0; \
\
    A##bi1 ^= Di1; \
    Bsa1 = ROL32(A##bi1, 31); \
    A##go0 ^= Do0; \
    Bse1 = ROL32(A##go0, 27); \
    A##ku0 ^= Du0; \
    Bsi1 = ROL32(A##ku0, 19); \
    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
    Ca1 ^= E##sa1; \
    A##ma0 ^= Da0; \
    Bso1 = ROL32(A##ma0, 20); \
    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
    Ce1 ^= E##se1; \
    A##se1 ^= De1; \
    Bsu1 = ROL32(A##se1, 1); \
    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
    Ci1 ^= E##si1; \
    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
    Co1 ^= E##so1; \
    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
    Cu1 ^= E##su1; \
\

// --- Theta Rho Pi Chi Iota (lane complementing pattern 'bebigokimisa')
// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words
//
// One full round, reading the variable set named by A and writing the set
// named by E.  Performs the same steps as thetaRhoPiChiIotaPrepareTheta
// (derive D from C, XOR D into A, rotate into B, combine into E, XOR the
// round constants with index i into E##ba0 / E##ba1) but does NOT refresh
// the C column parities afterwards.
// Requires the variables of declareABCDE and valid C values on entry.
#define thetaRhoPiChiIota(i, A, E) \
    Da0 = Cu0^ROL32(Ce1, 1); \
    Da1 = Cu1^Ce0; \
    De0 = Ca0^ROL32(Ci1, 1); \
    De1 = Ca1^Ci0; \
    Di0 = Ce0^ROL32(Co1, 1); \
    Di1 = Ce1^Co0; \
    Do0 = Ci0^ROL32(Cu1, 1); \
    Do1 = Ci1^Cu0; \
    Du0 = Co0^ROL32(Ca1, 1); \
    Du1 = Co1^Ca0; \
\
    A##ba0 ^= Da0; \
    Bba0 = A##ba0; \
    A##ge0 ^= De0; \
    Bbe0 = ROL32(A##ge0, 22); \
    A##ki1 ^= Di1; \
    Bbi0 = ROL32(A##ki1, 22); \
    E##ba0 =   Bba0 ^(  Bbe0 |  Bbi0 ); \
    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
    A##mo1 ^= Do1; \
    Bbo0 = ROL32(A##mo1, 11); \
    E##be0 =   Bbe0 ^((~Bbi0)|  Bbo0 ); \
    A##su0 ^= Du0; \
    Bbu0 = ROL32(A##su0, 7); \
    E##bi0 =   Bbi0 ^(  Bbo0 &  Bbu0 ); \
    E##bo0 =   Bbo0 ^(  Bbu0 |  Bba0 ); \
    E##bu0 =   Bbu0 ^(  Bba0 &  Bbe0 ); \
\
    A##ba1 ^= Da1; \
    Bba1 = A##ba1; \
    A##ge1 ^= De1; \
    Bbe1 = ROL32(A##ge1, 22); \
    A##ki0 ^= Di0; \
    Bbi1 = ROL32(A##ki0, 21); \
    E##ba1 =   Bba1 ^(  Bbe1 |  Bbi1 ); \
    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
    A##mo0 ^= Do0; \
    Bbo1 = ROL32(A##mo0, 10); \
    E##be1 =   Bbe1 ^((~Bbi1)|  Bbo1 ); \
    A##su1 ^= Du1; \
    Bbu1 = ROL32(A##su1, 7); \
    E##bi1 =   Bbi1 ^(  Bbo1 &  Bbu1 ); \
    E##bo1 =   Bbo1 ^(  Bbu1 |  Bba1 ); \
    E##bu1 =   Bbu1 ^(  Bba1 &  Bbe1 ); \
\
    A##bo0 ^= Do0; \
    Bga0 = ROL32(A##bo0, 14); \
    A##gu0 ^= Du0; \
    Bge0 = ROL32(A##gu0, 10); \
    A##ka1 ^= Da1; \
    Bgi0 = ROL32(A##ka1, 2); \
    E##ga0 =   Bga0 ^(  Bge0 |  Bgi0 ); \
    A##me1 ^= De1; \
    Bgo0 = ROL32(A##me1, 23); \
    E##ge0 =   Bge0 ^(  Bgi0 &  Bgo0 ); \
    A##si1 ^= Di1; \
    Bgu0 = ROL32(A##si1, 31); \
    E##gi0 =   Bgi0 ^(  Bgo0 |(~Bgu0)); \
    E##go0 =   Bgo0 ^(  Bgu0 |  Bga0 ); \
    E##gu0 =   Bgu0 ^(  Bga0 &  Bge0 ); \
\
    A##bo1 ^= Do1; \
    Bga1 = ROL32(A##bo1, 14); \
    A##gu1 ^= Du1; \
    Bge1 = ROL32(A##gu1, 10); \
    A##ka0 ^= Da0; \
    Bgi1 = ROL32(A##ka0, 1); \
    E##ga1 =   Bga1 ^(  Bge1 |  Bgi1 ); \
    A##me0 ^= De0; \
    Bgo1 = ROL32(A##me0, 22); \
    E##ge1 =   Bge1 ^(  Bgi1 &  Bgo1 ); \
    A##si0 ^= Di0; \
    Bgu1 = ROL32(A##si0, 30); \
    E##gi1 =   Bgi1 ^(  Bgo1 |(~Bgu1)); \
    E##go1 =   Bgo1 ^(  Bgu1 |  Bga1 ); \
    E##gu1 =   Bgu1 ^(  Bga1 &  Bge1 ); \
\
    A##be1 ^= De1; \
    Bka0 = ROL32(A##be1, 1); \
    A##gi0 ^= Di0; \
    Bke0 = ROL32(A##gi0, 3); \
    A##ko1 ^= Do1; \
    Bki0 = ROL32(A##ko1, 13); \
    E##ka0 =   Bka0 ^(  Bke0 |  Bki0 ); \
    A##mu0 ^= Du0; \
    Bko0 = ROL32(A##mu0, 4); \
    E##ke0 =   Bke0 ^(  Bki0 &  Bko0 ); \
    A##sa0 ^= Da0; \
    Bku0 = ROL32(A##sa0, 9); \
    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
    E##ko0 = (~Bko0)^(  Bku0 |  Bka0 ); \
    E##ku0 =   Bku0 ^(  Bka0 &  Bke0 ); \
\
    A##be0 ^= De0; \
    Bka1 = A##be0; \
    A##gi1 ^= Di1; \
    Bke1 = ROL32(A##gi1, 3); \
    A##ko0 ^= Do0; \
    Bki1 = ROL32(A##ko0, 12); \
    E##ka1 =   Bka1 ^(  Bke1 |  Bki1 ); \
    A##mu1 ^= Du1; \
    Bko1 = ROL32(A##mu1, 4); \
    E##ke1 =   Bke1 ^(  Bki1 &  Bko1 ); \
    A##sa1 ^= Da1; \
    Bku1 = ROL32(A##sa1, 9); \
    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
    E##ko1 = (~Bko1)^(  Bku1 |  Bka1 ); \
    E##ku1 =   Bku1 ^(  Bka1 &  Bke1 ); \
\
    A##bu1 ^= Du1; \
    Bma0 = ROL32(A##bu1, 14); \
    A##ga0 ^= Da0; \
    Bme0 = ROL32(A##ga0, 18); \
    A##ke0 ^= De0; \
    Bmi0 = ROL32(A##ke0, 5); \
    E##ma0 =   Bma0 ^(  Bme0 &  Bmi0 ); \
    A##mi1 ^= Di1; \
    Bmo0 = ROL32(A##mi1, 8); \
    E##me0 =   Bme0 ^(  Bmi0 |  Bmo0 ); \
    A##so0 ^= Do0; \
    Bmu0 = ROL32(A##so0, 28); \
    E##mi0 =   Bmi0 ^((~Bmo0)|  Bmu0 ); \
    E##mo0 = (~Bmo0)^(  Bmu0 &  Bma0 ); \
    E##mu0 =   Bmu0 ^(  Bma0 |  Bme0 ); \
\
    A##bu0 ^= Du0; \
    Bma1 = ROL32(A##bu0, 13); \
    A##ga1 ^= Da1; \
    Bme1 = ROL32(A##ga1, 18); \
    A##ke1 ^= De1; \
    Bmi1 = ROL32(A##ke1, 5); \
    E##ma1 =   Bma1 ^(  Bme1 &  Bmi1 ); \
    A##mi0 ^= Di0; \
    Bmo1 = ROL32(A##mi0, 7); \
    E##me1 =   Bme1 ^(  Bmi1 |  Bmo1 ); \
    A##so1 ^= Do1; \
    Bmu1 = ROL32(A##so1, 28); \
    E##mi1 =   Bmi1 ^((~Bmo1)|  Bmu1 ); \
    E##mo1 = (~Bmo1)^(  Bmu1 &  Bma1 ); \
    E##mu1 =   Bmu1 ^(  Bma1 |  Bme1 ); \
\
    A##bi0 ^= Di0; \
    Bsa0 = ROL32(A##bi0, 31); \
    A##go1 ^= Do1; \
    Bse0 = ROL32(A##go1, 28); \
    A##ku1 ^= Du1; \
    Bsi0 = ROL32(A##ku1, 20); \
    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
    A##ma1 ^= Da1; \
    Bso0 = ROL32(A##ma1, 21); \
    E##se0 = (~Bse0)^(  Bsi0 |  Bso0 ); \
    A##se0 ^= De0; \
    Bsu0 = ROL32(A##se0, 1); \
    E##si0 =   Bsi0 ^(  Bso0 &  Bsu0 ); \
    E##so0 =   Bso0 ^(  Bsu0 |  Bsa0 ); \
    E##su0 =   Bsu0 ^(  Bsa0 &  Bse0 ); \
\
    A##bi1 ^= Di1; \
    Bsa1 = ROL32(A##bi1, 31); \
    A##go0 ^= Do0; \
    Bse1 = ROL32(A##go0, 27); \
    A##ku0 ^= Du0; \
    Bsi1 = ROL32(A##ku0, 19); \
    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
    A##ma0 ^= Da0; \
    Bso1 = ROL32(A##ma0, 20); \
    E##se1 = (~Bse1)^(  Bsi1 |  Bso1 ); \
    A##se1 ^= De1; \
    Bsu1 = ROL32(A##se1, 1); \
    E##si1 =   Bsi1 ^(  Bso1 &  Bsu1 ); \
    E##so1 =   Bso1 ^(  Bsu1 |  Bsa1 ); \
    E##su1 =   Bsu1 ^(  Bsa1 &  Bse1 ); \
\

#else // UseBebigokimisa
// --- Theta Rho Pi Chi Iota Prepare-theta
// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words
//
// Variant used when UseBebigokimisa is not defined.  One full round, reading
// the variable set named by A and writing the set named by E (A and E are
// name prefixes pasted onto the word names):
//   1. derive the D words from the column parities C computed beforehand;
//   2. XOR the D words into the A words (A is modified in place), rotate them
//      with ROL32 into the B temporaries;
//   3. combine the B words of each row into the E words, every output being
//      of the uniform form  x ^ ((~y) & z);
//   4. XOR KeccakF1600RoundConstants_int2_0[i] / _int2_1[i] into E##ba0 / E##ba1;
//   5. accumulate the column parities of the new E words back into C, so the
//      next round can start without a separate prepareTheta.
// Requires the variables of declareABCDE and valid C values on entry.
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    Da0 = Cu0^ROL32(Ce1, 1); \
    Da1 = Cu1^Ce0; \
    De0 = Ca0^ROL32(Ci1, 1); \
    De1 = Ca1^Ci0; \
    Di0 = Ce0^ROL32(Co1, 1); \
    Di1 = Ce1^Co0; \
    Do0 = Ci0^ROL32(Cu1, 1); \
    Do1 = Ci1^Cu0; \
    Du0 = Co0^ROL32(Ca1, 1); \
    Du1 = Co1^Ca0; \
\
    A##ba0 ^= Da0; \
    Bba0 = A##ba0; \
    A##ge0 ^= De0; \
    Bbe0 = ROL32(A##ge0, 22); \
    A##ki1 ^= Di1; \
    Bbi0 = ROL32(A##ki1, 22); \
    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); \
    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
    Ca0 = E##ba0; \
    A##mo1 ^= Do1; \
    Bbo0 = ROL32(A##mo1, 11); \
    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
    Ce0 = E##be0; \
    A##su0 ^= Du0; \
    Bbu0 = ROL32(A##su0, 7); \
    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
    Ci0 = E##bi0; \
    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
    Co0 = E##bo0; \
    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
    Cu0 = E##bu0; \
\
    A##ba1 ^= Da1; \
    Bba1 = A##ba1; \
    A##ge1 ^= De1; \
    Bbe1 = ROL32(A##ge1, 22); \
    A##ki0 ^= Di0; \
    Bbi1 = ROL32(A##ki0, 21); \
    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
    Ca1 = E##ba1; \
    A##mo0 ^= Do0; \
    Bbo1 = ROL32(A##mo0, 10); \
    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
    Ce1 = E##be1; \
    A##su1 ^= Du1; \
    Bbu1 = ROL32(A##su1, 7); \
    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
    Ci1 = E##bi1; \
    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
    Co1 = E##bo1; \
    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
    Cu1 = E##bu1; \
\
    A##bo0 ^= Do0; \
    Bga0 = ROL32(A##bo0, 14); \
    A##gu0 ^= Du0; \
    Bge0 = ROL32(A##gu0, 10); \
    A##ka1 ^= Da1; \
    Bgi0 = ROL32(A##ka1, 2); \
    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
    Ca0 ^= E##ga0; \
    A##me1 ^= De1; \
    Bgo0 = ROL32(A##me1, 23); \
    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
    Ce0 ^= E##ge0; \
    A##si1 ^= Di1; \
    Bgu0 = ROL32(A##si1, 31); \
    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
    Ci0 ^= E##gi0; \
    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
    Co0 ^= E##go0; \
    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
    Cu0 ^= E##gu0; \
\
    A##bo1 ^= Do1; \
    Bga1 = ROL32(A##bo1, 14); \
    A##gu1 ^= Du1; \
    Bge1 = ROL32(A##gu1, 10); \
    A##ka0 ^= Da0; \
    Bgi1 = ROL32(A##ka0, 1); \
    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
    Ca1 ^= E##ga1; \
    A##me0 ^= De0; \
    Bgo1 = ROL32(A##me0, 22); \
    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
    Ce1 ^= E##ge1; \
    A##si0 ^= Di0; \
    Bgu1 = ROL32(A##si0, 30); \
    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
    Ci1 ^= E##gi1; \
    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
    Co1 ^= E##go1; \
    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
    Cu1 ^= E##gu1; \
\
    A##be1 ^= De1; \
    Bka0 = ROL32(A##be1, 1); \
    A##gi0 ^= Di0; \
    Bke0 = ROL32(A##gi0, 3); \
    A##ko1 ^= Do1; \
    Bki0 = ROL32(A##ko1, 13); \
    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
    Ca0 ^= E##ka0; \
    A##mu0 ^= Du0; \
    Bko0 = ROL32(A##mu0, 4); \
    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
    Ce0 ^= E##ke0; \
    A##sa0 ^= Da0; \
    Bku0 = ROL32(A##sa0, 9); \
    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
    Ci0 ^= E##ki0; \
    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
    Co0 ^= E##ko0; \
    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
    Cu0 ^= E##ku0; \
\
    A##be0 ^= De0; \
    Bka1 = A##be0; \
    A##gi1 ^= Di1; \
    Bke1 = ROL32(A##gi1, 3); \
    A##ko0 ^= Do0; \
    Bki1 = ROL32(A##ko0, 12); \
    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
    Ca1 ^= E##ka1; \
    A##mu1 ^= Du1; \
    Bko1 = ROL32(A##mu1, 4); \
    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
    Ce1 ^= E##ke1; \
    A##sa1 ^= Da1; \
    Bku1 = ROL32(A##sa1, 9); \
    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
    Ci1 ^= E##ki1; \
    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
    Co1 ^= E##ko1; \
    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
    Cu1 ^= E##ku1; \
\
    A##bu1 ^= Du1; \
    Bma0 = ROL32(A##bu1, 14); \
    A##ga0 ^= Da0; \
    Bme0 = ROL32(A##ga0, 18); \
    A##ke0 ^= De0; \
    Bmi0 = ROL32(A##ke0, 5); \
    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
    Ca0 ^= E##ma0; \
    A##mi1 ^= Di1; \
    Bmo0 = ROL32(A##mi1, 8); \
    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
    Ce0 ^= E##me0; \
    A##so0 ^= Do0; \
    Bmu0 = ROL32(A##so0, 28); \
    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
    Ci0 ^= E##mi0; \
    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
    Co0 ^= E##mo0; \
    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
    Cu0 ^= E##mu0; \
\
    A##bu0 ^= Du0; \
    Bma1 = ROL32(A##bu0, 13); \
    A##ga1 ^= Da1; \
    Bme1 = ROL32(A##ga1, 18); \
    A##ke1 ^= De1; \
    Bmi1 = ROL32(A##ke1, 5); \
    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
    Ca1 ^= E##ma1; \
    A##mi0 ^= Di0; \
    Bmo1 = ROL32(A##mi0, 7); \
    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
    Ce1 ^= E##me1; \
    A##so1 ^= Do1; \
    Bmu1 = ROL32(A##so1, 28); \
    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
    Ci1 ^= E##mi1; \
    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
    Co1 ^= E##mo1; \
    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
    Cu1 ^= E##mu1; \
\
    A##bi0 ^= Di0; \
    Bsa0 = ROL32(A##bi0, 31); \
    A##go1 ^= Do1; \
    Bse0 = ROL32(A##go1, 28); \
    A##ku1 ^= Du1; \
    Bsi0 = ROL32(A##ku1, 20); \
    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
    Ca0 ^= E##sa0; \
    A##ma1 ^= Da1; \
    Bso0 = ROL32(A##ma1, 21); \
    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
    Ce0 ^= E##se0; \
    A##se0 ^= De0; \
    Bsu0 = ROL32(A##se0, 1); \
    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
    Ci0 ^= E##si0; \
    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
    Co0 ^= E##so0; \
    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
    Cu0 ^= E##su0; \
\
    A##bi1 ^= Di1; \
    Bsa1 = ROL32(A##bi1, 31); \
    A##go0 ^= Do0; \
    Bse1 = ROL32(A##go0, 27); \
    A##ku0 ^= Du0; \
    Bsi1 = ROL32(A##ku0, 19); \
    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
    Ca1 ^= E##sa1; \
    A##ma0 ^= Da0; \
    Bso1 = ROL32(A##ma0, 20); \
    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
    Ce1 ^= E##se1; \
    A##se1 ^= De1; \
    Bsu1 = ROL32(A##se1, 1); \
    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
    Ci1 ^= E##si1; \
    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
    Co1 ^= E##so1; \
    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
    Cu1 ^= E##su1; \
\

// --- Theta Rho Pi Chi Iota
// --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words
//
// Variant used when UseBebigokimisa is not defined.  One full round, reading
// the variable set named by A and writing the set named by E.  Performs the
// same steps as thetaRhoPiChiIotaPrepareTheta (derive D from C, XOR D into A,
// rotate into B, combine into E with  x ^ ((~y) & z), XOR the round constants
// with index i into E##ba0 / E##ba1) but does NOT refresh the C column
// parities afterwards.
// Requires the variables of declareABCDE and valid C values on entry.
#define thetaRhoPiChiIota(i, A, E) \
    Da0 = Cu0^ROL32(Ce1, 1); \
    Da1 = Cu1^Ce0; \
    De0 = Ca0^ROL32(Ci1, 1); \
    De1 = Ca1^Ci0; \
    Di0 = Ce0^ROL32(Co1, 1); \
    Di1 = Ce1^Co0; \
    Do0 = Ci0^ROL32(Cu1, 1); \
    Do1 = Ci1^Cu0; \
    Du0 = Co0^ROL32(Ca1, 1); \
    Du1 = Co1^Ca0; \
\
    A##ba0 ^= Da0; \
    Bba0 = A##ba0; \
    A##ge0 ^= De0; \
    Bbe0 = ROL32(A##ge0, 22); \
    A##ki1 ^= Di1; \
    Bbi0 = ROL32(A##ki1, 22); \
    E##ba0 =   Bba0 ^((~Bbe0)&  Bbi0 ); \
    E##ba0 ^= KeccakF1600RoundConstants_int2_0[i]; \
    A##mo1 ^= Do1; \
    Bbo0 = ROL32(A##mo1, 11); \
    E##be0 =   Bbe0 ^((~Bbi0)&  Bbo0 ); \
    A##su0 ^= Du0; \
    Bbu0 = ROL32(A##su0, 7); \
    E##bi0 =   Bbi0 ^((~Bbo0)&  Bbu0 ); \
    E##bo0 =   Bbo0 ^((~Bbu0)&  Bba0 ); \
    E##bu0 =   Bbu0 ^((~Bba0)&  Bbe0 ); \
\
    A##ba1 ^= Da1; \
    Bba1 = A##ba1; \
    A##ge1 ^= De1; \
    Bbe1 = ROL32(A##ge1, 22); \
    A##ki0 ^= Di0; \
    Bbi1 = ROL32(A##ki0, 21); \
    E##ba1 =   Bba1 ^((~Bbe1)&  Bbi1 ); \
    E##ba1 ^= KeccakF1600RoundConstants_int2_1[i]; \
    A##mo0 ^= Do0; \
    Bbo1 = ROL32(A##mo0, 10); \
    E##be1 =   Bbe1 ^((~Bbi1)&  Bbo1 ); \
    A##su1 ^= Du1; \
    Bbu1 = ROL32(A##su1, 7); \
    E##bi1 =   Bbi1 ^((~Bbo1)&  Bbu1 ); \
    E##bo1 =   Bbo1 ^((~Bbu1)&  Bba1 ); \
    E##bu1 =   Bbu1 ^((~Bba1)&  Bbe1 ); \
\
    A##bo0 ^= Do0; \
    Bga0 = ROL32(A##bo0, 14); \
    A##gu0 ^= Du0; \
    Bge0 = ROL32(A##gu0, 10); \
    A##ka1 ^= Da1; \
    Bgi0 = ROL32(A##ka1, 2); \
    E##ga0 =   Bga0 ^((~Bge0)&  Bgi0 ); \
    A##me1 ^= De1; \
    Bgo0 = ROL32(A##me1, 23); \
    E##ge0 =   Bge0 ^((~Bgi0)&  Bgo0 ); \
    A##si1 ^= Di1; \
    Bgu0 = ROL32(A##si1, 31); \
    E##gi0 =   Bgi0 ^((~Bgo0)&  Bgu0 ); \
    E##go0 =   Bgo0 ^((~Bgu0)&  Bga0 ); \
    E##gu0 =   Bgu0 ^((~Bga0)&  Bge0 ); \
\
    A##bo1 ^= Do1; \
    Bga1 = ROL32(A##bo1, 14); \
    A##gu1 ^= Du1; \
    Bge1 = ROL32(A##gu1, 10); \
    A##ka0 ^= Da0; \
    Bgi1 = ROL32(A##ka0, 1); \
    E##ga1 =   Bga1 ^((~Bge1)&  Bgi1 ); \
    A##me0 ^= De0; \
    Bgo1 = ROL32(A##me0, 22); \
    E##ge1 =   Bge1 ^((~Bgi1)&  Bgo1 ); \
    A##si0 ^= Di0; \
    Bgu1 = ROL32(A##si0, 30); \
    E##gi1 =   Bgi1 ^((~Bgo1)&  Bgu1 ); \
    E##go1 =   Bgo1 ^((~Bgu1)&  Bga1 ); \
    E##gu1 =   Bgu1 ^((~Bga1)&  Bge1 ); \
\
    A##be1 ^= De1; \
    Bka0 = ROL32(A##be1, 1); \
    A##gi0 ^= Di0; \
    Bke0 = ROL32(A##gi0, 3); \
    A##ko1 ^= Do1; \
    Bki0 = ROL32(A##ko1, 13); \
    E##ka0 =   Bka0 ^((~Bke0)&  Bki0 ); \
    A##mu0 ^= Du0; \
    Bko0 = ROL32(A##mu0, 4); \
    E##ke0 =   Bke0 ^((~Bki0)&  Bko0 ); \
    A##sa0 ^= Da0; \
    Bku0 = ROL32(A##sa0, 9); \
    E##ki0 =   Bki0 ^((~Bko0)&  Bku0 ); \
    E##ko0 =   Bko0 ^((~Bku0)&  Bka0 ); \
    E##ku0 =   Bku0 ^((~Bka0)&  Bke0 ); \
\
    A##be0 ^= De0; \
    Bka1 = A##be0; \
    A##gi1 ^= Di1; \
    Bke1 = ROL32(A##gi1, 3); \
    A##ko0 ^= Do0; \
    Bki1 = ROL32(A##ko0, 12); \
    E##ka1 =   Bka1 ^((~Bke1)&  Bki1 ); \
    A##mu1 ^= Du1; \
    Bko1 = ROL32(A##mu1, 4); \
    E##ke1 =   Bke1 ^((~Bki1)&  Bko1 ); \
    A##sa1 ^= Da1; \
    Bku1 = ROL32(A##sa1, 9); \
    E##ki1 =   Bki1 ^((~Bko1)&  Bku1 ); \
    E##ko1 =   Bko1 ^((~Bku1)&  Bka1 ); \
    E##ku1 =   Bku1 ^((~Bka1)&  Bke1 ); \
\
    A##bu1 ^= Du1; \
    Bma0 = ROL32(A##bu1, 14); \
    A##ga0 ^= Da0; \
    Bme0 = ROL32(A##ga0, 18); \
    A##ke0 ^= De0; \
    Bmi0 = ROL32(A##ke0, 5); \
    E##ma0 =   Bma0 ^((~Bme0)&  Bmi0 ); \
    A##mi1 ^= Di1; \
    Bmo0 = ROL32(A##mi1, 8); \
    E##me0 =   Bme0 ^((~Bmi0)&  Bmo0 ); \
    A##so0 ^= Do0; \
    Bmu0 = ROL32(A##so0, 28); \
    E##mi0 =   Bmi0 ^((~Bmo0)&  Bmu0 ); \
    E##mo0 =   Bmo0 ^((~Bmu0)&  Bma0 ); \
    E##mu0 =   Bmu0 ^((~Bma0)&  Bme0 ); \
\
    A##bu0 ^= Du0; \
    Bma1 = ROL32(A##bu0, 13); \
    A##ga1 ^= Da1; \
    Bme1 = ROL32(A##ga1, 18); \
    A##ke1 ^= De1; \
    Bmi1 = ROL32(A##ke1, 5); \
    E##ma1 =   Bma1 ^((~Bme1)&  Bmi1 ); \
    A##mi0 ^= Di0; \
    Bmo1 = ROL32(A##mi0, 7); \
    E##me1 =   Bme1 ^((~Bmi1)&  Bmo1 ); \
    A##so1 ^= Do1; \
    Bmu1 = ROL32(A##so1, 28); \
    E##mi1 =   Bmi1 ^((~Bmo1)&  Bmu1 ); \
    E##mo1 =   Bmo1 ^((~Bmu1)&  Bma1 ); \
    E##mu1 =   Bmu1 ^((~Bma1)&  Bme1 ); \
\
    A##bi0 ^= Di0; \
    Bsa0 = ROL32(A##bi0, 31); \
    A##go1 ^= Do1; \
    Bse0 = ROL32(A##go1, 28); \
    A##ku1 ^= Du1; \
    Bsi0 = ROL32(A##ku1, 20); \
    E##sa0 =   Bsa0 ^((~Bse0)&  Bsi0 ); \
    A##ma1 ^= Da1; \
    Bso0 = ROL32(A##ma1, 21); \
    E##se0 =   Bse0 ^((~Bsi0)&  Bso0 ); \
    A##se0 ^= De0; \
    Bsu0 = ROL32(A##se0, 1); \
    E##si0 =   Bsi0 ^((~Bso0)&  Bsu0 ); \
    E##so0 =   Bso0 ^((~Bsu0)&  Bsa0 ); \
    E##su0 =   Bsu0 ^((~Bsa0)&  Bse0 ); \
\
    A##bi1 ^= Di1; \
    Bsa1 = ROL32(A##bi1, 31); \
    A##go0 ^= Do0; \
    Bse1 = ROL32(A##go0, 27); \
    A##ku0 ^= Du0; \
    Bsi1 = ROL32(A##ku0, 19); \
    E##sa1 =   Bsa1 ^((~Bse1)&  Bsi1 ); \
    A##ma0 ^= Da0; \
    Bso1 = ROL32(A##ma0, 20); \
    E##se1 =   Bse1 ^((~Bsi1)&  Bso1 ); \
    A##se1 ^= De1; \
    Bsu1 = ROL32(A##se1, 1); \
    E##si1 =   Bsi1 ^((~Bso1)&  Bsu1 ); \
    E##so1 =   Bso1 ^((~Bsu1)&  Bsa1 ); \
    E##su1 =   Bsu1 ^((~Bsa1)&  Bse1 ); \
\

#endif // UseBebigokimisa

/* Per-round constants XORed into word ba0 by the round macros, indexed by
 * round number 0..17.  Companion table of KeccakF1600RoundConstants_int2_1,
 * which supplies the constants for word ba1. */
const UINT32 KeccakF1600RoundConstants_int2_0[18] = {
    /* rounds  0.. 5 */ 0x00000001UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000001UL, 0x00000001UL,
    /* rounds  6..11 */ 0x00000001UL, 0x00000001UL, 0x00000000UL, 0x00000000UL, 0x00000001UL, 0x00000000UL,
    /* rounds 12..17 */ 0x00000001UL, 0x00000001UL, 0x00000001UL, 0x00000001UL, 0x00000000UL, 0x00000000UL
};

/* Per-round constants XORed into word ba1 by the round macros, indexed by
 * round number 0..17.  Companion table of KeccakF1600RoundConstants_int2_0,
 * which supplies the constants for word ba0. */
const UINT32 KeccakF1600RoundConstants_int2_1[18] = {
    /* rounds  0.. 5 */ 0x00000000UL, 0x00000089UL, 0x8000008bUL, 0x80008080UL, 0x0000008bUL, 0x00008000UL,
    /* rounds  6..11 */ 0x80008088UL, 0x80000082UL, 0x0000000bUL, 0x0000000aUL, 0x00008082UL, 0x00008003UL,
    /* rounds 12..17 */ 0x0000808bUL, 0x8000000bUL, 0x8000008aUL, 0x80000081UL, 0x80000081UL, 0x80000008UL
};

/* copyFromStateAndXor1024bits(X, state, input)
   Loads state[0..49] into the 50 variables X##ba0 ... X##su1.  The first 32
   elements (indices 0..31) are XORed with input[0..31] on the way in; the
   remaining 18 (indices 32..49) are copied unchanged.  With 32-bit elements
   that is 32*32 = 1024 bits of input (assumes `state` and `input` index
   32-bit words -- confirm at the call site).
   `state` and `input` are not parenthesized in the expansion, so pass plain
   identifiers.  The last line ends in a continuation backslash: the blank
   line that follows terminates the macro and must stay blank. */
#define copyFromStateAndXor1024bits(X, state, input) \
    X##ba0 = state[ 0]^input[ 0]; \
    X##ba1 = state[ 1]^input[ 1]; \
    X##be0 = state[ 2]^input[ 2]; \
    X##be1 = state[ 3]^input[ 3]; \
    X##bi0 = state[ 4]^input[ 4]; \
    X##bi1 = state[ 5]^input[ 5]; \
    X##bo0 = state[ 6]^input[ 6]; \
    X##bo1 = state[ 7]^input[ 7]; \
    X##bu0 = state[ 8]^input[ 8]; \
    X##bu1 = state[ 9]^input[ 9]; \
    X##ga0 = state[10]^input[10]; \
    X##ga1 = state[11]^input[11]; \
    X##ge0 = state[12]^input[12]; \
    X##ge1 = state[13]^input[13]; \
    X##gi0 = state[14]^input[14]; \
    X##gi1 = state[15]^input[15]; \
    X##go0 = state[16]^input[16]; \
    X##go1 = state[17]^input[17]; \
    X##gu0 = state[18]^input[18]; \
    X##gu1 = state[19]^input[19]; \
    X##ka0 = state[20]^input[20]; \
    X##ka1 = state[21]^input[21]; \
    X##ke0 = state[22]^input[22]; \
    X##ke1 = state[23]^input[23]; \
    X##ki0 = state[24]^input[24]; \
    X##ki1 = state[25]^input[25]; \
    X##ko0 = state[26]^input[26]; \
    X##ko1 = state[27]^input[27]; \
    X##ku0 = state[28]^input[28]; \
    X##ku1 = state[29]^input[29]; \
    X##ma0 = state[30]^input[30]; \
    X##ma1 = state[31]^input[31]; \
    X##me0 = state[32]; \
    X##me1 = state[33]; \
    X##mi0 = state[34]; \
    X##mi1 = state[35]; \
    X##mo0 = state[36]; \
    X##mo1 = state[37]; \
    X##mu0 = state[38]; \
    X##mu1 = state[39]; \
    X##sa0 = state[40]; \
    X##sa1 = state[41]; \
    X##se0 = state[42]; \
    X##se1 = state[43]; \
    X##si0 = state[44]; \
    X##si1 = state[45]; \
    X##so0 = state[46]; \
    X##so1 = state[47]; \
    X##su0 = state[48]; \
    X##su1 = state[49]; \

/* copyFromStateAndXor512bits(X, state, input)
   Loads state[0..49] into the 50 variables X##ba0 ... X##su1.  The first 16
   elements (indices 0..15) are XORed with input[0..15] on the way in; the
   remaining 34 (indices 16..49) are copied unchanged.  With 32-bit elements
   that is 16*32 = 512 bits of input (assumes `state` and `input` index
   32-bit words -- confirm at the call site).
   `state` and `input` are not parenthesized in the expansion, so pass plain
   identifiers.  The last line ends in a continuation backslash: the blank
   line that follows terminates the macro and must stay blank. */
#define copyFromStateAndXor512bits(X, state, input) \
    X##ba0 = state[ 0]^input[ 0]; \
    X##ba1 = state[ 1]^input[ 1]; \
    X##be0 = state[ 2]^input[ 2]; \
    X##be1 = state[ 3]^input[ 3]; \
    X##bi0 = state[ 4]^input[ 4]; \
    X##bi1 = state[ 5]^input[ 5]; \
    X##bo0 = state[ 6]^input[ 6]; \
    X##bo1 = state[ 7]^input[ 7]; \
    X##bu0 = state[ 8]^input[ 8]; \
    X##bu1 = state[ 9]^input[ 9]; \
    X##ga0 = state[10]^input[10]; \
    X##ga1 = state[11]^input[11]; \
    X##ge0 = state[12]^input[12]; \
    X##ge1 = state[13]^input[13]; \
    X##gi0 = state[14]^input[14]; \
    X##gi1 = state[15]^input[15]; \
    X##go0 = state[16]; \
    X##go1 = state[17]; \
    X##gu0 = state[18]; \
    X##gu1 = state[19]; \
    X##ka0 = state[20]; \
    X##ka1 = state[21]; \
    X##ke0 = state[22]; \
    X##ke1 = state[23]; \
    X##ki0 = state[24]; \
    X##ki1 = state[25]; \
    X##ko0 = state[26]; \
    X##ko1 = state[27]; \
    X##ku0 = state[28]; \
    X##ku1 = state[29]; \
    X##ma0 = state[30]; \
    X##ma1 = state[31]; \
    X##me0 = state[32]; \
    X##me1 = state[33]; \
    X##mi0 = state[34]; \
    X##mi1 = state[35]; \
    X##mo0 = state[36]; \
    X##mo1 = state[37]; \
    X##mu0 = state[38]; \
    X##mu1 = state[39]; \
    X##sa0 = state[40]; \
    X##sa1 = state[41]; \
    X##se0 = state[42]; \
    X##se1 = state[43]; \
    X##si0 = state[44]; \
    X##si1 = state[45]; \
    X##so0 = state[46]; \
    X##so1 = state[47]; \
    X##su0 = state[48]; \
    X##su1 = state[49]; \

/* copyFromState(X, state)
   Loads state[0..49] into the 50 variables X##ba0 ... X##su1, two
   consecutive elements per lane name (suffix 0 = even index, suffix 1 = odd
   index), in the order ba, be, bi, bo, bu, ga, ..., su.
   `state` is not parenthesized in the expansion, so pass a plain identifier.
   The last line ends in a continuation backslash: the blank line that
   follows terminates the macro and must stay blank. */
#define copyFromState(X, state) \
    X##ba0 = state[ 0]; \
    X##ba1 = state[ 1]; \
    X##be0 = state[ 2]; \
    X##be1 = state[ 3]; \
    X##bi0 = state[ 4]; \
    X##bi1 = state[ 5]; \
    X##bo0 = state[ 6]; \
    X##bo1 = state[ 7]; \
    X##bu0 = state[ 8]; \
    X##bu1 = state[ 9]; \
    X##ga0 = state[10]; \
    X##ga1 = state[11]; \
    X##ge0 = state[12]; \
    X##ge1 = state[13]; \
    X##gi0 = state[14]; \
    X##gi1 = state[15]; \
    X##go0 = state[16]; \
    X##go1 = state[17]; \
    X##gu0 = state[18]; \
    X##gu1 = state[19]; \
    X##ka0 = state[20]; \
    X##ka1 = state[21]; \
    X##ke0 = state[22]; \
    X##ke1 = state[23]; \
    X##ki0 = state[24]; \
    X##ki1 = state[25]; \
    X##ko0 = state[26]; \
    X##ko1 = state[27]; \
    X##ku0 = state[28]; \
    X##ku1 = state[29]; \
    X##ma0 = state[30]; \
    X##ma1 = state[31]; \
    X##me0 = state[32]; \
    X##me1 = state[33]; \
    X##mi0 = state[34]; \
    X##mi1 = state[35]; \
    X##mo0 = state[36]; \
    X##mo1 = state[37]; \
    X##mu0 = state[38]; \
    X##mu1 = state[39]; \
    X##sa0 = state[40]; \
    X##sa1 = state[41]; \
    X##se0 = state[42]; \
    X##se1 = state[43]; \
    X##si0 = state[44]; \
    X##si1 = state[45]; \
    X##so0 = state[46]; \
    X##so1 = state[47]; \
    X##su0 = state[48]; \
    X##su1 = state[49]; \

/* copyToState(state, X)
   Stores the 50 variables X##ba0 ... X##su1 into state[0..49], using the
   same variable-to-index mapping that copyFromState reads with.
   `state` is not parenthesized in the expansion, so pass a plain identifier.
   The last line ends in a continuation backslash: the blank line that
   follows terminates the macro and must stay blank. */
#define copyToState(state, X) \
    state[ 0] = X##ba0; \
    state[ 1] = X##ba1; \
    state[ 2] = X##be0; \
    state[ 3] = X##be1; \
    state[ 4] = X##bi0; \
    state[ 5] = X##bi1; \
    state[ 6] = X##bo0; \
    state[ 7] = X##bo1; \
    state[ 8] = X##bu0; \
    state[ 9] = X##bu1; \
    state[10] = X##ga0; \
    state[11] = X##ga1; \
    state[12] = X##ge0; \
    state[13] = X##ge1; \
    state[14] = X##gi0; \
    state[15] = X##gi1; \
    state[16] = X##go0; \
    state[17] = X##go1; \
    state[18] = X##gu0; \
    state[19] = X##gu1; \
    state[20] = X##ka0; \
    state[21] = X##ka1; \
    state[22] = X##ke0; \
    state[23] = X##ke1; \
    state[24] = X##ki0; \
    state[25] = X##ki1; \
    state[26] = X##ko0; \
    state[27] = X##ko1; \
    state[28] = X##ku0; \
    state[29] = X##ku1; \
    state[30] = X##ma0; \
    state[31] = X##ma1; \
    state[32] = X##me0; \
    state[33] = X##me1; \
    state[34] = X##mi0; \
    state[35] = X##mi1; \
    state[36] = X##mo0; \
    state[37] = X##mo1; \
    state[38] = X##mu0; \
    state[39] = X##mu1; \
    state[40] = X##sa0; \
    state[41] = X##sa1; \
    state[42] = X##se0; \
    state[43] = X##se1; \
    state[44] = X##si0; \
    state[45] = X##si1; \
    state[46] = X##so0; \
    state[47] = X##so1; \
    state[48] = X##su0; \
    state[49] = X##su1; \

/* copyStateVariables(X, Y)
   Assigns each of the 50 variables Y##ba0 ... Y##su1 to the variable of the
   same lane name with prefix X (destination first, source second).  The
   `rounds` macro uses it as copyStateVariables(A, E) after a loop body with
   an odd number of rounds, to bring the result back into the A set.
   The last line ends in a continuation backslash: the blank line that
   follows terminates the macro and must stay blank. */
#define copyStateVariables(X, Y) \
    X##ba0 = Y##ba0; \
    X##ba1 = Y##ba1; \
    X##be0 = Y##be0; \
    X##be1 = Y##be1; \
    X##bi0 = Y##bi0; \
    X##bi1 = Y##bi1; \
    X##bo0 = Y##bo0; \
    X##bo1 = Y##bo1; \
    X##bu0 = Y##bu0; \
    X##bu1 = Y##bu1; \
    X##ga0 = Y##ga0; \
    X##ga1 = Y##ga1; \
    X##ge0 = Y##ge0; \
    X##ge1 = Y##ge1; \
    X##gi0 = Y##gi0; \
    X##gi1 = Y##gi1; \
    X##go0 = Y##go0; \
    X##go1 = Y##go1; \
    X##gu0 = Y##gu0; \
    X##gu1 = Y##gu1; \
    X##ka0 = Y##ka0; \
    X##ka1 = Y##ka1; \
    X##ke0 = Y##ke0; \
    X##ke1 = Y##ke1; \
    X##ki0 = Y##ki0; \
    X##ki1 = Y##ki1; \
    X##ko0 = Y##ko0; \
    X##ko1 = Y##ko1; \
    X##ku0 = Y##ku0; \
    X##ku1 = Y##ku1; \
    X##ma0 = Y##ma0; \
    X##ma1 = Y##ma1; \
    X##me0 = Y##me0; \
    X##me1 = Y##me1; \
    X##mi0 = Y##mi0; \
    X##mi1 = Y##mi1; \
    X##mo0 = Y##mo0; \
    X##mo1 = Y##mo1; \
    X##mu0 = Y##mu0; \
    X##mu1 = Y##mu1; \
    X##sa0 = Y##sa0; \
    X##sa1 = Y##sa1; \
    X##se0 = Y##se0; \
    X##se1 = Y##se1; \
    X##si0 = Y##si0; \
    X##si1 = Y##si1; \
    X##so0 = Y##so0; \
    X##so1 = Y##so1; \
    X##su0 = Y##su0; \
    X##su1 = Y##su1; \

/* ===== */

/* ===== "KeccakPermutationOptimized.macros" */
/* `rounds`: the 18-round sequence, selected by the compile-time `Unrolling`
   setting (rounds per loop iteration; 18 means fully unrolled, no loop).
   Every variant:
     - starts with prepareTheta,
     - invokes the round macros with round indices 0..17 in increasing order,
       alternating the roles of the A and E variable sets from one round to
       the next,
     - ends with copyToState(state, A), so the final result must be in A.
   When a loop body holds an odd number of rounds (9, 3, 1) the result of
   the body lands in E, so copyStateVariables(A, E) moves it back to A before
   the next iteration.
   The expansion expects `state` in scope and, for the looped variants, an
   integer variable `i`.  Any other value of Unrolling is a compile error.
   The round macros themselves are defined elsewhere in this file. */
#if (Unrolling == 18)
/* Fully unrolled; only this variant uses thetaRhoPiChiIota (without the
   PrepareTheta suffix) for the last round, index 17. */
#define rounds \
	prepareTheta \
	thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
	thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
	thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
	thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
	thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
	thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
	thetaRhoPiChiIotaPrepareTheta(10, A, E) \
	thetaRhoPiChiIotaPrepareTheta(11, E, A) \
	thetaRhoPiChiIotaPrepareTheta(12, A, E) \
	thetaRhoPiChiIotaPrepareTheta(13, E, A) \
	thetaRhoPiChiIotaPrepareTheta(14, A, E) \
	thetaRhoPiChiIotaPrepareTheta(15, E, A) \
	thetaRhoPiChiIotaPrepareTheta(16, A, E) \
	thetaRhoPiChiIota(17, E, A) \
    copyToState(state, A)
#elif (Unrolling == 9)
/* Two iterations of nine rounds; odd count, so E is copied back to A. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i+=9) { \
		thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+6, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+7, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+8, A, E) \
		copyStateVariables(A, E) \
    } \
    copyToState(state, A)
#elif (Unrolling == 6)
/* Three iterations of six rounds; even count, result is already in A. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i+=6) { \
		thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
    } \
    copyToState(state, A)
#elif (Unrolling == 3)
/* Six iterations of three rounds; odd count, so E is copied back to A. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i+=3) { \
		thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
		thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
		thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
		copyStateVariables(A, E) \
    } \
    copyToState(state, A)
#elif (Unrolling == 2)
/* Nine iterations of two rounds; even count, result is already in A. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i+=2) { \
        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
    } \
    copyToState(state, A)
#elif (Unrolling == 1)
/* One round per iteration; E is copied back to A after every round. */
#define rounds \
    prepareTheta \
    for(i=0; i<18; i++) { \
		thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
		copyStateVariables(A, E) \
    } \
    copyToState(state, A)
#else
#error "Unrolling is not correctly specified!"
#endif
/* ===== */

/* Runs the `rounds` sequence on the 50-element word array `state`, in place:
   copyFromState loads state[0..49] into the A variables, and `rounds` ends by
   writing the A variables back with copyToState. */
static
void KeccakPermutationOnWords(UINT32 *state)
{
    declareABCDE
#if (Unrolling != 18)
    /* Loop counter referenced by name inside the looped forms of `rounds`. */
    unsigned int i;
#endif

    copyFromState(A, state)
    rounds
}

/* Combines 64 bytes (512 bits) of `input` into the start of `state` and then
   runs the `rounds` sequence on the whole state, in place.
   The input is consumed in 8 chunks of 8 bytes; chunk k is handed to
   xor8bytesIntoInterleavedWords together with state words 2k and 2k+1
   (passed as two UINT32 pointers with UseInterleaveTables, or as one UINT64
   pointer otherwise).  That helper is defined elsewhere in this file. */
static
void KeccakPermutationOnWordsAfterXoring512bits(UINT32 *state, const UINT8 *input)
{
    declareABCDE
    /* Used by the loops below and, for Unrolling != 18, by `rounds` as well;
       the name must stay `i`. */
    unsigned int i;

#ifdef UseInterleaveTables
    for(i=0; i<8; i++)
        xor8bytesIntoInterleavedWords(state+i*2, state+i*2+1, input+i*8);
#else
    for(i=0; i<8; i++)
        xor8bytesIntoInterleavedWords(((UINT64*)state)+i, input+i*8);
#endif
    copyFromState(A, state)
    rounds
}

/* Combines 128 bytes (1024 bits) of `input` into the start of `state` and
   then runs the `rounds` sequence on the whole state, in place.
   The input is consumed in 16 chunks of 8 bytes; chunk k is handed to
   xor8bytesIntoInterleavedWords together with state words 2k and 2k+1
   (passed as two UINT32 pointers with UseInterleaveTables, or as one UINT64
   pointer otherwise).  That helper is defined elsewhere in this file. */
static
void KeccakPermutationOnWordsAfterXoring1024bits(UINT32 *state, const UINT8 *input)
{
    declareABCDE
    /* Used by the loops below and, for Unrolling != 18, by `rounds` as well;
       the name must stay `i`. */
    unsigned int i;

#ifdef UseInterleaveTables
    for(i=0; i<16; i++)
        xor8bytesIntoInterleavedWords(state+i*2, state+i*2+1, input+i*8);
#else
    for(i=0; i<16; i++)
        xor8bytesIntoInterleavedWords(((UINT64*)state)+i, input+i*8);
#endif
    copyFromState(A, state)
    rounds
}

/* One-time setup for this permutation variant.  Builds the interleaving
   lookup tables when UseInterleaveTables is defined; otherwise there is
   nothing to prepare and the function is a no-op.
   Declared with an explicit (void) parameter list so the definition is a
   prototype and matches the declaration `void KeccakInitialize(void);`. */
void KeccakInitialize(void)
{
#ifdef UseInterleaveTables
    buildInterleaveTables();
#endif
}

/* Resets the permutation state to its initial value: all
   KeccakPermutationSizeInBytes bytes are cleared, and in the UseBebigokimisa
   build twelve of the 32-bit state words are then set to all-ones.
   NOTE(review): presumably those are the lanes that representation keeps
   complemented -- confirm against the round macros. */
void KeccakInitializeState(unsigned char *state)
{
    memset(state, 0, KeccakPermutationSizeInBytes);
#ifdef UseBebigokimisa
    {
        /* Indices, in 32-bit words, of the entries that start as ~0. */
        static const unsigned char allOnesWords[] = {
             2,  3,  4,  5, 16, 17, 24, 25, 34, 35, 40, 41
        };
        UINT32 *words = (UINT32*)state;
        unsigned int k;

        for(k=0; k<sizeof(allOnesWords)/sizeof(allOnesWords[0]); k++)
            words[allOnesWords[k]] = ~(UINT32)0;
    }
#endif
}

void KeccakPermutation(unsigned char *state)
{
    // We assume the state is always stored as interleaved 32-bit words
    KeccakPermutationOnWords((UINT32*)state);
}

void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
{
    KeccakPermutationOnWordsAfterXoring1024bits((UINT32*)state, data);
}

void KeccakAbsorb512bits(unsigned char *state, const unsigned char *data)
{
    KeccakPermutationOnWordsAfterXoring512bits((UINT32*)state, data);
}

/* Writes the first 1024 bits (16 chunks of 8 bytes) of `state` to `data`,
   converting each chunk with setInterleavedWordsInto8bytes (defined elsewhere
   in this file).  In the UseBebigokimisa build, eight of the output 32-bit
   words are bitwise inverted afterwards; their indices match the entries
   below 32 that KeccakInitializeState sets to all-ones. */
void KeccakExtract1024bits(const unsigned char *state, unsigned char *data)
{
    unsigned int lane;

#ifdef UseInterleaveTables
    for(lane=0; lane<16; lane++)
        setInterleavedWordsInto8bytes(data+lane*8, ((UINT32*)state)[lane*2], ((UINT32*)state)[lane*2+1]);
#else
    for(lane=0; lane<16; lane++)
        setInterleavedWordsInto8bytes(data+lane*8, ((UINT64*)state)[lane]);
#endif
#ifdef UseBebigokimisa
    {
        /* Indices, in 32-bit words of the output, that get inverted. */
        static const unsigned char invertedWords[] = { 2, 3, 4, 5, 16, 17, 24, 25 };
        UINT32 *out = (UINT32*)data;
        unsigned int k;

        for(k=0; k<sizeof(invertedWords)/sizeof(invertedWords[0]); k++)
            out[invertedWords[k]] = ~out[invertedWords[k]];
    }
#endif
}

/* Writes the first 512 bits (8 chunks of 8 bytes) of `state` to `data`,
   converting each chunk with setInterleavedWordsInto8bytes (defined elsewhere
   in this file).  In the UseBebigokimisa build, output 32-bit words 2..5 are
   bitwise inverted afterwards. */
void KeccakExtract512bits(const unsigned char *state, unsigned char *data)
{
    unsigned int lane;

#ifdef UseInterleaveTables
    for(lane=0; lane<8; lane++)
        setInterleavedWordsInto8bytes(data+lane*8, ((UINT32*)state)[lane*2], ((UINT32*)state)[lane*2+1]);
#else
    for(lane=0; lane<8; lane++)
        setInterleavedWordsInto8bytes(data+lane*8, ((UINT64*)state)[lane]);
#endif
#ifdef UseBebigokimisa
    {
        UINT32 *out = (UINT32*)data;
        unsigned int w;

        /* Words 2, 3, 4 and 5 of the output are inverted. */
        for(w=2; w<=5; w++)
            out[w] = ~out[w];
    }
#endif
}

/* ===== */

#else	/* OPTIMIZED != 32 or 64 */

/* ===== "KeccakPermutationReference.c" */

/* Number of rounds run by KeccakPermutationOnWords; also the size of the
   round-constant table. */
#define nrRounds 18
/* One 64-bit constant per round, XORed into word 0 by iota().  Filled in at
   run time by KeccakInitializeRoundConstants(), so KeccakInitialize() must
   run before the permutation is used. */
UINT64 KeccakRoundConstants[nrRounds];
/* Number of 64-bit words in the state (a 5x5 arrangement, see index()). */
#define nrWords 25
/* Rotation amount for each state word, applied by rho().  Filled in at run
   time by KeccakInitializeRhoOffsets(). */
unsigned int KeccakRhoOffsets[nrWords];

/* Forward declarations: the permutation on 64-bit words and its five step
   functions, all operating in place on a 25-word state. */
void KeccakPermutationOnWords(UINT64 *state);
void theta(UINT64 *A);
void rho(UINT64 *A);
void pi(UINT64 *A);
void chi(UINT64 *A);
void iota(UINT64 *A, unsigned int indexRound);

#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
/* Packs the byte-array state into 64-bit words, least significant byte
   first: byte j of each 8-byte group becomes bits 8j..8j+7 of the word.
   Only compiled when the platform is not little-endian. */
static
void fromBytesToWords(UINT64 *stateAsWords, const unsigned char *state)
{
    unsigned int lane;

    for(lane=0; lane<(KeccakPermutationSize/64); lane++) {
        const unsigned char *src = state + lane*(64/8);
        UINT64 word = 0;
        unsigned int b;

        for(b=0; b<(64/8); b++)
            word |= (UINT64)(src[b]) << (8*b);
        stateAsWords[lane] = word;
    }
}

/* Unpacks 64-bit words back into the byte-array state, least significant
   byte first (the inverse of fromBytesToWords).
   Only compiled when the platform is not little-endian. */
static
void fromWordsToBytes(unsigned char *state, const UINT64 *stateAsWords)
{
    unsigned int lane;

    for(lane=0; lane<(KeccakPermutationSize/64); lane++) {
        unsigned char *dst = state + lane*(64/8);
        UINT64 word = stateAsWords[lane];
        unsigned int b;

        /* Emit the low byte, then shift the next one into place. */
        for(b=0; b<(64/8); b++) {
            dst[b] = word & 0xFF;
            word >>= 8;
        }
    }
}
#endif

/* Applies the permutation in place to the state given as a byte array.
   On little-endian platforms the bytes are used directly as 64-bit words;
   otherwise they are converted to words, permuted, and converted back. */
void KeccakPermutation(unsigned char *state)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    KeccakPermutationOnWords((UINT64*)state);
#else
    UINT64 stateAsWords[KeccakPermutationSize/64];

    fromBytesToWords(stateAsWords, state);
    KeccakPermutationOnWords(stateAsWords);
    fromWordsToBytes(state, stateAsWords);
#endif
}

/* XORs the first dataLengthInBytes bytes of `data` into `state`, then
   applies the permutation to `state`. */
static
void KeccakPermutationAfterXor(unsigned char *state, const unsigned char *data, unsigned int dataLengthInBytes)
{
    const unsigned char *src = data;
    const unsigned char *srcEnd = data + dataLengthInBytes;
    unsigned char *dst = state;

    while(src != srcEnd)
        *dst++ ^= *src++;
    KeccakPermutation(state);
}

/* Runs nrRounds rounds on the 25-word state, in place.  Each round applies
   theta, rho, pi, chi and iota in that order; iota receives the round index. */
void KeccakPermutationOnWords(UINT64 *state)
{
    unsigned int round = 0;

    while(round < nrRounds) {
        theta(state);
        rho(state);
        pi(state);
        chi(state);
        iota(state, round);
        round++;
    }
}

/* index(x, y): position of the word at column x, row y in the 25-word state
   array.  Both coordinates are reduced modulo 5, so callers may pass values
   such as x+1 or 2*x+3*y without wrapping them first. */
#define index(x, y) (((x)%5)+5*((y)%5))
/* ROL64(a, offset): rotate the 64-bit value `a` left by `offset` bits
   (0 <= offset < 64).  offset == 0 is special-cased because a shift by 64
   would be undefined.  Every use of an argument is parenthesized so that
   expression arguments such as `t+1` expand correctly; both arguments are
   evaluated more than once, so they must not have side effects. */
#define ROL64(a, offset) (((offset) != 0) ? ((((UINT64)(a)) << (offset)) ^ (((UINT64)(a)) >> (64-(offset)))) : (a))

/* Theta step, in place: each word in column x is XORed with the parity of
   column x+4 (mod 5) and with the parity of column x+1 (mod 5) rotated left
   by one bit. */
void theta(UINT64 *A)
{
    UINT64 parity[5], rotatedParity[5];
    unsigned int col, row;

    /* Column parities and their 1-bit rotations. */
    for(col=0; col<5; col++) {
        UINT64 acc = 0;

        for(row=0; row<5; row++)
            acc ^= A[index(col, row)];
        parity[col] = acc;
        rotatedParity[col] = ROL64(acc, 1);
    }
    /* Fold the two neighbouring column values into every word of a column. */
    for(col=0; col<5; col++) {
        const UINT64 mix = rotatedParity[(col+1)%5] ^ parity[(col+4)%5];

        for(row=0; row<5; row++)
            A[index(col, row)] ^= mix;
    }
}

/* Rho step, in place: every state word is rotated left by its own entry in
   KeccakRhoOffsets.  The words are independent of one another, so they are
   simply visited in array order. */
void rho(UINT64 *A)
{
    unsigned int pos;

    for(pos=0; pos<nrWords; pos++)
        A[pos] = ROL64(A[pos], KeccakRhoOffsets[pos]);
}

/* Pi step, in place: the word at (x, y) moves to (y, 2x+3y), coordinates
   taken modulo 5.  A snapshot of the state is taken first so that reads are
   not affected by words already moved. */
void pi(UINT64 *A)
{
    UINT64 snapshot[25];
    unsigned int x, y;

    memcpy(snapshot, A, sizeof(snapshot));
    for(x=0; x<5; x++) {
        for(y=0; y<5; y++)
            A[index(y, 2*x+3*y)] = snapshot[index(x, y)];
    }
}

/* Chi step, in place, one row of five words at a time: each word is XORed
   with (NOT next word) AND (word after next), neighbours taken cyclically
   within the row.  New values are computed into a scratch row before being
   written back, so every output depends only on the old row. */
void chi(UINT64 *A)
{
    unsigned int x, y;

    for(y=0; y<5; y++) {
        UINT64 *row = A + 5*y;
        UINT64 fresh[5];

        for(x=0; x<5; x++)
            fresh[x] = row[x] ^ ((~row[(x+1)%5]) & row[(x+2)%5]);
        for(x=0; x<5; x++)
            row[x] = fresh[x];
    }
}

/* Iota step, in place: XORs the constant for round `indexRound` into the
   word at (0, 0).  indexRound must be below nrRounds. */
void iota(UINT64 *A, unsigned int indexRound)
{
    const UINT64 roundConstant = KeccakRoundConstants[indexRound];

    A[index(0, 0)] ^= roundConstant;
}

/* Advances the 8-bit LFSR stored in *LFSR by one step and returns the bit
   that was in its least significant position before the step (0 or 1). */
static
int LFSR86540(UINT8 *LFSR)
{
    const UINT8 before = *LFSR;
    UINT8 after = (UINT8)(before << 1);

    // Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
    // (the x^8 term falls off the top of the byte; 0x71 is the rest)
    if ((before & 0x80) != 0)
        after ^= 0x71;
    *LFSR = after;
    return (before & 0x01) != 0;
}

static
void KeccakInitializeRoundConstants(void)
{
    UINT8 LFSRstate = 0x01;
    unsigned int i, j, bitPosition;

    for(i=0; i<nrRounds; i++) {
        KeccakRoundConstants[i] = 0;
        for(j=0; j<7; j++) {
            bitPosition = (1<<j)-1; //2^j-1
            if (LFSR86540(&LFSRstate))
                KeccakRoundConstants[i] ^= (UINT64)1<<bitPosition;
        }
    }
}

/* Fills KeccakRhoOffsets[].  Position (0, 0) gets offset 0.  The other 24
   positions are visited starting at (1, 0) and stepping (x, y) -> (y, 2x+3y)
   mod 5; the t-th position visited (t = 1..24) gets t*(t+1)/2 mod 64. */
static
void KeccakInitializeRhoOffsets(void)
{
    unsigned int x = 1, y = 0, t;

    KeccakRhoOffsets[index(0, 0)] = 0;
    for(t=1; t<=24; t++) {
        const unsigned int nextY = (2*x+3*y) % 5;

        KeccakRhoOffsets[index(x, y)] = (t*(t+1)/2) % 64;
        x = y;      /* y is already in 0..4, so no reduction is needed */
        y = nextY;
    }
}

/* One-time setup for the reference permutation: computes the two lookup
   tables (rho rotation offsets and round constants) that rho() and iota()
   read.  The two tables are independent of each other. */
void KeccakInitialize(void)
{
    KeccakInitializeRhoOffsets();
    KeccakInitializeRoundConstants();
}

/* Resets the permutation state: all KeccakPermutationSizeInBytes bytes of
   `state` are set to zero. */
void KeccakInitializeState(unsigned char *state)
{
    unsigned int k;

    for(k=0; k<KeccakPermutationSizeInBytes; k++)
        state[k] = 0;
}

/* Absorbs one 1024-bit block: XORs 128 bytes of `data` into `state`, then
   permutes the state. */
void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
{
    const unsigned int blockBytes = 1024/8;

    KeccakPermutationAfterXor(state, data, blockBytes);
}

/* Absorbs one 512-bit block: XORs 64 bytes of `data` into `state`, then
   permutes the state. */
void KeccakAbsorb512bits(unsigned char *state, const unsigned char *data)
{
    const unsigned int blockBytes = 512/8;

    KeccakPermutationAfterXor(state, data, blockBytes);
}

/* Copies the first 1024 bits (128 bytes) of `state` to `data`.  The two
   buffers must not overlap. */
void KeccakExtract1024bits(const unsigned char *state, unsigned char *data)
{
    enum { extractedBytes = 1024/8 };

    memcpy(data, state, extractedBytes);
}

/* Copies the first 512 bits (64 bytes) of `state` to `data`.  The two
   buffers must not overlap. */
void KeccakExtract512bits(const unsigned char *state, unsigned char *data)
{
    enum { extractedBytes = 512/8 };

    memcpy(data, state, extractedBytes);
}

/* ===== */

#endif	/* OPTIMIZED */

// Prepares `state` for hashing with the requested output length.
//   hashbitlen  0 (arbitrary-length output), 224, 256, 384 or 512.
// Returns SUCCESS, or BAD_HASHLEN for any other hashbitlen (in which case
// `state` is left untouched; KeccakInitialize() has still been called).
// The capacity is 576 for 0/224/256 and 1088 for 384/512; the rate is the
// permutation size minus the capacity.
HashReturn Init(hashState *state, int hashbitlen)
{
    KeccakInitialize();
    switch(hashbitlen) {
        case 0: // Arbitrary length output
        case 224:
        case 256:
            state->capacity = 576;
            break;
        case 384:
        case 512:
            state->capacity = 1088;
            break;
        default:
            return BAD_HASHLEN;
    }
    state->rate = KeccakPermutationSize - state->capacity;
    state->hashbitlen = hashbitlen;
    state->diversifier = hashbitlen/8;

    // Fresh sponge: cleared permutation state, empty input queue, absorbing.
    KeccakInitializeState(state->state);
    memset(state->dataQueue, 0, KeccakMaximumRateInBytes);
    state->bitsInQueue = 0;
    state->bitsAvailableForSqueezing = 0;
    state->squeezing = 0;

    return SUCCESS;
}

// Absorbs the contents of the input queue as one rate-sized block and
// empties the queue.
static
void AbsorbQueue(hashState *state)
{
    // state->bitsInQueue is assumed to be a multiple of 8.
    // Zero the unused tail of the queue so a whole block is absorbed.
    memset(state->dataQueue+state->bitsInQueue/8, 0, state->rate/8-state->bitsInQueue/8);
    if (state->rate != 1024)
        KeccakAbsorb512bits(state->state, state->dataQueue);
    else
        KeccakAbsorb1024bits(state->state, state->dataQueue);
    state->bitsInQueue = 0;
}

// Feeds `databitlen` bits from `data` into the sponge.
// Returns FAIL if the queue already ends in a partial byte (a previous call
// supplied a bit length that was not a multiple of 8) or if the state has
// already switched to squeezing; returns SUCCESS otherwise.
// All positions and lengths below (i, partialBlock, ...) are counted in bits.
HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
{
    DataLength i, j;
    DataLength partialBlock, partialByte, wholeBlocks;
    BitSequence lastByte;
    const BitSequence *curData;

    if ((state->bitsInQueue % 8) != 0)
        return FAIL; // Only the last call may contain a partial byte
    if (state->squeezing)
        return FAIL; // Too late for additional input

    i = 0;
    while(i < databitlen) {
        // Fast path: the queue is empty and at least one whole rate-sized
        // block remains, so absorb straight from the caller's buffer.
        if ((state->bitsInQueue == 0) && (databitlen >= state->rate) && (i <= (databitlen-state->rate))) {
            wholeBlocks = (databitlen-i)/state->rate;
            curData = data+i/8;
            if (state->rate == 1024) {
                for(j=0; j<wholeBlocks; j++, curData+=1024/8) {
                    KeccakAbsorb1024bits(state->state, curData);
                }
            }
            else {
                for(j=0; j<wholeBlocks; j++, curData+=512/8) {
                    KeccakAbsorb512bits(state->state, curData);
                }
            }
            i += wholeBlocks*state->rate;
        }
        else {
            // Slow path: move as much as fits into the queue.
            partialBlock = databitlen - i;
            if (partialBlock+state->bitsInQueue > state->rate)
                partialBlock = state->rate-state->bitsInQueue;
            // Split off the trailing bits that do not fill a whole byte.
            partialByte = partialBlock % 8;
            partialBlock -= partialByte;
            memcpy(state->dataQueue+state->bitsInQueue/8, data+i/8, partialBlock/8);
            state->bitsInQueue += partialBlock;
            i += partialBlock;
            // A full queue is absorbed right away (this also empties it).
            if (state->bitsInQueue == state->rate)
                AbsorbQueue(state);
            if (partialByte > 0) {
                // Align the last partial byte to the least significant bits
                lastByte = data[i/8] >> (8-partialByte);
                state->dataQueue[state->bitsInQueue/8] = lastByte;
                state->bitsInQueue += partialByte;
                i += partialByte;
            }
        }
    }
    return SUCCESS;
}

// Finishes the absorbing phase: appends the padding (a single 1 bit rounded
// up to a byte boundary, then the diversifier byte, the rate/8 byte and a
// final 0x01 byte), absorbing the queue whenever it becomes full and once
// more at the end, then extracts the first output block into the queue and
// marks the state as squeezing.
static
void PadAndSwitchToSqueezingPhase(hashState *state)
{
    const unsigned int trailingBits = (unsigned int)(state->bitsInQueue % 8);
    int wideOutput;

    if (trailingBits == 0) {
        // Byte-aligned input: the padding bit starts a new byte.
        state->dataQueue[state->bitsInQueue/8] = 0x01;
        state->bitsInQueue += 8;
    }
    else {
        // The bits are numbered from 0=LSB to 7=MSB
        unsigned char padByte = 1 << trailingBits;
        state->dataQueue[state->bitsInQueue/8] |= padByte;
        state->bitsInQueue += 8-trailingBits;
    }
    if (state->bitsInQueue == state->rate)
        AbsorbQueue(state);

    state->dataQueue[state->bitsInQueue/8] = state->diversifier;
    state->bitsInQueue += 8;
    if (state->bitsInQueue == state->rate)
        AbsorbQueue(state);

    state->dataQueue[state->bitsInQueue/8] = state->rate/8;
    state->bitsInQueue += 8;
    if (state->bitsInQueue == state->rate)
        AbsorbQueue(state);

    state->dataQueue[state->bitsInQueue/8] = 0x01;
    state->bitsInQueue += 8;
    if (state->bitsInQueue > 0)
        AbsorbQueue(state);

    // 1024 bits are made available only at rate 1024 with an output length
    // above 512 or arbitrary (0); every other case gets 512 bits.
    wideOutput = (state->rate == 1024) && ((state->hashbitlen > 512) || (state->hashbitlen == 0));
    if (!wideOutput) {
        KeccakExtract512bits(state->state, state->dataQueue);
        state->bitsAvailableForSqueezing = 512;
    }
    else {
        KeccakExtract1024bits(state->state, state->dataQueue);
        state->bitsAvailableForSqueezing = 1024;
    }
    state->squeezing = 1;
}

// Pads the input, switches to the squeezing phase and, for a fixed output
// length (hashbitlen > 0), copies hashbitlen/8 bytes of digest to `hashval`.
// With hashbitlen == 0 nothing is written to `hashval`.
// Returns FAIL if the state is already squeezing, SUCCESS otherwise.
HashReturn Final(hashState *state, BitSequence *hashval)
{
    if (!state->squeezing) {
        PadAndSwitchToSqueezingPhase(state);
        if (state->hashbitlen > 0)
            memcpy(hashval, state->dataQueue, state->hashbitlen/8);
        return SUCCESS;
    }
    return FAIL; // Too late, we are already squeezing
}

// Produces `outputLength` bits of arbitrary-length output into `output`.
// Only valid after Final() on a state initialised with hashbitlen == 0, and
// only for lengths that are a multiple of 8 bits; FAIL is returned otherwise.
// Also returns FAIL if more output has to be generated while the rate is not
// 1024 (the permutation has already been applied at that point).
HashReturn Squeeze(hashState *state, BitSequence *output, DataLength outputLength)
{
    DataLength produced = 0;

    if (!state->squeezing)
        return FAIL; // Too early, we are still absorbing
    if (state->hashbitlen != 0)
        return FAIL; // Arbitrary length output is not permitted in this case
    if ((outputLength % 8) != 0)
        return FAIL; // Only multiple of 8 bits are allowed, truncation can be done at user level

    while(produced < outputLength) {
        DataLength chunk;

        // Refill the queue with a fresh block once it has been used up.
        if (state->bitsAvailableForSqueezing == 0) {
            KeccakPermutation(state->state);
            if (state->rate != 1024)
                return FAIL; // Inconsistent rate
            KeccakExtract1024bits(state->state, state->dataQueue);
            state->bitsAvailableForSqueezing = state->rate;
        }

        // Hand out as much as is both wanted and available.
        chunk = outputLength - produced;
        if (chunk > state->bitsAvailableForSqueezing)
            chunk = state->bitsAvailableForSqueezing;
        memcpy(output+produced/8, state->dataQueue+(state->rate-state->bitsAvailableForSqueezing)/8, chunk/8);
        state->bitsAvailableForSqueezing -= chunk;
        produced += chunk;
    }
    return SUCCESS;
}

// One-shot hashing: Init, Update with the whole message, then Final.
//   hashbitlen  fixed output length; 0 is rejected with BAD_HASHLEN here.
//   data        message, databitlen bits long.
//   hashval     receives the digest.
// Returns the first non-SUCCESS result of Init/Update/Final, else SUCCESS.
HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
    hashState state;
    HashReturn result;

    if (hashbitlen == 0)
        return BAD_HASHLEN; // Arbitrary length output not available through this API

    result = Init(&state, hashbitlen);
    if (result == SUCCESS)
        result = Update(&state, data, databitlen);
    if (result == SUCCESS)
        result = Final(&state, hashval);
    return result;
}
