/* hash.c     April 2012
 * Groestl ANSI C code optimised for 32-bit machines
 * Author: Thomas Krinninger
 *
 *  This work is based on the implementation of
 *          Soeren S. Thomsen and Krystian Matusiewicz
 *          
 *
 */

#include <stddef.h>
#include "groestl.h"
#include "groestl_tables.h"

#define P_TYPE 0
#define Q_TYPE 1

const uint8_t shift_Values[2][8] = {{0,1,2,3,4,5,6,7},{1,3,5,7,0,2,4,6}};

const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6};


#if BYTE_ORDER == LITTLE_ENDIAN
#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
															v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
															v1 = temp_var;}
#else
#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1>>(8*amount_bytes))|(v2<<(8*(4-amount_bytes))); \
															v2 = (v2>>(8*amount_bytes))|(v1<<(8*(4-amount_bytes))); \
															v1 = temp_var;}
#endif
  

#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t)				\
   tu = T[2*(uint32_t)x[4*c0+0]];			    \
   tl = T[2*(uint32_t)x[4*c0+0]+1];		    \
   tv1 = T[2*(uint32_t)x[4*c1+1]];			\
   tv2 = T[2*(uint32_t)x[4*c1+1]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
   tu ^= tv1;						\
   tl ^= tv2;						\
   tv1 = T[2*(uint32_t)x[4*c2+2]];			\
   tv2 = T[2*(uint32_t)x[4*c2+2]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
   tu ^= tv1;						\
   tl ^= tv2;   					\
   tv1 = T[2*(uint32_t)x[4*c3+3]];			\
   tv2 = T[2*(uint32_t)x[4*c3+3]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
   tu ^= tv1;						\
   tl ^= tv2;						\
   tl ^= T[2*(uint32_t)x[4*c4+0]];			\
   tu ^= T[2*(uint32_t)x[4*c4+0]+1];			\
   tv1 = T[2*(uint32_t)x[4*c5+1]];			\
   tv2 = T[2*(uint32_t)x[4*c5+1]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
   tl ^= tv1;						\
   tu ^= tv2;						\
   tv1 = T[2*(uint32_t)x[4*c6+2]];			\
   tv2 = T[2*(uint32_t)x[4*c6+2]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
   tl ^= tv1;						\
   tu ^= tv2;   					\
   tv1 = T[2*(uint32_t)x[4*c7+3]];			\
   tv2 = T[2*(uint32_t)x[4*c7+3]+1];			\
   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
   tl ^= tv1;						\
   tu ^= tv2;						\
   y[i] = tu;						\
   y[i+1] = tl;


/* compute one round of P (short variants) */
static void RND512P(uint8_t *x, uint32_t *y, uint32_t r) {
  uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
  uint32_t* x32 = (uint32_t*)x;
  x32[ 0] ^= SWAP32LE(0x00000000)^r;
  x32[ 2] ^= SWAP32LE(0x00000010)^r;
  x32[ 4] ^= SWAP32LE(0x00000020)^r;
  x32[ 6] ^= SWAP32LE(0x00000030)^r;
  x32[ 8] ^= SWAP32LE(0x00000040)^r;
  x32[10] ^= SWAP32LE(0x00000050)^r;
  x32[12] ^= SWAP32LE(0x00000060)^r;
  x32[14] ^= SWAP32LE(0x00000070)^r;
  COLUMN(x,y, 0,  0,  2,  4,  6,  9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y, 2,  2,  4,  6,  8, 11, 13, 15,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y, 4,  4,  6,  8, 10, 13, 15,  1,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y, 6,  6,  8, 10, 12, 15,  1,  3,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y, 8,  8, 10, 12, 14,  1,  3,  5,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y,10, 10, 12, 14,  0,  3,  5,  7,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y,12, 12, 14,  0,  2,  5,  7,  9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y,14, 14,  0,  2,  4,  7,  9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
}

/* compute one round of Q (short variants) */
static void RND512Q(uint8_t *x, uint32_t *y, uint32_t r) {
  uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
  uint32_t* x32 = (uint32_t*)x;
  x32[ 0] = ~x32[ 0];
  x32[ 1] ^= SWAP32LE(0xffffffff)^r;
  x32[ 2] = ~x32[ 2];
  x32[ 3] ^= SWAP32LE(0xefffffff)^r;
  x32[ 4] = ~x32[ 4];
  x32[ 5] ^= SWAP32LE(0xdfffffff)^r;
  x32[ 6] = ~x32[ 6];
  x32[ 7] ^= SWAP32LE(0xcfffffff)^r;
  x32[ 8] = ~x32[ 8];
  x32[ 9] ^= SWAP32LE(0xbfffffff)^r;
  x32[10] = ~x32[10];
  x32[11] ^= SWAP32LE(0xafffffff)^r;
  x32[12] = ~x32[12];
  x32[13] ^= SWAP32LE(0x9fffffff)^r;
  x32[14] = ~x32[14];
  x32[15] ^= SWAP32LE(0x8fffffff)^r;

  COLUMN(x,y, 0,  2,  6, 10, 14,  1,  5,  9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y, 2,  4,  8, 12,  0,  3,  7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y, 4,  6, 10, 14,  2,  5,  9, 13,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y, 6,  8, 12,  0,  4,  7, 11, 15,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y, 8, 10, 14,  2,  6,  9, 13,  1,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y,10, 12,  0,  4,  8, 11, 15,  3,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y,12, 14,  2,  6, 10, 13,  1,  5,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
  COLUMN(x,y,14,  0,  4,  8, 12, 15,  3,  7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
}

/* compute compression function (short variants) */
static void F512(uint32_t *h, const uint32_t *m) {
  int i;
  uint32_t Ptmp[2*COLS512];
  uint32_t Qtmp[2*COLS512];
  uint32_t y[2*COLS512];
  uint32_t z[2*COLS512];

  for (i = 0; i < 2*COLS512; i++) {
    z[i] = m[i];
    Ptmp[i] = h[i]^m[i];
  }

  /* compute Q(m) */
  RND512Q((uint8_t*)z, y, SWAP32LE(0x00000000));
  RND512Q((uint8_t*)y, z, SWAP32LE(0x01000000));
  RND512Q((uint8_t*)z, y, SWAP32LE(0x02000000));
  RND512Q((uint8_t*)y, z, SWAP32LE(0x03000000));
  RND512Q((uint8_t*)z, y, SWAP32LE(0x04000000));
  RND512Q((uint8_t*)y, z, SWAP32LE(0x05000000));
  RND512Q((uint8_t*)z, y, SWAP32LE(0x06000000));
  RND512Q((uint8_t*)y, z, SWAP32LE(0x07000000));
  RND512Q((uint8_t*)z, y, SWAP32LE(0x08000000));
  RND512Q((uint8_t*)y, Qtmp, SWAP32LE(0x09000000));

  /* compute P(h+m) */
  RND512P((uint8_t*)Ptmp, y, SWAP32LE(0x00000000));
  RND512P((uint8_t*)y, z, SWAP32LE(0x00000001));
  RND512P((uint8_t*)z, y, SWAP32LE(0x00000002));
  RND512P((uint8_t*)y, z, SWAP32LE(0x00000003));
  RND512P((uint8_t*)z, y, SWAP32LE(0x00000004));
  RND512P((uint8_t*)y, z, SWAP32LE(0x00000005));
  RND512P((uint8_t*)z, y, SWAP32LE(0x00000006));
  RND512P((uint8_t*)y, z, SWAP32LE(0x00000007));
  RND512P((uint8_t*)z, y, SWAP32LE(0x00000008));
  RND512P((uint8_t*)y, Ptmp, SWAP32LE(0x00000009));

  /* compute P(h+m) + Q(m) + h */
  for (i = 0; i < 2*COLS512; i++) {
    h[i] ^= Ptmp[i]^Qtmp[i];
  }
}


/* digest up to msglen bytes of input (full blocks only) */
static void Transform(hashState *ctx, 
	       const uint8_t *input, 
	       int msglen) {

  /* digest message, one block at a time */
  for (; msglen >= SIZE512; 
       msglen -= SIZE512, input += SIZE512) {
    F512(ctx->chaining,(uint32_t*)input);

    /* increment block counter */
    ctx->block_counter1++;
    if (ctx->block_counter1 == 0) ctx->block_counter2++;
  }
}

/* given state h, do h <- P(h)+h */
static void OutputTransformation(hashState *ctx) {
  int j;
  uint32_t temp[2*COLS512];
  uint32_t y[2*COLS512];
  uint32_t z[2*COLS512];



	for (j = 0; j < 2*COLS512; j++) {
	  temp[j] = ctx->chaining[j];
	}
	RND512P((uint8_t*)temp, y, SWAP32LE(0x00000000));
	RND512P((uint8_t*)y, z, SWAP32LE(0x00000001));
	RND512P((uint8_t*)z, y, SWAP32LE(0x00000002));
	RND512P((uint8_t*)y, z, SWAP32LE(0x00000003));
	RND512P((uint8_t*)z, y, SWAP32LE(0x00000004));
	RND512P((uint8_t*)y, z, SWAP32LE(0x00000005));
	RND512P((uint8_t*)z, y, SWAP32LE(0x00000006));
	RND512P((uint8_t*)y, z, SWAP32LE(0x00000007));
	RND512P((uint8_t*)z, y, SWAP32LE(0x00000008));
	RND512P((uint8_t*)y, temp, SWAP32LE(0x00000009));
	for (j = 0; j < 2*COLS512; j++) {
	  ctx->chaining[j] ^= temp[j];
	}									  
}

/* initialise context */
static void Init(hashState* ctx) {
  /* allocate memory for state and data buffer */

  for(size_t i = 0; i < (SIZE512/sizeof(uint32_t)); i++)
  {
	ctx->chaining[i] = 0;
  }

  /* set initial value */
  ctx->chaining[2*COLS512-1] = SWAP32LE(u32BIG((uint32_t)HASH_BIT_LEN));

  /* set other variables */
  ctx->buf_ptr = 0;
  ctx->block_counter1 = 0;
  ctx->block_counter2 = 0;
  ctx->bits_in_last_byte = 0;
}

/* update state with databitlen bits of input */
static void Update(hashState* ctx,
		  const BitSequence* input,
		  DataLength databitlen) {
  int index = 0;
  int msglen = (int)(databitlen/8);
  int rem = (int)(databitlen%8);

  /* if the buffer contains data that has not yet been digested, first
     add data to buffer until full */
  if (ctx->buf_ptr) {
    while (ctx->buf_ptr < SIZE512 && index < msglen) {
      ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
    }
    if (ctx->buf_ptr < SIZE512) {
      /* buffer still not full, return */
      if (rem) {
	ctx->bits_in_last_byte = rem;
	ctx->buffer[(int)ctx->buf_ptr++] = input[index];
      }
      return;
    }

    /* digest buffer */
    ctx->buf_ptr = 0;
    Transform(ctx, ctx->buffer, SIZE512);
  }

  /* digest bulk of message */
  Transform(ctx, input+index, msglen-index);
  index += ((msglen-index)/SIZE512)*SIZE512;

  /* store remaining data in buffer */
  while (index < msglen) {
    ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
  }

  /* if non-integral number of bytes have been supplied, store
     remaining bits in last byte, together with information about
     number of bits */
  if (rem) {
    ctx->bits_in_last_byte = rem;
    ctx->buffer[(int)ctx->buf_ptr++] = input[index];
  }
}

#define BILB ctx->bits_in_last_byte

/* finalise: process remaining data (including padding), perform
   output transformation, and write hash result to 'output' */
static void Final(hashState* ctx,
		 BitSequence* output) {
  int i, j = 0, hashbytelen = HASH_BIT_LEN/8;
  uint8_t *s = (BitSequence*)ctx->chaining;

  /* pad with '1'-bit and first few '0'-bits */
  if (BILB) {
    ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
    ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
    BILB = 0;
  }
  else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;

  /* pad with '0'-bits */
  if (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
    /* padding requires two blocks */
    while (ctx->buf_ptr < SIZE512) {
      ctx->buffer[(int)ctx->buf_ptr++] = 0;
    }
    /* digest first padding block */
    Transform(ctx, ctx->buffer, SIZE512);
    ctx->buf_ptr = 0;
  }
  while (ctx->buf_ptr < SIZE512-LENGTHFIELDLEN) {
    ctx->buffer[(int)ctx->buf_ptr++] = 0;
  }

  /* length padding */
  ctx->block_counter1++;
  if (ctx->block_counter1 == 0) ctx->block_counter2++;
  ctx->buf_ptr = SIZE512;

  while (ctx->buf_ptr > SIZE512-(int)sizeof(uint32_t)) {
    ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
    ctx->block_counter1 >>= 8;
  }
  while (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
    ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
    ctx->block_counter2 >>= 8;
  }
  /* digest final padding block */
  Transform(ctx, ctx->buffer, SIZE512); 
  /* perform output transformation */
  OutputTransformation(ctx);

  /* store hash result in output */
  for (i = SIZE512-hashbytelen; i < SIZE512; i++,j++) {
    output[j] = s[i];
  }

  /* zeroise relevant variables and deallocate memory */
  for (i = 0; i < COLS512; i++) {
    ctx->chaining[i] = 0;
  }
  for (i = 0; i < SIZE512; i++) {
    ctx->buffer[i] = 0;
  }
}

/* hash bit sequence */
void groestl(const BitSequence* data, 
		DataLength databitlen,
		BitSequence* hashval) {

  hashState context;

  /* initialise */
    Init(&context);


  /* process message */
  Update(&context, data, databitlen);

  /* finalise */
  Final(&context, hashval);
}
/*
static int crypto_hash(unsigned char *out,
		const unsigned char *in,
		unsigned long long len)
{
  groestl(in, 8*len, out);
  return 0;
}

*/