// sha256 calculation on i386 and amd64 processors.
/*
 *  FIPS-180-2 compliant SHA-256 implementation
 *
 *  Copyright (C) 2001-2003  Christophe Devine
 *  Copyright (C) 2010,2011  Guy Voncken <vogu00@gmail.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

// This file has initially been downloaded from
//     www.spale.com/download/scrypt/scrypt1.0/sha256.c
//
// The following optimisation and debugging work has been done by Guy Voncken:
//     - The CPU intensive parts have been rewritten in assembler for better
//       performance on i386 and amd64 machines.
//     - The code has been corrected for big endian machines. The corrections
//       have been tested on a PowerPC machine.


#include <string.h>
#include <netinet/in.h>

#include "sha256.h"

#if defined(__i386__) || defined (__amd64__)
   #define SHA256_USE_OPTIMISED_ASSEMBLER_CODE
#endif


// Guy: The original GET_UINT32 and PUT_UINT32 macros are
//   - too slow
//   - not working on powerpc
// I replaced them by the powerful htonl/ntohl functions.

/*
#define GET_UINT32(n,b,i)                       \
{                                               \
    (n) = ( (uint32) (b)[(i)    ] << 24 )       \
        | ( (uint32) (b)[(i) + 1] << 16 )       \
        | ( (uint32) (b)[(i) + 2] <<  8 )       \
        | ( (uint32) (b)[(i) + 3]       );      \
}

#define PUT_UINT32(n,b,i)                       \
{                                               \
    (b)[(i)    ] = (uint8) ( (n) >> 24 );       \
    (b)[(i) + 1] = (uint8) ( (n) >> 16 );       \
    (b)[(i) + 2] = (uint8) ( (n) >>  8 );       \
    (b)[(i) + 3] = (uint8) ( (n)       );       \
}
*/

/*
 * GET_UINT32(n,b,i): load the big-endian 32-bit word that starts at byte
 *                    offset i of byte buffer b into n (host byte order).
 * PUT_UINT32(n,b,i): store n (host byte order) as a big-endian 32-bit word
 *                    at byte offset i of byte buffer b.
 *
 * The bytes are moved with memcpy() rather than by dereferencing a casted
 * uint32 pointer: b + i is not guaranteed to be 4-byte aligned (sha256_update
 * passes caller-supplied pointers straight to sha256_process), and the cast
 * also breaks the strict-aliasing rule. Optimising compilers typically turn
 * a fixed 4-byte memcpy into a single load/store.
 *
 * Both macros expand to one statement (do/while(0)), so they are safe in
 * unbraced if/else bodies. The temporaries use long, file-specific names so
 * that they cannot shadow a caller's argument.
 */
#define GET_UINT32(n,b,i)                                              \
   do {                                                                \
      uint32 sha256_get_uint32_tmp_;                                   \
      memcpy (&sha256_get_uint32_tmp_, &(b)[(i)],                      \
              sizeof (sha256_get_uint32_tmp_));                        \
      (n) = ntohl (sha256_get_uint32_tmp_);                            \
   } while (0)

#define PUT_UINT32(n,b,i)                                              \
   do {                                                                \
      uint32 sha256_put_uint32_tmp_ = htonl ((n));                     \
      memcpy (&(b)[(i)], &sha256_put_uint32_tmp_,                      \
              sizeof (sha256_put_uint32_tmp_));                        \
   } while (0)

/*
 * Reset a context so that a new message can be hashed: the byte counter
 * (total[0], total[1]) is cleared and the eight state words are loaded with
 * the SHA-256 initial hash value.
 */
void sha256_starts( sha256_context *ctx )
{
    /* Initial hash value, one entry per state word. */
    static const unsigned long initial_state[8] =
    {
        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
    };
    unsigned int k;

    ctx->total[1] = 0;
    ctx->total[0] = 0;

    for( k = 0; k < 8; k++ )
    {
        ctx->state[k] = initial_state[k];
    }
}


#ifdef SHA256_USE_OPTIMISED_ASSEMBLER_CODE

   /*
    * P(a,b,c,d,e,f,g,h,x,K): one SHA-256 round, written as GCC extended
    * inline assembly. It performs
    *     temp1 = h + S3(e) + F1(e,f,g) + K + x
    *     temp2 = S2(a) + F0(a,b,c)
    *     d += temp1;  h = temp1 + temp2
    * a..h and x are accessed as memory operands ("m"). K is pasted into the
    * instruction text with #K (as the displacement of the leal), so it must be
    * a literal constant; the "i"(K) operand (%9) itself is not referenced by
    * the template. In the legend rows below, "$N" names asm operand %N.
    * eax, ebx, ecx and edx are scratch registers and are listed as clobbers,
    * together with the condition codes and memory.
    * NOTE(review): d and h are read-modify-write operands, expressed here as
    * "=m" outputs plus matching "0"/"1" inputs; "+m" may be the more
    * conventional spelling -- confirm against the GCC extended-asm
    * documentation before changing it.
    */
   #define P(a,b,c,d,e,f,g,h,x,K)                                                                       \
       __asm__  __volatile__ (                                                                          \
                /* ------------------------------------------------------------------------------- */   \
                /* h + S3(e)                                    +   F1(e,f,g)             + K + x  */   \
                /* h + (ROTR(e, 6) ^ ROTR(e,11) ^ ROTR(e,25))   +   (g ^ (e & (f ^ g)))   + K + x  */   \
                /*           $5            $5           $5           $7   $5   $6  $7       $9  $8 */   \
                /* ------------------------------------------------------------------------------- */   \
                                                                                                        \
                "movl %5,%%ebx;"                                      \
                "movl %%ebx,%%edx;"                                   \
                "rorl $6,%%ebx;"     /* ebx = ROTR(e, 6) */           \
                "movl %%ebx,%%eax;"                                   \
                "rorl $5,%%eax;"     /* eax = ROTR(e,11) */           \
                "xorl %%ebx,%%eax;"                                   \
                "rorl $19,%%ebx;"    /* ebx = ROTR(e,25) */           \
                "xorl %%ebx,%%eax;"  /* eax = S3(); edx = e */        \
                                                                      \
                "movl %7,%%ebx;"                                      \
                "movl %%ebx,%%ecx;"                                   \
                "xorl %6,%%ecx;"                                      \
                "andl %%edx,%%ecx;"                                   \
                "xorl %%ebx,%%ecx;"   /* ecx = F1() */                \
                                                                      \
                "leal " #K "(%%ecx,%%eax,1),%%eax;" /* K + F1 + S3 */ \
                "addl %1,%%eax;"    /*  + h */                        \
                "addl %8,%%eax;"    /*  + x  => eax = temp1 */        \
                "addl %%eax, %0;"   /*  d += temp1; */                \
                                                                      \
                /* ----------------------------------------------------------------------- */   \
                /* S2(a)                                    +   F0(a,b,c);                 */   \
                /* (ROTR(a, 2) ^ ROTR(a,13) ^ ROTR(a,22))   +   ((a & b) | (c & (a | b)))  */   \
                /* ----------------------------------------------------------------------- */   \
                                                                      \
                "movl %2,%%ebx;"                                      \
                "movl %3,%%ecx;"                                      \
                "movl %%ecx,%%edx;"                                   \
                "and  %%ebx,%%ecx;"  /* ecx = (a & b) */              \
                "or   %%ebx,%%edx;"  /* edx = (a | b) */              \
                "and  %4,%%edx;"                                      \
                "or   %%edx,%%ecx;"  /* ecx = F0(); ebx = a */        \
                                                                      \
                "rorl $2,%%ebx;"                                      \
                "movl %%ebx,%%edx;"                                   \
                "rorl $11,%%ebx;"                                     \
                "xorl %%ebx,%%edx;"                                   \
                "rorl $9,%%ebx;"                                      \
                "xorl %%ebx,%%edx;"  /* edx = S2() */                 \
                                                                      \
                "addl %%edx,%%ecx;"  /* ecx = temp2 */                \
                                                                      \
                "addl %%eax,%%ecx;"  /* h = temp1 + temp2; */         \
                "movl %%ecx, %1;"                                     \
            :"=m"(d), "=m"(h)                                                                   \
            :"m"(a), "m"(b), "m"(c), "m"(e), "m"(f), "m"(g), "m"(x), "i"(K), "0"(d), "1"(h)     \
            /*   2       3       4       5       6       7       8       9       0       1 */   \
                                                                                                \
            :"%eax", "%ebx", "%ecx", "%edx", "%cc", "memory")


   #if defined(__i386__)
      /*
       * PR(a,b,c,d,e,f,g,h,i,K), i386 version: message-schedule step plus one
       * SHA-256 round in a single asm statement.
       *   1. W[i] = S1(W[i-2]) + W[i-7] + S0(W[i-15]) + W[i-16] is computed
       *      through the pointer pW (operand %10, loaded into edx) and
       *      written back to W[i]; the value stays in ecx.
       *   2. The round is then identical to P(), with x = W[i] taken from ecx
       *      instead of from a memory operand.
       * i and K are pasted into the instruction text with #i / #K, so both
       * must be literal constants; the "i"(i) and "i"(K) operands (%8, %9)
       * are not referenced by the template. The macro relies on a variable
       * named pW being in scope at the point of use.
       * eax, ebx, ecx and edx are scratch registers and are listed as clobbers.
       */
      #define PR(a,b,c,d,e,f,g,h,i,K)                                         \
          __asm__  __volatile__ (                                             \
                                                                              \
                   /* ----------------------------------- */                  \
                   /* W[t] = S1(W[t -  2]) + W[t -  7] +  */                  \
                   /*        S0(W[t - 15]) + W[t - 16]    */                  \
                   /* ----------------------------------- */                  \
                                                                              \
                   "movl %10,%%edx;"                                          \
                   "movl 4*(" #i "- 7)(%%edx),%%ecx;"  /* ecx used for sum */ \
                   "addl 4*(" #i "-16)(%%edx),%%ecx;"                         \
                                                                              \
                   "movl 4*(" #i "- 2)(%%edx),%%ebx;"                         \
                   "movl %%ebx,%%eax;"                                        \
                   "shrl $10,%%eax;"                                          \
                   "rorl $17,%%ebx;"                                          \
                   "xorl %%ebx,%%eax;"                                        \
                   "rorl $2,%%ebx;"                                           \
                   "xorl %%ebx,%%eax;"  /* eax = S1() */                      \
                   "addl %%eax,%%ecx;"                                        \
                                                                              \
                   "movl 4*(" #i "-15)(%%edx),%%ebx;"                         \
                   "movl %%ebx,%%eax;"                                        \
                   "shrl $3,%%eax;"                                           \
                   "rorl $7,%%ebx;"                                           \
                   "xorl %%ebx,%%eax;"                                        \
                   "rorl $11,%%ebx;"                                          \
                   "xorl %%ebx,%%eax;"  /* eax = S0() */                      \
                   "addl %%eax,%%ecx;"                                        \
                                                                              \
                   "movl %%ecx, 4*(" #i ")(%%edx);" /* Write result to W[t], keep ecx for later */         \
                                                                                                           \
                   /* ------------------------------------------------------------------------------- */   \
                   /* h + S3(e)                                    +   F1(e,f,g)             + K + x  */   \
                   /* h + (ROTR(e, 6) ^ ROTR(e,11) ^ ROTR(e,25))   +   (g ^ (e & (f ^ g)))   + K + x  */   \
                   /*           $5            $5           $5           $7   $5   $6  $7       $9 ecx */   \
                   /* ------------------------------------------------------------------------------- */   \
                                                                                                           \
                   "movl %5,%%ebx;"                                      \
                   "movl %%ebx,%%edx;"                                   \
                   "rorl $6,%%ebx;"                                      \
                   "movl %%ebx,%%eax;"                                   \
                   "rorl $5,%%eax;"                                      \
                   "xorl %%ebx,%%eax;"                                   \
                   "rorl $19,%%ebx;"                                     \
                   "xorl %%ebx,%%eax;"  /* eax = S3(); edx = e */        \
                   "addl %%ecx,%%eax;"  /* Add R(t)  */                  \
                                                                         \
                   "movl %7,%%ebx;"                                      \
                   "movl %%ebx,%%ecx;"                                   \
                   "xorl %6,%%ecx;"                                      \
                   "andl %%edx,%%ecx;"                                   \
                   "xorl %%ebx,%%ecx;"   /* ecx = F1() */                \
                                                                         \
                   "leal " #K "(%%ecx,%%eax,1),%%eax;"                   \
                   "addl %1,%%eax;"                                      \
                   /* no "addl %8" here: x was already added from ecx */ \
                   "addl %%eax, %0;"   /*  d += temp1; */                \
                                                                         \
                   /* ----------------------------------------------------------------------- */   \
                   /* S2(a)                                    +   F0(a,b,c);                 */   \
                   /* (ROTR(a, 2) ^ ROTR(a,13) ^ ROTR(a,22))   +   ((a & b) | (c & (a | b)))  */   \
                   /* ----------------------------------------------------------------------- */   \
                                                                         \
                   "movl %2,%%ebx;"                                      \
                   "movl %3,%%ecx;"                                      \
                   "movl %%ecx,%%edx;"                                   \
                   "and  %%ebx,%%ecx;"  /* ecx = (a & b) */              \
                   "or   %%ebx,%%edx;"  /* edx = (a | b) */              \
                   "and  %4,%%edx;"                                      \
                   "or   %%edx,%%ecx;"  /* ecx = F0(); ebx = a */        \
                                                                         \
                   "rorl $2,%%ebx;"                                      \
                   "movl %%ebx,%%edx;"                                   \
                   "rorl $11,%%ebx;"                                     \
                   "xorl %%ebx,%%edx;"                                   \
                   "rorl $9,%%ebx;"                                      \
                   "xorl %%ebx,%%edx;"  /* edx = S2() */                 \
                                                                         \
                   "addl %%edx,%%ecx;"  /* ecx = temp2 */                \
                                                                         \
                   "addl %%eax,%%ecx;"  /* h = temp1 + temp2; */         \
                   "movl %%ecx, %1;"                                     \
               :"=m"(d), "=m"(h)                                                                            \
               :"m"(a), "m"(b), "m"(c), "m"(e), "m"(f), "m"(g), "i"(i), "i"(K), "m"(pW), "0"(d), "1"(h)     \
               /*   2       3       4       5       6       7       8       9       10       0       1 */   \
                                                                                                            \
               :"%eax", "%ebx", "%ecx", "%edx", "%cc", "memory")

   #elif defined(__amd64__)

      /*
       * PR(a,b,c,d,e,f,g,h,i,K), amd64 version: message-schedule step plus one
       * SHA-256 round in a single asm statement.
       *   1. W[i] = S1(W[i-2]) + W[i-7] + S0(W[i-15]) + W[i-16] is computed
       *      through the pointer pW (operand %10, loaded into rdx with a
       *      64-bit movq) and written back to W[i]; the value stays in ecx.
       *   2. The round is then identical to P(), with x = W[i] taken from ecx
       *      instead of from a memory operand.
       * i and K are pasted into the instruction text with #i / #K, so both
       * must be literal constants; the "i"(i) and "i"(K) operands (%8, %9)
       * are not referenced by the template. The macro relies on a variable
       * named pW being in scope at the point of use.
       * All arithmetic uses the 32-bit register halves; the clobber list names
       * eax, ebx, ecx, edx and additionally rdx.
       */
      #define PR(a,b,c,d,e,f,g,h,i,K)                                         \
          __asm__  __volatile__ (                                             \
                                                                              \
                   /* ----------------------------------- */                  \
                   /* W[t] = S1(W[t -  2]) + W[t -  7] +  */                  \
                   /*        S0(W[t - 15]) + W[t - 16]    */                  \
                   /* ----------------------------------- */                  \
                                                                              \
                   "movq %10,%%rdx;"                                          \
                   "movl 4*(" #i "- 7)(%%rdx),%%ecx;"  /* ecx used for sum */ \
                   "addl 4*(" #i "-16)(%%rdx),%%ecx;"                         \
                                                                              \
                   "movl 4*(" #i "- 2)(%%rdx),%%ebx;"                         \
                   "movl %%ebx,%%eax;"                                        \
                   "shrl $10,%%eax;"                                          \
                   "rorl $17,%%ebx;"                                          \
                   "xorl %%ebx,%%eax;"                                        \
                   "rorl $2,%%ebx;"                                           \
                   "xorl %%ebx,%%eax;"  /* eax = S1() */                      \
                   "addl %%eax,%%ecx;"                                        \
                                                                              \
                   "movl 4*(" #i "-15)(%%rdx),%%ebx;"                         \
                   "movl %%ebx,%%eax;"                                        \
                   "shrl $3,%%eax;"                                           \
                   "rorl $7,%%ebx;"                                           \
                   "xorl %%ebx,%%eax;"                                        \
                   "rorl $11,%%ebx;"                                          \
                   "xorl %%ebx,%%eax;"  /* eax = S0() */                      \
                   "addl %%eax,%%ecx;"                                        \
                                                                              \
                   "movl %%ecx, 4*(" #i ")(%%rdx);" /* Write result to W[t], keep ecx for later */         \
                                                                                                           \
                   /* ------------------------------------------------------------------------------- */   \
                   /* h + S3(e)                                    +   F1(e,f,g)             + K + x  */   \
                   /* h + (ROTR(e, 6) ^ ROTR(e,11) ^ ROTR(e,25))   +   (g ^ (e & (f ^ g)))   + K + x  */   \
                   /*           $5            $5           $5           $7   $5   $6  $7       $9 ecx */   \
                   /* ------------------------------------------------------------------------------- */   \
                                                                                                           \
                   "movl %5,%%ebx;"                                      \
                   "movl %%ebx,%%edx;"                                   \
                   "rorl $6,%%ebx;"                                      \
                   "movl %%ebx,%%eax;"                                   \
                   "rorl $5,%%eax;"                                      \
                   "xorl %%ebx,%%eax;"                                   \
                   "rorl $19,%%ebx;"                                     \
                   "xorl %%ebx,%%eax;"  /* eax = S3(); edx = e */        \
                   "addl %%ecx,%%eax;"  /* Add R(t)  */                  \
                                                                         \
                   "movl %7,%%ebx;"                                      \
                   "movl %%ebx,%%ecx;"                                   \
                   "xorl %6,%%ecx;"                                      \
                   "andl %%edx,%%ecx;"                                   \
                   "xorl %%ebx,%%ecx;"   /* ecx = F1() */                \
                                                                         \
                   "leal " #K "(%%ecx,%%eax,1),%%eax;"                   \
                   "addl %1,%%eax;"                                      \
                   /* no "addl %8" here: x was already added from ecx */ \
                   "addl %%eax, %0;"   /*  d += temp1; */                \
                                                                         \
                   /* ----------------------------------------------------------------------- */   \
                   /* S2(a)                                    +   F0(a,b,c);                 */   \
                   /* (ROTR(a, 2) ^ ROTR(a,13) ^ ROTR(a,22))   +   ((a & b) | (c & (a | b)))  */   \
                   /* ----------------------------------------------------------------------- */   \
                                                                         \
                   "movl %2,%%ebx;"                                      \
                   "movl %3,%%ecx;"                                      \
                   "movl %%ecx,%%edx;"                                   \
                   "and  %%ebx,%%ecx;"  /* ecx = (a & b) */              \
                   "or   %%ebx,%%edx;"  /* edx = (a | b) */              \
                   "and  %4,%%edx;"                                      \
                   "or   %%edx,%%ecx;"  /* ecx = F0(); ebx = a */        \
                                                                         \
                   "rorl $2,%%ebx;"                                      \
                   "movl %%ebx,%%edx;"                                   \
                   "rorl $11,%%ebx;"                                     \
                   "xorl %%ebx,%%edx;"                                   \
                   "rorl $9,%%ebx;"                                      \
                   "xorl %%ebx,%%edx;"  /* edx = S2() */                 \
                                                                         \
                   "addl %%edx,%%ecx;"  /* ecx = temp2 */                \
                                                                         \
                   "addl %%eax,%%ecx;"  /* h = temp1 + temp2; */         \
                   "movl %%ecx, %1;"                                     \
               :"=m"(d), "=m"(h)                                                                            \
               :"m"(a), "m"(b), "m"(c), "m"(e), "m"(f), "m"(g), "i"(i), "i"(K), "m"(pW), "0"(d), "1"(h)     \
               /*   2       3       4       5       6       7       8       9       10       0       1 */   \
                                                                                                            \
               :"%eax", "%ebx", "%ecx", "%edx", "%rdx", "%cc", "memory")
   #else
      #error "Processor architecture not supported by sha256 inline assembly optimisation"
   #endif
#else
// Portable C versions of the round macros, used when the inline assembly
// above is not available.
//
// Historical note: SHR once masked its argument with 0xFFFFFFFF. That was only
// necessary while uint32 in sha256.h was mistakenly defined as long, which
// produced 64-bit variables on amd64; with a real 32-bit type a plain shift
// is sufficient.
//
// Every macro argument is parenthesised so that expression arguments expand
// correctly, and P is wrapped in do/while(0) so that it behaves as a single
// statement (e.g. inside an unbraced if/else).

   /* Logical shift right / rotate right of a 32-bit value by n bits. */
   #define  SHR(x,n) ((x) >> (n))
   #define ROTR(x,n) (SHR(x,n) | ((x) << (32 - (n))))

   /* Message-schedule functions. */
   #define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^  SHR(x, 3))
   #define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^  SHR(x,10))

   /* Round functions applied to the working variables a and e. */
   #define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
   #define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))

   /* F0: bitwise majority of x, y, z.  F1: bitwise "x ? y : z". */
   #define F0(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
   #define F1(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))

   /* Extend the message schedule: compute W[t], store it, and yield it.
      Requires an array named W in scope. */
   #define R(t)                                         \
   (                                                    \
       W[(t)] = S1(W[(t) -  2]) + W[(t) -  7] +         \
                S0(W[(t) - 15]) + W[(t) - 16]           \
   )

   /* One round: updates d and h in place. Requires the scratch variables
      temp1 and temp2 in scope. x is evaluated exactly once, which matters
      because PR passes R(i), an expression with a side effect. */
   #define P(a,b,c,d,e,f,g,h,x,K)                       \
   do {                                                 \
       temp1 = (h) + S3(e) + F1(e,f,g) + (K) + (x);     \
       temp2 = S2(a) + F0(a,b,c);                       \
       (d) += temp1; (h) = temp1 + temp2;               \
   } while (0)

   /* Round for t >= 16: same as P with x = freshly computed W[i]. */
   #define PR(a,b,c,d,e,f,g,h,i,K)                 \
      P(a,b,c,d,e,f,g,h,R(i),K)


#endif

/*
 * Run the SHA-256 compression function over one 64-byte block.
 *
 *   ctx   context whose state[0..7] is read and then updated in place
 *   data  the 64 input bytes of the block
 *
 * The 16 message words are loaded from data with GET_UINT32, the working
 * variables A..H are initialised from ctx->state, 64 rounds are executed
 * (16 x P on W[0..15], 48 x PR which also extend W[16..63]), and finally
 * A..H are added back into ctx->state.
 *
 * The round calls must stay literal and in this exact order: the assembler
 * versions of P/PR paste the round index and the constant into the
 * instruction text, and the rotation of the A..H arguments from one round
 * to the next replaces an explicit shuffle of the working variables.
 */
void sha256_process( sha256_context *ctx, uint8 data[64] )
{
    /* temp1/temp2 are scratch variables used only by the C versions of P/PR. */
    #ifndef SHA256_USE_OPTIMISED_ASSEMBLER_CODE
       uint32 temp1, temp2;
    #endif
    uint32 W[64];                    /* message schedule */
    uint32 A, B, C, D, E, F, G, H;   /* working variables */
    uint32 *pW = &W[0];              /* referenced by name inside the assembler PR macros */

    /* Load the block as sixteen 32-bit words. */
    GET_UINT32( W[0],  data,  0 );
    GET_UINT32( W[1],  data,  4 );
    GET_UINT32( W[2],  data,  8 );
    GET_UINT32( W[3],  data, 12 );
    GET_UINT32( W[4],  data, 16 );
    GET_UINT32( W[5],  data, 20 );
    GET_UINT32( W[6],  data, 24 );
    GET_UINT32( W[7],  data, 28 );
    GET_UINT32( W[8],  data, 32 );
    GET_UINT32( W[9],  data, 36 );
    GET_UINT32( W[10], data, 40 );
    GET_UINT32( W[11], data, 44 );
    GET_UINT32( W[12], data, 48 );
    GET_UINT32( W[13], data, 52 );
    GET_UINT32( W[14], data, 56 );
    GET_UINT32( W[15], data, 60 );

    /* Start from the current chaining value. */
    A = ctx->state[0];
    B = ctx->state[1];
    C = ctx->state[2];
    D = ctx->state[3];
    E = ctx->state[4];
    F = ctx->state[5];
    G = ctx->state[6];
    H = ctx->state[7];

    /* Rounds 0..15 use the loaded words directly; rounds 16..63 compute W[t]
       on the fly. Constants >= 0x80000000 are written as the equivalent
       negative value (e.g. -0x4A3F0431 == 0xB5C0FBCF modulo 2^32) --
       presumably so that they are valid signed 32-bit displacements for the
       leal instruction in the assembler macros; TODO confirm. */
    P( A, B, C, D, E, F, G, H, W[ 0],  0x428A2F98 );
    P( H, A, B, C, D, E, F, G, W[ 1],  0x71374491 );
    P( G, H, A, B, C, D, E, F, W[ 2], -0x4A3F0431 );
    P( F, G, H, A, B, C, D, E, W[ 3], -0x164A245B );
    P( E, F, G, H, A, B, C, D, W[ 4],  0x3956C25B );
    P( D, E, F, G, H, A, B, C, W[ 5],  0x59F111F1 );
    P( C, D, E, F, G, H, A, B, W[ 6], -0x6DC07D5C );
    P( B, C, D, E, F, G, H, A, W[ 7], -0x54E3A12B );
    P( A, B, C, D, E, F, G, H, W[ 8], -0x27F85568 );
    P( H, A, B, C, D, E, F, G, W[ 9],  0x12835B01 );
    P( G, H, A, B, C, D, E, F, W[10],  0x243185BE );
    P( F, G, H, A, B, C, D, E, W[11],  0x550C7DC3 );
    P( E, F, G, H, A, B, C, D, W[12],  0x72BE5D74 );
    P( D, E, F, G, H, A, B, C, W[13], -0x7F214E02 );
    P( C, D, E, F, G, H, A, B, W[14], -0x6423F959 );
    P( B, C, D, E, F, G, H, A, W[15], -0x3E640E8C );
    PR(A, B, C, D, E, F, G, H,   16 , -0x1B64963F );
    PR(H, A, B, C, D, E, F, G,   17 , -0x1041B87A );
    PR(G, H, A, B, C, D, E, F,   18 ,  0x0FC19DC6 );
    PR(F, G, H, A, B, C, D, E,   19 ,  0x240CA1CC );
    PR(E, F, G, H, A, B, C, D,   20 ,  0x2DE92C6F );
    PR(D, E, F, G, H, A, B, C,   21 ,  0x4A7484AA );
    PR(C, D, E, F, G, H, A, B,   22 ,  0x5CB0A9DC );
    PR(B, C, D, E, F, G, H, A,   23 ,  0x76F988DA );
    PR(A, B, C, D, E, F, G, H,   24 , -0x67C1AEAE );
    PR(H, A, B, C, D, E, F, G,   25 , -0x57CE3993 );
    PR(G, H, A, B, C, D, E, F,   26 , -0x4FFCD838 );
    PR(F, G, H, A, B, C, D, E,   27 , -0x40A68039 );
    PR(E, F, G, H, A, B, C, D,   28 , -0x391FF40D );
    PR(D, E, F, G, H, A, B, C,   29 , -0x2A586EB9 );
    PR(C, D, E, F, G, H, A, B,   30 ,  0x06CA6351 );
    PR(B, C, D, E, F, G, H, A,   31 ,  0x14292967 );
    PR(A, B, C, D, E, F, G, H,   32 ,  0x27B70A85 );
    PR(H, A, B, C, D, E, F, G,   33 ,  0x2E1B2138 );
    PR(G, H, A, B, C, D, E, F,   34 ,  0x4D2C6DFC );
    PR(F, G, H, A, B, C, D, E,   35 ,  0x53380D13 );
    PR(E, F, G, H, A, B, C, D,   36 ,  0x650A7354 );
    PR(D, E, F, G, H, A, B, C,   37 ,  0x766A0ABB );
    PR(C, D, E, F, G, H, A, B,   38 , -0x7E3D36D2 );
    PR(B, C, D, E, F, G, H, A,   39 , -0x6D8DD37B );
    PR(A, B, C, D, E, F, G, H,   40 , -0x5D40175F );
    PR(H, A, B, C, D, E, F, G,   41 , -0x57E599B5 );
    PR(G, H, A, B, C, D, E, F,   42 , -0x3DB47490 );
    PR(F, G, H, A, B, C, D, E,   43 , -0x3893AE5D );
    PR(E, F, G, H, A, B, C, D,   44 , -0x2E6D17E7 );
    PR(D, E, F, G, H, A, B, C,   45 , -0x2966F9DC );
    PR(C, D, E, F, G, H, A, B,   46 , -0x0BF1CA7B );
    PR(B, C, D, E, F, G, H, A,   47 ,  0x106AA070 );
    PR(A, B, C, D, E, F, G, H,   48 ,  0x19A4C116 );
    PR(H, A, B, C, D, E, F, G,   49 ,  0x1E376C08 );
    PR(G, H, A, B, C, D, E, F,   50 ,  0x2748774C );
    PR(F, G, H, A, B, C, D, E,   51 ,  0x34B0BCB5 );
    PR(E, F, G, H, A, B, C, D,   52 ,  0x391C0CB3 );
    PR(D, E, F, G, H, A, B, C,   53 ,  0x4ED8AA4A );
    PR(C, D, E, F, G, H, A, B,   54 ,  0x5B9CCA4F );
    PR(B, C, D, E, F, G, H, A,   55 ,  0x682E6FF3 );
    PR(A, B, C, D, E, F, G, H,   56 ,  0x748F82EE );
    PR(H, A, B, C, D, E, F, G,   57 ,  0x78A5636F );
    PR(G, H, A, B, C, D, E, F,   58 , -0x7B3787EC );
    PR(F, G, H, A, B, C, D, E,   59 , -0x7338FDF8 );
    PR(E, F, G, H, A, B, C, D,   60 , -0x6F410006 );
    PR(D, E, F, G, H, A, B, C,   61 , -0x5BAF9315 );
    PR(C, D, E, F, G, H, A, B,   62 , -0x41065C09 );
    PR(B, C, D, E, F, G, H, A,   63 , -0x398E870E );

    /* Fold the result of this block into the chaining value. */
    ctx->state[0] += A;
    ctx->state[1] += B;
    ctx->state[2] += C;
    ctx->state[3] += D;
    ctx->state[4] += E;
    ctx->state[5] += F;
    ctx->state[6] += G;
    ctx->state[7] += H;
}

/*
 * Feed `length` bytes starting at `input` into the hash.
 *
 * Complete 64-byte blocks are passed to sha256_process(); any remainder is
 * kept in ctx->buffer until more data (or the final padding) arrives.
 * ctx->total[0] / ctx->total[1] hold the low / high word of the number of
 * bytes hashed so far. A zero length is a no-op.
 */
void sha256_update( sha256_context *ctx, uint8 *input, uint32 length )
{
    uint32 buffered;   /* bytes already waiting in ctx->buffer      */
    uint32 room;       /* bytes needed to complete the buffered block */

    if( length == 0 )
    {
        return;
    }

    buffered = ctx->total[0] & 0x3F;
    room     = 64 - buffered;

    /* Advance the byte counter; a wrap of the low word carries into the high word. */
    ctx->total[0] += length;
    ctx->total[0] &= 0xFFFFFFFF;
    if( ctx->total[0] < length )
    {
        ctx->total[1]++;
    }

    /* First top up and flush a partially filled buffer, if the input suffices. */
    if( buffered != 0 && length >= room )
    {
        memcpy( ctx->buffer + buffered, input, room );
        sha256_process( ctx, ctx->buffer );
        input   += room;
        length  -= room;
        buffered = 0;
    }

    /* Hash all remaining complete blocks directly from the caller's memory. */
    for( ; length >= 64; length -= 64, input += 64 )
    {
        sha256_process( ctx, input );
    }

    /* Keep the tail for the next call. */
    if( length != 0 )
    {
        memcpy( ctx->buffer + buffered, input, length );
    }
}

/* Padding source for sha256_finish: a single 0x80 byte followed by 63 zero
   bytes (the elements after the first are zero-initialised implicitly). */
static uint8 sha256_padding[64] = { 0x80 };

/*
 * Finish the hash and write the 32-byte digest.
 *
 * The message length in bits is derived from the byte counter before any
 * padding is added, then the message is padded so that it ends 8 bytes short
 * of a 64-byte boundary, the 8-byte length is appended, and the eight state
 * words are written to `digest` with PUT_UINT32.
 */
void sha256_finish( sha256_context *ctx, uint8 digest[32] )
{
    uint8  msglen[8];
    uint32 bits_lo, bits_hi;
    uint32 tail, padn;
    int    w;

    /* Byte count * 8, split over two 32-bit words. */
    bits_lo = ( ctx->total[0] <<  3 );
    bits_hi = ( ctx->total[1] <<  3 )
            | ( ctx->total[0] >> 29 );

    PUT_UINT32( bits_hi, msglen, 0 );
    PUT_UINT32( bits_lo, msglen, 4 );

    /* Pad up to offset 56 of the current block, or of the next one if the
       current block is already filled past that point. */
    tail = ctx->total[0] & 0x3F;
    if( tail < 56 )
    {
        padn = 56 - tail;
    }
    else
    {
        padn = 120 - tail;
    }

    sha256_update( ctx, sha256_padding, padn );
    sha256_update( ctx, msglen, 8 );

    for( w = 0; w < 8; w++ )
    {
        PUT_UINT32( ctx->state[w], digest, 4 * w );
    }
}

#ifdef TEST

#include <stdlib.h>
#include <stdio.h>

/*
 * These are the standard FIPS-180-2 test vectors.
 */

/* Input messages of the first two self-tests. The third slot is NULL because
   the third test's input (one million 'a' characters) is generated in main(). */
static const char *msg[3] =
{
    [0] = "abc",
    [1] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
    [2] = NULL
};

/* Expected digests of the three self-tests, as 64 lowercase hex characters
   each (written as two adjacent string literals per entry). */
static const char *val[3] =
{
    [0] = "ba7816bf8f01cfea414140de5dae2223"
          "b00361a396177a9cb410ff61f20015ad",
    [1] = "248d6a61d20638b8e5c026930c3e6039"
          "a33ce45964ff2167f6ecedd419db06c1",
    [2] = "cdc76e5c9914fb9281a1c7e284d73e67"
          "f1809a48a497200e046d39ccc7112cd0"
};

/*
 * Self-test and command-line driver (only compiled when TEST is defined).
 *
 *   no argument : runs the three validation tests from msg[]/val[] and
 *                 returns 1 on the first mismatch, 0 if all pass.
 *   argv[1]     : hashes the named file and prints "<hex digest>  <name>".
 *                 Returns 1 if the file cannot be opened or a read error
 *                 occurs, 0 otherwise.
 */
int main( int argc, char *argv[] )
{
    FILE *f;
    int i, j;
    size_t got;                 /* byte count returned by fread() */
    char output[65];
    sha256_context ctx;
    unsigned char buf[65536];
    unsigned char sha256sum[32];

    if( argc < 2 )
    {
        printf( "\n SHA-256 Validation Tests:\n\n" );

        for( i = 0; i < 3; i++ )
        {
            printf( " Test %d ", i + 1 );

            sha256_starts( &ctx );

            if( i < 2 )
            {
                sha256_update( &ctx, (uint8 *) msg[i],
                               strlen( msg[i] ) );
            }
            else
            {
                /* Third vector: one million 'a' characters, fed in 1000 chunks. */
                memset( buf, 'a', 1000 );

                for( j = 0; j < 1000; j++ )
                {
                    sha256_update( &ctx, (uint8 *) buf, 1000 );
                }
            }

            sha256_finish( &ctx, sha256sum );

            /* Each sprintf writes two hex digits plus a NUL; the last one
               ends at output[64], which is why output has 65 bytes. */
            for( j = 0; j < 32; j++ )
            {
                sprintf( output + j * 2, "%02x", sha256sum[j] );
            }

            if( memcmp( output, val[i], 64 ) )
            {
                printf( "failed!\n" );
                return( 1 );
            }

            printf( "passed.\n" );
        }

        printf( "\n" );
    }
    else
    {
        if( ! ( f = fopen( argv[1], "rb" ) ) )
        {
            perror( "fopen" );
            return( 1 );
        }

        sha256_starts( &ctx );

        /* fread() returns size_t; got never exceeds sizeof( buf ), so it
           converts to the length parameter of sha256_update without loss. */
        while( ( got = fread( buf, 1, sizeof( buf ), f ) ) > 0 )
        {
            sha256_update( &ctx, buf, got );
        }

        /* A short read can mean EOF or an error; do not print a digest of
           partial data in the error case. */
        if( ferror( f ) )
        {
            perror( "fread" );
            fclose( f );
            return( 1 );
        }

        fclose( f );

        sha256_finish( &ctx, sha256sum );

        for( j = 0; j < 32; j++ )
        {
            printf( "%02x", sha256sum[j] );
        }

        printf( "  %s\n", argv[1] );
    }

    return( 0 );
}

#endif
