[openssl-commits] [openssl] master update

Thu Mar 31 19:36:48 UTC 2016

The branch master has been updated
       via  1fab06a665333d4afdd2bc46a1ab787fd473ec49 (commit)
      from  349232d149804dae09987af7612000e30ee7c4cc (commit)


- Log -----------------------------------------------------------------
commit 1fab06a665333d4afdd2bc46a1ab787fd473ec49
Author: Andy Polyakov <appro at openssl.org>
Date:   Sun Mar 13 22:19:53 2016 +0100

    crypto/blake2: make lowest-level function handle multiple blocks..
    
    This minimizes inter-block overhead. Performance gain naturally
    varies from case to case, up to 10% was spotted so far. There is
    one thing to recognize, given same circumstances gain would be
    higher faster computational part is. Or in other words biggest
    improvement coefficient would have been observed with assembly.
    
    Reviewed-by: Emilia Käsper <emilia at openssl.org>
    Reviewed-by: Rich Salz <rsalz at openssl.org>

-----------------------------------------------------------------------

Summary of changes:
 crypto/blake2/blake2_impl.h |  53 ++++++------
 crypto/blake2/blake2b.c     | 197 ++++++++++++++++++++++++++++----------------
 crypto/blake2/blake2s.c     | 193 +++++++++++++++++++++++++++----------------
 test/evptests.txt           |  16 ++++
 4 files changed, 288 insertions(+), 171 deletions(-)

diff --git a/crypto/blake2/blake2_impl.h b/crypto/blake2/blake2_impl.h
index c613abd..335a383 100644
--- a/crypto/blake2/blake2_impl.h
+++ b/crypto/blake2/blake2_impl.h
@@ -30,10 +30,10 @@ static ossl_inline uint32_t load32(const uint8_t *src)
         memcpy(&w, src, sizeof(w));
         return w;
     } else {
-        uint32_t w = *src++;
-        w |= (uint32_t)(*src++) <<  8;
-        w |= (uint32_t)(*src++) << 16;
-        w |= (uint32_t)(*src++) << 24;
+        uint32_t w = ((uint32_t)src[0])
+                   | ((uint32_t)src[1] <<  8)
+                   | ((uint32_t)src[2] << 16)
+                   | ((uint32_t)src[3] << 24);
         return w;
     }
 }
@@ -50,14 +50,14 @@ static ossl_inline uint64_t load64(const uint8_t *src)
         memcpy(&w, src, sizeof(w));
         return w;
     } else {
-        uint64_t w = *src++;
-        w |= (uint64_t)(*src++) <<  8;
-        w |= (uint64_t)(*src++) << 16;
-        w |= (uint64_t)(*src++) << 24;
-        w |= (uint64_t)(*src++) << 32;
-        w |= (uint64_t)(*src++) << 40;
-        w |= (uint64_t)(*src++) << 48;
-        w |= (uint64_t)(*src++) << 56;
+        uint64_t w = ((uint64_t)src[0])
+                   | ((uint64_t)src[1] <<  8)
+                   | ((uint64_t)src[2] << 16)
+                   | ((uint64_t)src[3] << 24)
+                   | ((uint64_t)src[4] << 32)
+                   | ((uint64_t)src[5] << 40)
+                   | ((uint64_t)src[6] << 48)
+                   | ((uint64_t)src[7] << 56);
         return w;
     }
 }
@@ -100,29 +100,24 @@ static ossl_inline void store64(uint8_t *dst, uint64_t w)
 
 static ossl_inline uint64_t load48(const uint8_t *src)
 {
-    uint64_t w = *src++;
-    w |= (uint64_t)(*src++) <<  8;
-    w |= (uint64_t)(*src++) << 16;
-    w |= (uint64_t)(*src++) << 24;
-    w |= (uint64_t)(*src++) << 32;
-    w |= (uint64_t)(*src++) << 40;
+    uint64_t w = ((uint64_t)src[0])
+               | ((uint64_t)src[1] <<  8)
+               | ((uint64_t)src[2] << 16)
+               | ((uint64_t)src[3] << 24)
+               | ((uint64_t)src[4] << 32)
+               | ((uint64_t)src[5] << 40);
     return w;
 }
 
 static ossl_inline void store48(uint8_t *dst, uint64_t w)
 {
     uint8_t *p = (uint8_t *)dst;
-    *p++ = (uint8_t)w;
-    w >>= 8;
-    *p++ = (uint8_t)w;
-    w >>= 8;
-    *p++ = (uint8_t)w;
-    w >>= 8;
-    *p++ = (uint8_t)w;
-    w >>= 8;
-    *p++ = (uint8_t)w;
-    w >>= 8;
-    *p++ = (uint8_t)w;
+    p[0] = (uint8_t)w;
+    p[1] = (uint8_t)(w>>8);
+    p[2] = (uint8_t)(w>>16);
+    p[3] = (uint8_t)(w>>24);
+    p[4] = (uint8_t)(w>>32);
+    p[5] = (uint8_t)(w>>40);
 }
 
 static ossl_inline uint32_t rotr32(const uint32_t w, const unsigned int c)
diff --git a/crypto/blake2/blake2b.c b/crypto/blake2/blake2b.c
index 6f04412..56b56fb 100644
--- a/crypto/blake2/blake2b.c
+++ b/crypto/blake2/blake2b.c
@@ -15,6 +15,12 @@
  * can be found at https://blake2.net.
  */
 
+#ifndef BLAKE_DEBUG
+# undef NDEBUG                  /* avoid conflicting definitions */
+# define NDEBUG
+#endif
+
+#include <assert.h>
 #include <string.h>
 #include <openssl/crypto.h>
 #include "e_os.h"
@@ -52,21 +58,13 @@ static ossl_inline void blake2b_set_lastblock(BLAKE2B_CTX *S)
     S->f[0] = -1;
 }
 
-/* Increment the data hashed counter. */
-static ossl_inline void blake2b_increment_counter(BLAKE2B_CTX *S,
-                                                  const uint64_t inc)
-{
-    S->t[0] += inc;
-    S->t[1] += (S->t[0] < inc);
-}
-
 /* Initialize the hashing state. */
 static ossl_inline void blake2b_init0(BLAKE2B_CTX *S)
 {
     int i;
 
     memset(S, 0, sizeof(BLAKE2B_CTX));
-    for(i = 0; i < 8; ++i) {
+    for (i = 0; i < 8; ++i) {
         S->h[i] = blake2b_IV[i];
     }
 }
@@ -82,7 +80,7 @@ static void blake2b_init_param(BLAKE2B_CTX *S, const BLAKE2B_PARAM *P)
      * every platform. */
     OPENSSL_assert(sizeof(BLAKE2B_PARAM) == 64);
     /* IV XOR ParamBlock */
-    for(i = 0; i < 8; ++i) {
+    for (i = 0; i < 8; ++i) {
         S->h[i] ^= load64(p + sizeof(S->h[i]) * i);
     }
 }
@@ -108,69 +106,106 @@ int BLAKE2b_Init(BLAKE2B_CTX *c)
 
 /* Permute the state while xoring in the block of data. */
 static void blake2b_compress(BLAKE2B_CTX *S,
-                            const uint8_t block[BLAKE2B_BLOCKBYTES])
+                            const uint8_t *blocks,
+                            size_t len)
 {
     uint64_t m[16];
     uint64_t v[16];
     int i;
+    size_t increment;
 
-    for(i = 0; i < 16; ++i) {
-        m[i] = load64(block + i * sizeof(m[i]));
-    }
+    /*
+     * There are two distinct usage vectors for this function:
+     *
+     * a) BLAKE2b_Update uses it to process complete blocks,
+     *    possibly more than one at a time;
+     *
+     * b) BLAK2b_Final uses it to process last block, always
+     *    single but possibly incomplete, in which case caller
+     *    pads input with zeros.
+     */
+    assert(len < BLAKE2B_BLOCKBYTES || len % BLAKE2B_BLOCKBYTES == 0);
+
+    /*
+     * Since last block is always processed with separate call,
+     * |len| not being multiple of complete blocks can be observed
+     * only with |len| being less than BLAKE2B_BLOCKBYTES ("less"
+     * including even zero), which is why following assignment doesn't
+     * have to reside inside the main loop below.
+     */
+    increment = len < BLAKE2B_BLOCKBYTES ? len : BLAKE2B_BLOCKBYTES;
 
-    for(i = 0; i < 8; ++i) {
+    for (i = 0; i < 8; ++i) {
         v[i] = S->h[i];
     }
 
-    v[8]  = blake2b_IV[0];
-    v[9]  = blake2b_IV[1];
-    v[10] = blake2b_IV[2];
-    v[11] = blake2b_IV[3];
-    v[12] = S->t[0] ^ blake2b_IV[4];
-    v[13] = S->t[1] ^ blake2b_IV[5];
-    v[14] = S->f[0] ^ blake2b_IV[6];
-    v[15] = S->f[1] ^ blake2b_IV[7];
+    do {
+        for (i = 0; i < 16; ++i) {
+            m[i] = load64(blocks + i * sizeof(m[i]));
+        }
+
+        /* blake2b_increment_counter */
+        S->t[0] += increment;
+        S->t[1] += (S->t[0] < increment);
+
+        v[8]  = blake2b_IV[0];
+        v[9]  = blake2b_IV[1];
+        v[10] = blake2b_IV[2];
+        v[11] = blake2b_IV[3];
+        v[12] = S->t[0] ^ blake2b_IV[4];
+        v[13] = S->t[1] ^ blake2b_IV[5];
+        v[14] = S->f[0] ^ blake2b_IV[6];
+        v[15] = S->f[1] ^ blake2b_IV[7];
 #define G(r,i,a,b,c,d) \
-    do { \
-        a = a + b + m[blake2b_sigma[r][2*i+0]]; \
-        d = rotr64(d ^ a, 32); \
-        c = c + d; \
-        b = rotr64(b ^ c, 24); \
-        a = a + b + m[blake2b_sigma[r][2*i+1]]; \
-        d = rotr64(d ^ a, 16); \
-        c = c + d; \
-        b = rotr64(b ^ c, 63); \
-    } while(0)
+        do { \
+            a = a + b + m[blake2b_sigma[r][2*i+0]]; \
+            d = rotr64(d ^ a, 32); \
+            c = c + d; \
+            b = rotr64(b ^ c, 24); \
+            a = a + b + m[blake2b_sigma[r][2*i+1]]; \
+            d = rotr64(d ^ a, 16); \
+            c = c + d; \
+            b = rotr64(b ^ c, 63); \
+        } while (0)
 #define ROUND(r)  \
-    do { \
-        G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-        G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-        G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-        G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-        G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-        G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-        G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-        G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
-    } while(0)
-    ROUND(0);
-    ROUND(1);
-    ROUND(2);
-    ROUND(3);
-    ROUND(4);
-    ROUND(5);
-    ROUND(6);
-    ROUND(7);
-    ROUND(8);
-    ROUND(9);
-    ROUND(10);
-    ROUND(11);
-
-    for(i = 0; i < 8; ++i) {
-        S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
-    }
+        do { \
+            G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+            G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+            G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+            G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+            G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+            G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+            G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+            G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+        } while (0)
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+        /* 3x size reduction on x86_64, almost 7x on ARMv8, 9x on ARMv4 */
+        for (i = 0; i < 12; i++) {
+            ROUND(i);
+        }
+#else
+        ROUND(0);
+        ROUND(1);
+        ROUND(2);
+        ROUND(3);
+        ROUND(4);
+        ROUND(5);
+        ROUND(6);
+        ROUND(7);
+        ROUND(8);
+        ROUND(9);
+        ROUND(10);
+        ROUND(11);
+#endif
 
+        for (i = 0; i < 8; ++i) {
+            S->h[i] = v[i] ^= v[i + 8] ^ S->h[i];
+        }
 #undef G
 #undef ROUND
+        blocks += increment;
+        len -= increment;
+    } while (len);
 }
 
 /* Absorb the input data into the hash state.  Always returns 1. */
@@ -179,23 +214,42 @@ int BLAKE2b_Update(BLAKE2B_CTX *c, const void *data, size_t datalen)
     const uint8_t *in = data;
     size_t fill;
 
-    while(datalen > 0) {
-        fill = sizeof(c->buf) - c->buflen;
-        /* Must be >, not >=, so that last block can be hashed differently */
-        if(datalen > fill) {
+    /*
+     * Intuitively one would expect intermediate buffer, c->buf, to
+     * store incomplete blocks. But in this case we are interested to
+     * temporarily stash even complete blocks, because last one in the
+     * stream has to be treated in special way, and at this point we
+     * don't know if last block in *this* call is last one "ever". This
+     * is the reason for why |datalen| is compared as >, and not >=.
+     */
+    fill = sizeof(c->buf) - c->buflen;
+    if (datalen > fill) {
+        if (c->buflen) {
             memcpy(c->buf + c->buflen, in, fill); /* Fill buffer */
-            blake2b_increment_counter(c, BLAKE2B_BLOCKBYTES);
-            blake2b_compress(c, c->buf); /* Compress */
+            blake2b_compress(c, c->buf, BLAKE2B_BLOCKBYTES);
             c->buflen = 0;
             in += fill;
             datalen -= fill;
-        } else { /* datalen <= fill */
-            memcpy(c->buf + c->buflen, in, datalen);
-            c->buflen += datalen; /* Be lazy, do not compress */
-            return 1;
+        }
+        if (datalen > BLAKE2B_BLOCKBYTES) {
+            size_t stashlen = datalen % BLAKE2B_BLOCKBYTES;
+            /*
+	     * If |datalen| is a multiple of the blocksize, stash
+	     * last complete block, it can be final one...
+             */
+            stashlen = stashlen ? stashlen : BLAKE2B_BLOCKBYTES;
+            datalen -= stashlen;
+            blake2b_compress(c, in, datalen);
+            in += datalen;
+            datalen = stashlen;
         }
     }
 
+    assert(datalen <= BLAKE2B_BLOCKBYTES);
+
+    memcpy(c->buf + c->buflen, in, datalen);
+    c->buflen += datalen; /* Be lazy, do not compress */
+
     return 1;
 }
 
@@ -207,14 +261,13 @@ int BLAKE2b_Final(unsigned char *md, BLAKE2B_CTX *c)
 {
     int i;
 
-    blake2b_increment_counter(c, c->buflen);
     blake2b_set_lastblock(c);
     /* Padding */
     memset(c->buf + c->buflen, 0, sizeof(c->buf) - c->buflen);
-    blake2b_compress(c, c->buf);
+    blake2b_compress(c, c->buf, c->buflen);
 
     /* Output full hash to message digest */
-    for(i = 0; i < 8; ++i) {
+    for (i = 0; i < 8; ++i) {
         store64(md + sizeof(c->h[i]) * i, c->h[i]);
     }
 
diff --git a/crypto/blake2/blake2s.c b/crypto/blake2/blake2s.c
index f940aa1..905a28e 100644
--- a/crypto/blake2/blake2s.c
+++ b/crypto/blake2/blake2s.c
@@ -15,6 +15,12 @@
  * can be found at https://blake2.net.
  */
 
+#ifndef BLAKE_DEBUG
+# undef NDEBUG                  /* avoid conflicting definitions */
+# define NDEBUG
+#endif
+
+#include <assert.h>
 #include <string.h>
 #include <openssl/crypto.h>
 #include "e_os.h"
@@ -48,21 +54,13 @@ static ossl_inline void blake2s_set_lastblock(BLAKE2S_CTX *S)
     S->f[0] = -1;
 }
 
-/* Increment the data hashed counter. */
-static ossl_inline void blake2s_increment_counter(BLAKE2S_CTX *S,
-                                                  const uint32_t inc)
-{
-    S->t[0] += inc;
-    S->t[1] += (S->t[0] < inc);
-}
-
 /* Initialize the hashing state. */
 static ossl_inline void blake2s_init0(BLAKE2S_CTX *S)
 {
     int i;
 
     memset(S, 0, sizeof(BLAKE2S_CTX));
-    for(i = 0; i < 8; ++i) {
+    for (i = 0; i < 8; ++i) {
         S->h[i] = blake2s_IV[i];
     }
 }
@@ -78,7 +76,7 @@ static void blake2s_init_param(BLAKE2S_CTX *S, const BLAKE2S_PARAM *P)
     OPENSSL_assert(sizeof(BLAKE2S_PARAM) == 32);
     blake2s_init0(S);
     /* IV XOR ParamBlock */
-    for(i = 0; i < 8; ++i) {
+    for (i = 0; i < 8; ++i) {
         S->h[i] ^= load32(&p[i*4]);
     }
 }
@@ -104,67 +102,104 @@ int BLAKE2s_Init(BLAKE2S_CTX *c)
 
 /* Permute the state while xoring in the block of data. */
 static void blake2s_compress(BLAKE2S_CTX *S,
-                            const uint8_t block[BLAKE2S_BLOCKBYTES])
+                            const uint8_t *blocks,
+                            size_t len)
 {
     uint32_t m[16];
     uint32_t v[16];
     size_t i;
+    size_t increment;
 
-    for(i = 0; i < 16; ++i) {
-        m[i] = load32(block + i * sizeof(m[i]));
-    }
+    /*
+     * There are two distinct usage vectors for this function:
+     *
+     * a) BLAKE2s_Update uses it to process complete blocks,
+     *    possibly more than one at a time;
+     *
+     * b) BLAK2s_Final uses it to process last block, always
+     *    single but possibly incomplete, in which case caller
+     *    pads input with zeros.
+     */
+    assert(len < BLAKE2S_BLOCKBYTES || len % BLAKE2S_BLOCKBYTES == 0);
+
+    /*
+     * Since last block is always processed with separate call,
+     * |len| not being multiple of complete blocks can be observed
+     * only with |len| being less than BLAKE2S_BLOCKBYTES ("less"
+     * including even zero), which is why following assignment doesn't
+     * have to reside inside the main loop below.
+     */
+    increment = len < BLAKE2S_BLOCKBYTES ? len : BLAKE2S_BLOCKBYTES;
 
-    for(i = 0; i < 8; ++i) {
+    for (i = 0; i < 8; ++i) {
         v[i] = S->h[i];
     }
 
-    v[ 8] = blake2s_IV[0];
-    v[ 9] = blake2s_IV[1];
-    v[10] = blake2s_IV[2];
-    v[11] = blake2s_IV[3];
-    v[12] = S->t[0] ^ blake2s_IV[4];
-    v[13] = S->t[1] ^ blake2s_IV[5];
-    v[14] = S->f[0] ^ blake2s_IV[6];
-    v[15] = S->f[1] ^ blake2s_IV[7];
+    do {
+        for (i = 0; i < 16; ++i) {
+            m[i] = load32(blocks + i * sizeof(m[i]));
+        }
+
+        /* blake2s_increment_counter */
+        S->t[0] += increment;
+        S->t[1] += (S->t[0] < increment);
+
+        v[ 8] = blake2s_IV[0];
+        v[ 9] = blake2s_IV[1];
+        v[10] = blake2s_IV[2];
+        v[11] = blake2s_IV[3];
+        v[12] = S->t[0] ^ blake2s_IV[4];
+        v[13] = S->t[1] ^ blake2s_IV[5];
+        v[14] = S->f[0] ^ blake2s_IV[6];
+        v[15] = S->f[1] ^ blake2s_IV[7];
 #define G(r,i,a,b,c,d) \
-    do { \
-        a = a + b + m[blake2s_sigma[r][2*i+0]]; \
-        d = rotr32(d ^ a, 16); \
-        c = c + d; \
-        b = rotr32(b ^ c, 12); \
-        a = a + b + m[blake2s_sigma[r][2*i+1]]; \
-        d = rotr32(d ^ a, 8); \
-        c = c + d; \
-        b = rotr32(b ^ c, 7); \
-    } while(0)
+        do { \
+            a = a + b + m[blake2s_sigma[r][2*i+0]]; \
+            d = rotr32(d ^ a, 16); \
+            c = c + d; \
+            b = rotr32(b ^ c, 12); \
+            a = a + b + m[blake2s_sigma[r][2*i+1]]; \
+            d = rotr32(d ^ a, 8); \
+            c = c + d; \
+            b = rotr32(b ^ c, 7); \
+        } while (0)
 #define ROUND(r)  \
-    do { \
-        G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-        G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-        G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-        G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-        G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-        G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-        G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-        G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
-    } while(0)
-    ROUND(0);
-    ROUND(1);
-    ROUND(2);
-    ROUND(3);
-    ROUND(4);
-    ROUND(5);
-    ROUND(6);
-    ROUND(7);
-    ROUND(8);
-    ROUND(9);
-
-    for(i = 0; i < 8; ++i) {
-        S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
-    }
+        do { \
+            G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+            G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+            G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+            G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+            G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+            G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+            G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+            G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+        } while (0)
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+        /* almost 3x reduction on x86_64, 4.5x on ARMv8, 4x on ARMv4 */
+        for (i = 0; i < 10; i++) {
+            ROUND(i);
+        }
+#else
+        ROUND(0);
+        ROUND(1);
+        ROUND(2);
+        ROUND(3);
+        ROUND(4);
+        ROUND(5);
+        ROUND(6);
+        ROUND(7);
+        ROUND(8);
+        ROUND(9);
+#endif
 
+        for (i = 0; i < 8; ++i) {
+            S->h[i] = v[i] ^= v[i + 8] ^ S->h[i];
+        }
 #undef G
 #undef ROUND
+        blocks += increment;
+        len -= increment;
+    } while (len);
 }
 
 /* Absorb the input data into the hash state.  Always returns 1. */
@@ -173,23 +208,42 @@ int BLAKE2s_Update(BLAKE2S_CTX *c, const void *data, size_t datalen)
     const uint8_t *in = data;
     size_t fill;
 
-    while(datalen > 0) {
-        fill = sizeof(c->buf) - c->buflen;
-        /* Must be >, not >=, so that last block can be hashed differently */
-        if(datalen > fill) {
+    /*
+     * Intuitively one would expect intermediate buffer, c->buf, to
+     * store incomplete blocks. But in this case we are interested to
+     * temporarily stash even complete blocks, because last one in the
+     * stream has to be treated in special way, and at this point we
+     * don't know if last block in *this* call is last one "ever". This
+     * is the reason for why |datalen| is compared as >, and not >=.
+     */
+    fill = sizeof(c->buf) - c->buflen;
+    if (datalen > fill) {
+        if (c->buflen) {
             memcpy(c->buf + c->buflen, in, fill); /* Fill buffer */
-            blake2s_increment_counter(c, BLAKE2S_BLOCKBYTES);
-            blake2s_compress(c, c->buf); /* Compress */
+            blake2s_compress(c, c->buf, BLAKE2S_BLOCKBYTES);
             c->buflen = 0;
             in += fill;
             datalen -= fill;
-        } else { /* datalen <= fill */
-            memcpy(c->buf + c->buflen, in, datalen);
-            c->buflen += datalen; /* Be lazy, do not compress */
-            return 1;
+        }
+        if (datalen > BLAKE2S_BLOCKBYTES)  {
+            size_t stashlen = datalen % BLAKE2S_BLOCKBYTES;
+            /*
+	     * If |datalen| is a multiple of the blocksize, stash
+	     * last complete block, it can be final one...
+             */
+            stashlen = stashlen ? stashlen : BLAKE2S_BLOCKBYTES;
+            datalen -= stashlen;
+            blake2s_compress(c, in, datalen);
+            in += datalen;
+            datalen = stashlen;
         }
     }
 
+    assert(datalen <= BLAKE2S_BLOCKBYTES);
+
+    memcpy(c->buf + c->buflen, in, datalen);
+    c->buflen += datalen; /* Be lazy, do not compress */
+
     return 1;
 }
 
@@ -201,14 +255,13 @@ int BLAKE2s_Final(unsigned char *md, BLAKE2S_CTX *c)
 {
     int i;
 
-    blake2s_increment_counter(c, (uint32_t)c->buflen);
     blake2s_set_lastblock(c);
     /* Padding */
     memset(c->buf + c->buflen, 0, sizeof(c->buf) - c->buflen);
-    blake2s_compress(c, c->buf);
+    blake2s_compress(c, c->buf, c->buflen);
 
     /* Output full hash to temp buffer */
-    for(i = 0; i < 8; ++i) {
+    for (i = 0; i < 8; ++i) {
         store32(md + sizeof(c->h[i]) * i, c->h[i]);
     }
 
diff --git a/test/evptests.txt b/test/evptests.txt
index d0acbef..fb94416 100644
--- a/test/evptests.txt
+++ b/test/evptests.txt
@@ -34,6 +34,14 @@ Digest = BLAKE2s256
 Input = 3132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353637383930
 Output = fdaedb290a0d5af9870864fec2e090200989dc9cd53a3c092129e8535e8b4f66
 
+Digest = BLAKE2s256
+Input = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F
+Output = 1FA877DE67259D19863A2A34BCC6962A2B25FCBF5CBECD7EDE8F1FA36688A796
+
+Digest = BLAKE2s256
+Input = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F8081
+Output = C80ABEEBB669AD5DEEB5F5EC8EA6B7A05DDF7D31EC4C0A2EE20B0B98CAEC6746
+
 Digest = BLAKE2b512
 Input = 
 Output = 786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce
@@ -62,6 +70,14 @@ Digest = BLAKE2b512
 Input = 3132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353637383930
 Output = 686f41ec5afff6e87e1f076f542aa466466ff5fbde162c48481ba48a748d842799f5b30f5b67fc684771b33b994206d05cc310f31914edd7b97e41860d77d282
 
+Digest = BLAKE2b512
+Input = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F
+Output = 2319E3789C47E2DAA5FE807F61BEC2A1A6537FA03F19FF32E87EECBFD64B7E0E8CCFF439AC333B040F19B0C4DDD11A61E24AC1FE0F10A039806C5DCC0DA3D115
+
+Digest = BLAKE2b512
+Input = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F8081
+Output = DF0A9D0C212843A6A934E3902B2DD30D17FBA5F969D2030B12A546D8A6A45E80CF5635F071F0452E9C919275DA99BED51EB1173C1AF0518726B75B0EC3BAE2B5
+
 # SHA(1) tests (from shatest.c)
 Digest = SHA1
 Input = 616263